LCOV - code coverage report
Current view: top level - src/backend/storage/buffer - bufmgr.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 91.2 % 2172 1981
Test Date: 2026-05-17 22:16:24 Functions: 93.7 % 142 133
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * bufmgr.c
       4              :  *    buffer manager interface routines
       5              :  *
       6              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7              :  * Portions Copyright (c) 1994, Regents of the University of California
       8              :  *
       9              :  *
      10              :  * IDENTIFICATION
      11              :  *    src/backend/storage/buffer/bufmgr.c
      12              :  *
      13              :  *-------------------------------------------------------------------------
      14              :  */
      15              : /*
      16              :  * Principal entry points:
      17              :  *
      18              :  * ReadBuffer() -- find or create a buffer holding the requested page,
      19              :  *      and pin it so that no one can destroy it while this process
      20              :  *      is using it.
      21              :  *
      22              :  * StartReadBuffer() -- as above, with separate wait step
      23              :  * StartReadBuffers() -- multiple block version
      24              :  * WaitReadBuffers() -- second step of above
      25              :  *
      26              :  * ReleaseBuffer() -- unpin a buffer
      27              :  *
      28              :  * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
      29              :  *      The disk write is delayed until buffer replacement or checkpoint.
      30              :  *
      31              :  * See also these files:
      32              :  *      freelist.c -- chooses victim for buffer replacement
      33              :  *      buf_table.c -- manages the buffer lookup table
      34              :  */
      35              : #include "postgres.h"
      36              : 
      37              : #include <sys/file.h>
      38              : #include <unistd.h>
      39              : 
      40              : #include "access/tableam.h"
      41              : #include "access/xloginsert.h"
      42              : #include "access/xlogutils.h"
      43              : #ifdef USE_ASSERT_CHECKING
      44              : #include "catalog/pg_tablespace_d.h"
      45              : #endif
      46              : #include "catalog/storage.h"
      47              : #include "catalog/storage_xlog.h"
      48              : #include "common/hashfn.h"
      49              : #include "executor/instrument.h"
      50              : #include "lib/binaryheap.h"
      51              : #include "miscadmin.h"
      52              : #include "pg_trace.h"
      53              : #include "pgstat.h"
      54              : #include "postmaster/bgwriter.h"
      55              : #include "storage/aio.h"
      56              : #include "storage/buf_internals.h"
      57              : #include "storage/bufmgr.h"
      58              : #include "storage/fd.h"
      59              : #include "storage/ipc.h"
      60              : #include "storage/lmgr.h"
      61              : #include "storage/proc.h"
      62              : #include "storage/proclist.h"
      63              : #include "storage/procsignal.h"
      64              : #include "storage/read_stream.h"
      65              : #include "storage/smgr.h"
      66              : #include "storage/standby.h"
      67              : #include "utils/memdebug.h"
      68              : #include "utils/ps_status.h"
      69              : #include "utils/rel.h"
      70              : #include "utils/resowner.h"
      71              : #include "utils/timestamp.h"
      72              : #include "utils/wait_event.h"
      73              : 
      74              : 
      75              : /* Note: these two macros only work on shared buffers, not local ones! */
      76              : #define BufHdrGetBlock(bufHdr)  ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
      77              : #define BufferGetLSN(bufHdr)    (PageGetLSN(BufHdrGetBlock(bufHdr)))
      78              : 
      79              : /* Note: this macro only works on local buffers, not shared ones! */
      80              : #define LocalBufHdrGetBlock(bufHdr) \
      81              :     LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
      82              : 
      83              : /* Bits in SyncOneBuffer's return value */
      84              : #define BUF_WRITTEN             0x01
      85              : #define BUF_REUSABLE            0x02
      86              : 
      87              : #define RELS_BSEARCH_THRESHOLD      20
      88              : 
      89              : /*
      90              :  * This is the size (in the number of blocks) above which we scan the
      91              :  * entire buffer pool to remove the buffers for all the pages of the relation
      92              :  * being dropped. For relations with size below this threshold, we find
      93              :  * the buffers by doing lookups in the BufMapping table.
      94              :  */
      95              : #define BUF_DROP_FULL_SCAN_THRESHOLD        (uint64) (NBuffers / 32)
      96              : 
      97              : /*
      98              :  * This is separated out from PrivateRefCountEntry to allow for copying all
      99              :  * the data members via struct assignment.
     100              :  */
     101              : typedef struct PrivateRefCountData
     102              : {
     103              :     /*
     104              :      * How many times has the buffer been pinned by this backend.
     105              :      */
     106              :     int32       refcount;
     107              : 
     108              :     /*
     109              :      * Is the buffer locked by this backend? BUFFER_LOCK_UNLOCK indicates that
     110              :      * the buffer is not locked.
     111              :      */
     112              :     BufferLockMode lockmode;
     113              : } PrivateRefCountData;
     114              : 
     115              : typedef struct PrivateRefCountEntry
     116              : {
     117              :     /*
     118              :      * Note that this needs to be same as the entry's corresponding
     119              :      * PrivateRefCountArrayKeys[i], if the entry is stored in the array. We
     120              :      * store it in both places as this is used for the hashtable key and
     121              :      * because it is more convenient (passing around a PrivateRefCountEntry
     122              :      * suffices to identify the buffer) and faster (checking the keys array is
     123              :      * faster when checking many entries, checking the entry is faster if just
     124              :      * checking a single entry).
     125              :      */
     126              :     Buffer      buffer;
     127              : 
     128              :     char        status;
     129              : 
     130              :     PrivateRefCountData data;
     131              : } PrivateRefCountEntry;
     132              : 
     133              : #define SH_PREFIX refcount
     134              : #define SH_ELEMENT_TYPE PrivateRefCountEntry
     135              : #define SH_KEY_TYPE Buffer
     136              : #define SH_KEY buffer
     137              : #define SH_HASH_KEY(tb, key) murmurhash32((uint32) (key))
     138              : #define SH_EQUAL(tb, a, b) ((a) == (b))
     139              : #define SH_SCOPE static inline
     140              : #define SH_DECLARE
     141              : #define SH_DEFINE
     142              : #include "lib/simplehash.h"
     143              : 
     144              : /* 64 bytes, about the size of a cache line on common systems */
     145              : #define REFCOUNT_ARRAY_ENTRIES 8
     146              : 
     147              : /*
     148              :  * Status of buffers to checkpoint for a particular tablespace, used
     149              :  * internally in BufferSync.
     150              :  */
     151              : typedef struct CkptTsStatus
     152              : {
     153              :     /* oid of the tablespace */
     154              :     Oid         tsId;
     155              : 
     156              :     /*
     157              :      * Checkpoint progress for this tablespace. To make progress comparable
     158              :      * between tablespaces the progress is, for each tablespace, measured as a
     159              :      * number between 0 and the total number of to-be-checkpointed pages. Each
     160              :      * page checkpointed in this tablespace increments this space's progress
     161              :      * by progress_slice.
     162              :      */
     163              :     float8      progress;
     164              :     float8      progress_slice;
     165              : 
     166              :     /* number of to-be checkpointed pages in this tablespace */
     167              :     int         num_to_scan;
     168              :     /* already processed pages in this tablespace */
     169              :     int         num_scanned;
     170              : 
     171              :     /* current offset in CkptBufferIds for this tablespace */
     172              :     int         index;
     173              : } CkptTsStatus;
     174              : 
     175              : /*
     176              :  * Type for array used to sort SMgrRelations
     177              :  *
     178              :  * FlushRelationsAllBuffers shares the same comparator function with
     179              :  * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
     180              :  * compatible.
     181              :  */
     182              : typedef struct SMgrSortArray
     183              : {
     184              :     RelFileLocator rlocator;    /* This must be the first member */
     185              :     SMgrRelation srel;
     186              : } SMgrSortArray;
     187              : 
     188              : /* GUC variables */
     189              : bool        zero_damaged_pages = false;
     190              : int         bgwriter_lru_maxpages = 100;
     191              : double      bgwriter_lru_multiplier = 2.0;
     192              : bool        track_io_timing = false;
     193              : 
     194              : /*
     195              :  * How many buffers PrefetchBuffer callers should try to stay ahead of their
     196              :  * ReadBuffer calls by.  Zero means "never prefetch".  This value is only used
     197              :  * for buffers not belonging to tablespaces that have their
     198              :  * effective_io_concurrency parameter set.
     199              :  */
     200              : int         effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
     201              : 
     202              : /*
     203              :  * Like effective_io_concurrency, but used by maintenance code paths that might
     204              :  * benefit from a higher setting because they work on behalf of many sessions.
     205              :  * Overridden by the tablespace setting of the same name.
     206              :  */
     207              : int         maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
     208              : 
     209              : /*
     210              :  * Limit on how many blocks should be handled in single I/O operations.
     211              :  * StartReadBuffers() callers should respect it, as should other operations
     212              :  * that call smgr APIs directly.  It is computed as the minimum of underlying
     213              :  * GUCs io_combine_limit_guc and io_max_combine_limit.
     214              :  */
     215              : int         io_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
     216              : int         io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT;
     217              : int         io_max_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
     218              : 
     219              : /*
     220              :  * GUC variables about triggering kernel writeback for buffers written; OS
     221              :  * dependent defaults are set via the GUC mechanism.
     222              :  */
     223              : int         checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER;
     224              : int         bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER;
     225              : int         backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER;
     226              : 
     227              : /* local state for LockBufferForCleanup */
     228              : static BufferDesc *PinCountWaitBuf = NULL;
     229              : 
     230              : /*
     231              :  * Backend-Private refcount management:
     232              :  *
     233              :  * Each buffer also has a private refcount that keeps track of the number of
     234              :  * times the buffer is pinned in the current process.  This is so that the
     235              :  * shared refcount needs to be modified only once if a buffer is pinned more
     236              :  * than once by an individual backend.  It's also used to check that no
     237              :  * buffers are still pinned at the end of transactions and when exiting. We
     238              :  * also use this mechanism to track whether this backend has a buffer locked,
     239              :  * and, if so, in what mode.
     240              :  *
     241              :  *
     242              :  * To avoid - as we used to - requiring an array with NBuffers entries to keep
     243              :  * track of local buffers, we use a small sequentially searched array
     244              :  * (PrivateRefCountArrayKeys, with the corresponding data stored in
     245              :  * PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
     246              :  * keep track of backend local pins.
     247              :  *
     248              :  * As long as no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once,
     249              :  * all refcounts are tracked in the array; after that, new array entries
     250              :  * displace old ones into the hash table. That way a frequently used entry
     251              :  * can't get "stuck" in the hashtable while infrequent ones clog the array.
     252              :  *
     253              :  * Note that in most scenarios the number of pinned buffers will not exceed
     254              :  * REFCOUNT_ARRAY_ENTRIES.
     255              :  *
     256              :  *
     257              :  * To enter a buffer into the refcount tracking mechanism first reserve a free
     258              :  * entry using ReservePrivateRefCountEntry() and then later, if necessary,
     259              :  * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
     260              :  * memory allocations in NewPrivateRefCountEntry() which can be important
     261              :  * because in some scenarios it's called with a spinlock held...
     262              :  */
     263              : static Buffer PrivateRefCountArrayKeys[REFCOUNT_ARRAY_ENTRIES];
     264              : static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
     265              : static refcount_hash *PrivateRefCountHash = NULL;
     266              : static int32 PrivateRefCountOverflowed = 0;
     267              : static uint32 PrivateRefCountClock = 0;
     268              : static int  ReservedRefCountSlot = -1;
     269              : static int  PrivateRefCountEntryLast = -1;
     270              : 
     271              : static uint32 MaxProportionalPins;
     272              : 
     273              : static void ReservePrivateRefCountEntry(void);
     274              : static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
     275              : static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
     276              : static inline int32 GetPrivateRefCount(Buffer buffer);
     277              : static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
     278              : 
     279              : /* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
     280              : static void ResOwnerReleaseBufferIO(Datum res);
     281              : static char *ResOwnerPrintBufferIO(Datum res);
     282              : static void ResOwnerReleaseBuffer(Datum res);
     283              : static char *ResOwnerPrintBuffer(Datum res);
     284              : 
     285              : const ResourceOwnerDesc buffer_io_resowner_desc =
     286              : {
     287              :     .name = "buffer io",
     288              :     .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
     289              :     .release_priority = RELEASE_PRIO_BUFFER_IOS,
     290              :     .ReleaseResource = ResOwnerReleaseBufferIO,
     291              :     .DebugPrint = ResOwnerPrintBufferIO
     292              : };
     293              : 
     294              : const ResourceOwnerDesc buffer_resowner_desc =
     295              : {
     296              :     .name = "buffer",
     297              :     .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
     298              :     .release_priority = RELEASE_PRIO_BUFFER_PINS,
     299              :     .ReleaseResource = ResOwnerReleaseBuffer,
     300              :     .DebugPrint = ResOwnerPrintBuffer
     301              : };
     302              : 
     303              : /*
     304              :  * Ensure that the PrivateRefCountArray has sufficient space to store one more
     305              :  * entry. This has to be called before using NewPrivateRefCountEntry() to fill
     306              :  * a new entry - but it's perfectly fine to not use a reserved entry.
     307              :  */
     308              : static void
     309     89013151 : ReservePrivateRefCountEntry(void)
     310              : {
     311              :     /* Already reserved (or freed), nothing to do */
     312     89013151 :     if (ReservedRefCountSlot != -1)
     313     83848864 :         return;
     314              : 
     315              :     /*
     316              :      * First search for a free entry in the array, that'll be sufficient in
     317              :      * the majority of cases.
     318              :      */
     319              :     {
     320              :         int         i;
     321              : 
     322     46478583 :         for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
     323              :         {
     324     41314296 :             if (PrivateRefCountArrayKeys[i] == InvalidBuffer)
     325              :             {
     326     30176770 :                 ReservedRefCountSlot = i;
     327              : 
     328              :                 /*
     329              :                  * We could return immediately, but iterating till the end of
     330              :                  * the array allows compiler-autovectorization.
     331              :                  */
     332              :             }
     333              :         }
     334              : 
     335      5164287 :         if (ReservedRefCountSlot != -1)
     336      4973608 :             return;
     337              :     }
     338              : 
     339              :     /*
     340              :      * No luck. All array entries are full. Move one array entry into the hash
     341              :      * table.
     342              :      */
     343              :     {
     344              :         /*
     345              :          * Move entry from the current clock position in the array into the
     346              :          * hashtable. Use that slot.
     347              :          */
     348              :         int         victim_slot;
     349              :         PrivateRefCountEntry *victim_entry;
     350              :         PrivateRefCountEntry *hashent;
     351              :         bool        found;
     352              : 
     353              :         /* select victim slot */
     354       190679 :         victim_slot = PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES;
     355       190679 :         victim_entry = &PrivateRefCountArray[victim_slot];
     356       190679 :         ReservedRefCountSlot = victim_slot;
     357              : 
     358              :         /* Better be used, otherwise we shouldn't get here. */
     359              :         Assert(PrivateRefCountArrayKeys[victim_slot] != InvalidBuffer);
     360              :         Assert(PrivateRefCountArray[victim_slot].buffer != InvalidBuffer);
     361              :         Assert(PrivateRefCountArrayKeys[victim_slot] == PrivateRefCountArray[victim_slot].buffer);
     362              : 
     363              :         /* enter victim array entry into hashtable */
     364       190679 :         hashent = refcount_insert(PrivateRefCountHash,
     365              :                                   PrivateRefCountArrayKeys[victim_slot],
     366              :                                   &found);
     367              :         Assert(!found);
     368              :         /* move data from the entry in the array to the hash entry */
     369       190679 :         hashent->data = victim_entry->data;
     370              : 
     371              :         /* clear the now free array slot */
     372       190679 :         PrivateRefCountArrayKeys[victim_slot] = InvalidBuffer;
     373       190679 :         victim_entry->buffer = InvalidBuffer;
     374              : 
     375              :         /* clear the whole data member, just for future proofing */
     376       190679 :         memset(&victim_entry->data, 0, sizeof(victim_entry->data));
     377       190679 :         victim_entry->data.refcount = 0;
     378       190679 :         victim_entry->data.lockmode = BUFFER_LOCK_UNLOCK;
     379              : 
     380       190679 :         PrivateRefCountOverflowed++;
     381              :     }
     382              : }
     383              : 
     384              : /*
     385              :  * Fill a previously reserved refcount entry.
     386              :  */
     387              : static PrivateRefCountEntry *
     388     77278946 : NewPrivateRefCountEntry(Buffer buffer)
     389              : {
     390              :     PrivateRefCountEntry *res;
     391              : 
     392              :     /* only allowed to be called when a reservation has been made */
     393              :     Assert(ReservedRefCountSlot != -1);
     394              : 
     395              :     /* use up the reserved entry */
     396     77278946 :     res = &PrivateRefCountArray[ReservedRefCountSlot];
     397              : 
     398              :     /* and fill it */
     399     77278946 :     PrivateRefCountArrayKeys[ReservedRefCountSlot] = buffer;
     400     77278946 :     res->buffer = buffer;
     401     77278946 :     res->data.refcount = 0;
     402     77278946 :     res->data.lockmode = BUFFER_LOCK_UNLOCK;
     403              : 
     404              :     /* update cache for the next lookup */
     405     77278946 :     PrivateRefCountEntryLast = ReservedRefCountSlot;
     406              : 
     407     77278946 :     ReservedRefCountSlot = -1;
     408              : 
     409     77278946 :     return res;
     410              : }
     411              : 
     412              : /*
     413              :  * Slow-path for GetPrivateRefCountEntry(). This is big enough to not be worth
     414              :  * inlining. This particularly seems to be true if the compiler is capable of
     415              :  * auto-vectorizing the code, as that imposes additional stack-alignment
     416              :  * requirements etc.
     417              :  */
     418              : static pg_noinline PrivateRefCountEntry *
     419    107589734 : GetPrivateRefCountEntrySlow(Buffer buffer, bool do_move)
     420              : {
     421              :     PrivateRefCountEntry *res;
     422    107589734 :     int         match = -1;
     423              :     int         i;
     424              : 
     425              :     /*
     426              :      * First search for references in the array, that'll be sufficient in the
     427              :      * majority of cases.
     428              :      */
     429    968307606 :     for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
     430              :     {
     431    860717872 :         if (PrivateRefCountArrayKeys[i] == buffer)
     432              :         {
     433     32678408 :             match = i;
     434              :             /* see ReservePrivateRefCountEntry() for why we don't return */
     435              :         }
     436              :     }
     437              : 
     438    107589734 :     if (likely(match != -1))
     439              :     {
     440              :         /* update cache for the next lookup */
     441     32678408 :         PrivateRefCountEntryLast = match;
     442              : 
     443     32678408 :         return &PrivateRefCountArray[match];
     444              :     }
     445              : 
     446              :     /*
     447              :      * By here we know that the buffer, if already pinned, isn't residing in
     448              :      * the array.
     449              :      *
     450              :      * Only look up the buffer in the hashtable if we've previously overflowed
     451              :      * into it.
     452              :      */
     453     74911326 :     if (PrivateRefCountOverflowed == 0)
     454     74307096 :         return NULL;
     455              : 
     456       604230 :     res = refcount_lookup(PrivateRefCountHash, buffer);
     457              : 
     458       604230 :     if (res == NULL)
     459       361507 :         return NULL;
     460       242723 :     else if (!do_move)
     461              :     {
     462              :         /* caller doesn't want us to move the hash entry into the array */
     463       144467 :         return res;
     464              :     }
     465              :     else
     466              :     {
     467              :         /* move buffer from hashtable into the free array slot */
     468              :         PrivateRefCountEntry *free;
     469              :         PrivateRefCountData data;
     470              : 
     471              :         /* Save data and delete from hashtable while res is still valid */
     472        98256 :         data = res->data;
     473        98256 :         refcount_delete_item(PrivateRefCountHash, res);
     474              :         Assert(PrivateRefCountOverflowed > 0);
     475        98256 :         PrivateRefCountOverflowed--;
     476              : 
     477              :         /* Ensure there's a free array slot */
     478        98256 :         ReservePrivateRefCountEntry();
     479              : 
     480              :         /* Use up the reserved slot */
     481              :         Assert(ReservedRefCountSlot != -1);
     482        98256 :         free = &PrivateRefCountArray[ReservedRefCountSlot];
     483              :         Assert(PrivateRefCountArrayKeys[ReservedRefCountSlot] == free->buffer);
     484              :         Assert(free->buffer == InvalidBuffer);
     485              : 
     486              :         /* and fill it */
     487        98256 :         free->buffer = buffer;
     488        98256 :         free->data = data;
     489        98256 :         PrivateRefCountArrayKeys[ReservedRefCountSlot] = buffer;
     490              :         /* update cache for the next lookup */
     491        98256 :         PrivateRefCountEntryLast = ReservedRefCountSlot;
     492              : 
     493        98256 :         ReservedRefCountSlot = -1;
     494              : 
     495        98256 :         return free;
     496              :     }
     497              : }
     498              : 
     499              : /*
     500              :  * Return the PrivateRefCount entry for the passed buffer.
     501              :  *
     502              :  * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
     503              :  * do_move is true and the entry resides in the hashtable, the entry is
     504              :  * optimized for frequent access by moving it to the array.
     505              :  */
     506              : static inline PrivateRefCountEntry *
     507    442963207 : GetPrivateRefCountEntry(Buffer buffer, bool do_move)
     508              : {
     509              :     Assert(BufferIsValid(buffer));
     510              :     Assert(!BufferIsLocal(buffer));
     511              : 
     512              :     /*
     513              :      * It's very common to look up the same buffer repeatedly. To make that
     514              :      * fast, we have a one-entry cache.
     515              :      *
     516              :      * In contrast to the loop in GetPrivateRefCountEntrySlow(), here it is
     517              :      * faster to check PrivateRefCountArray[].buffer, as in the case of a hit
     518              :      * fewer addresses are computed and fewer cachelines are accessed. Whereas
     519              :      * in GetPrivateRefCountEntrySlow()'s case, checking
     520              :      * PrivateRefCountArrayKeys saves a lot of memory accesses.
     521              :      */
     522    442963207 :     if (likely(PrivateRefCountEntryLast != -1) &&
     523    442947153 :         likely(PrivateRefCountArray[PrivateRefCountEntryLast].buffer == buffer))
     524              :     {
     525    335373473 :         return &PrivateRefCountArray[PrivateRefCountEntryLast];
     526              :     }
     527              : 
     528              :     /*
     529              :      * The code for the cached lookup is small enough to be worth inlining
     530              :      * into the caller. In the miss case however, that empirically doesn't
     531              :      * seem worth it.
     532              :      */
     533    107589734 :     return GetPrivateRefCountEntrySlow(buffer, do_move);
     534              : }
     535              : 
     536              : /*
     537              :  * Returns how many times the passed buffer is pinned by this backend, or 0
     538              :  * if this backend has no private refcount entry for it.
     539              :  * Only works for shared memory buffers (asserted below)!
     540              :  */
     541              : static inline int32
     542      3348154 : GetPrivateRefCount(Buffer buffer)
     543              : {
     544              :     PrivateRefCountEntry *ref;
     545              : 
     546              :     Assert(BufferIsValid(buffer));
     547              :     Assert(!BufferIsLocal(buffer));
     548              : 
     549              :     /*
     550              :      * Not moving the entry (do_move = false) - that's ok for the current
     551              :      * users, but we might want to change this one day.
     552              :      */
     553      3348154 :     ref = GetPrivateRefCountEntry(buffer, false);
     554              : 
     555      3348154 :     if (ref == NULL)
     556           31 :         return 0;       /* no entry found: report zero pins */
     557      3348123 :     return ref->data.refcount;
     558              : }
     559              : 
     560              : /*
     561              :  * Release the tracking entry of a buffer which we no longer have pinned and
     562              :  * don't want to pin again immediately; refcount must be 0 and no lock held.
     563              :  */
     564              : static void
     565     77278946 : ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
     566              : {
     567              :     Assert(ref->data.refcount == 0);
     568              :     Assert(ref->data.lockmode == BUFFER_LOCK_UNLOCK);
     569              : 
     570     77278946 :     if (ref >= &PrivateRefCountArray[0] &&
     571              :         ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])    /* ref is an array slot */
     572              :     {
     573     77186523 :         ref->buffer = InvalidBuffer;
     574     77186523 :         PrivateRefCountArrayKeys[ref - PrivateRefCountArray] = InvalidBuffer;
     575              :         /* The slot and its PrivateRefCountArrayKeys mirror are both cleared. */
     576              : 
     577              :         /*
     578              :          * Mark the just used entry as reserved - in many scenarios that
     579              :          * allows us to avoid ever having to search the array/hash for free
     580              :          * entries.
     581              :          */
     582     77186523 :         ReservedRefCountSlot = ref - PrivateRefCountArray;
     583              :     }
     584              :     else                /* not an array slot: entry lives in PrivateRefCountHash */
     585              :     {
     586        92423 :         refcount_delete_item(PrivateRefCountHash, ref);
     587              :         Assert(PrivateRefCountOverflowed > 0);
     588        92423 :         PrivateRefCountOverflowed--;
     589              :     }
     590     77278946 : }
     591              : 
     592              : /*
     593              :  * BufferIsPinned
     594              :  *      True iff the buffer is pinned (also checks for valid buffer number).
     595              :  *      Local buffers consult LocalRefCount[], shared ones GetPrivateRefCount().
     596              :  *      NOTE: what we check here is that *this* backend holds a pin on
     597              :  *      the buffer; other backends' pins are ignored.  bufnum is expanded more than once.
     598              :  */
     599              : #define BufferIsPinned(bufnum) \
     600              : ( \
     601              :     !BufferIsValid(bufnum) ? \
     602              :         false \
     603              :     : \
     604              :         BufferIsLocal(bufnum) ? \
     605              :             (LocalRefCount[-(bufnum) - 1] > 0) \
     606              :         : \
     607              :     (GetPrivateRefCount(bufnum) > 0) \
     608              : )
     609              : 
     610              : 
     611              : static Buffer ReadBuffer_common(Relation rel,
     612              :                                 SMgrRelation smgr, char smgr_persistence,
     613              :                                 ForkNumber forkNum, BlockNumber blockNum,
     614              :                                 ReadBufferMode mode, BufferAccessStrategy strategy);
     615              : static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr,
     616              :                                            ForkNumber fork,
     617              :                                            BufferAccessStrategy strategy,
     618              :                                            uint32 flags,
     619              :                                            uint32 extend_by,
     620              :                                            BlockNumber extend_upto,
     621              :                                            Buffer *buffers,
     622              :                                            uint32 *extended_by);
     623              : static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr,
     624              :                                            ForkNumber fork,
     625              :                                            BufferAccessStrategy strategy,
     626              :                                            uint32 flags,
     627              :                                            uint32 extend_by,
     628              :                                            BlockNumber extend_upto,
     629              :                                            Buffer *buffers,
     630              :                                            uint32 *extended_by);
     631              : static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
     632              :                       bool skip_if_not_valid);
     633              : static void PinBuffer_Locked(BufferDesc *buf);
     634              : static void UnpinBuffer(BufferDesc *buf);
     635              : static void UnpinBufferNoOwner(BufferDesc *buf);
     636              : static void BufferSync(int flags);
     637              : static int  SyncOneBuffer(int buf_id, bool skip_recently_used,
     638              :                           WritebackContext *wb_context);
     639              : static void WaitIO(BufferDesc *buf);
     640              : static void AbortBufferIO(Buffer buffer);
     641              : static void shared_buffer_write_error_callback(void *arg);
     642              : static void local_buffer_write_error_callback(void *arg);
     643              : static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
     644              :                                       char relpersistence,
     645              :                                       ForkNumber forkNum,
     646              :                                       BlockNumber blockNum,
     647              :                                       BufferAccessStrategy strategy,
     648              :                                       bool *foundPtr, IOContext io_context);
     649              : static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress);
     650              : static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete);
     651              : 
     652              : static pg_attribute_always_inline void TrackBufferHit(IOObject io_object,
     653              :                                                       IOContext io_context,
     654              :                                                       Relation rel, char persistence, SMgrRelation smgr,
     655              :                                                       ForkNumber forknum, BlockNumber blocknum);
     656              : static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
     657              : static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
     658              :                                 IOObject io_object, IOContext io_context);
     659              : static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
     660              :                         IOObject io_object, IOContext io_context);
     661              : static void FindAndDropRelationBuffers(RelFileLocator rlocator,
     662              :                                        ForkNumber forkNum,
     663              :                                        BlockNumber nForkBlock,
     664              :                                        BlockNumber firstDelBlock);
     665              : static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
     666              :                                            RelFileLocator dstlocator,
     667              :                                            ForkNumber forkNum, bool permanent);
     668              : static void AtProcExit_Buffers(int code, Datum arg);
     669              : static void CheckForBufferLeaks(void);
     670              : #ifdef USE_ASSERT_CHECKING
     671              : static void AssertNotCatalogBufferLock(Buffer buffer, BufferLockMode mode);
     672              : #endif
     673              : static int  rlocator_comparator(const void *p1, const void *p2);
     674              : static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
     675              : static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
     676              : static int  ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
     677              : 
     678              : static void BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode);
     679              : static void BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr);
     680              : static bool BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode);
     681              : static bool BufferLockHeldByMeInMode(BufferDesc *buf_hdr, BufferLockMode mode);
     682              : static bool BufferLockHeldByMe(BufferDesc *buf_hdr);
     683              : static inline void BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr);
     684              : static inline int BufferLockDisownInternal(Buffer buffer, BufferDesc *buf_hdr);
     685              : static inline bool BufferLockAttempt(BufferDesc *buf_hdr, BufferLockMode mode);
     686              : static void BufferLockQueueSelf(BufferDesc *buf_hdr, BufferLockMode mode);
     687              : static void BufferLockDequeueSelf(BufferDesc *buf_hdr);
     688              : static void BufferLockWakeup(BufferDesc *buf_hdr, bool wake_exclusive);
     689              : static void BufferLockProcessRelease(BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate);
     690              : static inline uint64 BufferLockReleaseSub(BufferLockMode mode);
     691              : 
     692              : 
     693              : /*
     694              :  * Implementation of PrefetchBuffer() for shared buffers; works on an SMgrRelation directly.
     695              :  */
     696              : PrefetchBufferResult
     697        39587 : PrefetchSharedBuffer(SMgrRelation smgr_reln,
     698              :                      ForkNumber forkNum,
     699              :                      BlockNumber blockNum)
     700              : {
     701        39587 :     PrefetchBufferResult result = {InvalidBuffer, false};
     702              :     BufferTag   newTag;         /* identity of requested block */
     703              :     uint32      newHash;        /* hash value for newTag */
     704              :     LWLock     *newPartitionLock;   /* buffer partition lock for it */
     705              :     int         buf_id;         /* lookup result; negative when not found */
     706              : 
     707              :     Assert(BlockNumberIsValid(blockNum));
     708              : 
     709              :     /* create a tag so we can lookup the buffer */
     710        39587 :     InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
     711              :                   forkNum, blockNum);
     712              : 
     713              :     /* determine its hash code and partition lock ID */
     714        39587 :     newHash = BufTableHashCode(&newTag);
     715        39587 :     newPartitionLock = BufMappingPartitionLock(newHash);
     716              : 
     717              :     /* see if the block is in the buffer pool already (lock held for the lookup only) */
     718        39587 :     LWLockAcquire(newPartitionLock, LW_SHARED);
     719        39587 :     buf_id = BufTableLookup(&newTag, newHash);
     720        39587 :     LWLockRelease(newPartitionLock);
     721              : 
     722              :     /* If not in buffers, initiate prefetch */
     723        39587 :     if (buf_id < 0)
     724              :     {
     725              : #ifdef USE_PREFETCH
     726              :         /*
     727              :          * Try to initiate an asynchronous read, unless direct I/O is used for
     728              :          * data.  This returns false in recovery if the relation file doesn't exist.
     729              :          */
     730        18681 :         if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
     731         9214 :             smgrprefetch(smgr_reln, forkNum, blockNum, 1))
     732              :         {
     733         9214 :             result.initiated_io = true;
     734              :         }
     735              : #endif                          /* USE_PREFETCH */
     736              :     }
     737              :     else
     738              :     {
     739              :         /*
     740              :          * Report the buffer it was in at that time.  The caller may be able
     741              :          * to avoid a buffer table lookup, but it's not pinned and it must be
     742              :          * rechecked!
     743              :          */
     744        30120 :         result.recent_buffer = buf_id + 1;  /* 0-based id -> 1-based Buffer */
     745              :     }
     746              : 
     747              :     /*
     748              :      * If the block *is* in buffers, we do nothing.  This is not really ideal:
     749              :      * the block might be just about to be evicted, which would be stupid
     750              :      * since we know we are going to need it soon.  But the only easy answer
     751              :      * is to bump the usage_count, which does not seem like a great solution:
     752              :      * when the caller does ultimately touch the block, usage_count would get
     753              :      * bumped again, resulting in too much favoritism for blocks that are
     754              :      * involved in a prefetch sequence. A real fix would involve some
     755              :      * additional per-buffer state, and it's not clear that there's enough of
     756              :      * a problem to justify that.
     757              :      */
     758              : 
     759        39587 :     return result;
     760              : }
     761              : 
     762              : /*
     763              :  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
     764              :  *
     765              :  * This is named by analogy to ReadBuffer but doesn't actually allocate a
     766              :  * buffer.  Instead it tries to ensure that a future ReadBuffer for the given
     767              :  * block will not be delayed by the I/O.  Prefetching is optional.
     768              :  *
     769              :  * There are three possible outcomes:
     770              :  *
     771              :  * 1.  If the block is already cached, the result's recent_buffer member is a
     772              :  * valid buffer that the caller could use to avoid a later buffer lookup, but
     773              :  * it's not pinned, so the caller must recheck it.
     774              :  *
     775              :  * 2.  If the kernel has been asked to initiate I/O, the initiated_io member is
     776              :  * true.  Currently there is no way to know if the data was already cached by
     777              :  * the kernel and therefore didn't really initiate I/O, and no way to know when
     778              :  * the I/O completes other than using synchronous ReadBuffer().
     779              :  *
     780              :  * 3.  Otherwise, the buffer wasn't already cached by PostgreSQL, and
     781              :  * USE_PREFETCH is not defined (this build doesn't support prefetching due to
     782              :  * lack of a kernel facility), direct I/O is enabled, or the underlying
     783              :  * relation file wasn't found and we are in recovery.  (If the relation file
     784              :  * wasn't found and we are not in recovery, an error is raised).
     785              :  */
     786              : PrefetchBufferResult
     787        28632 : PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
     788              : {
     789              :     Assert(RelationIsValid(reln));
     790              :     Assert(BlockNumberIsValid(blockNum));
     791              : 
     792        28632 :     if (RelationUsesLocalBuffers(reln))
     793              :     {
     794              :         /* reject other sessions' temp tables; see comments in ReadBuffer_common */
     795         1357 :         if (RELATION_IS_OTHER_TEMP(reln))
     796            0 :             ereport(ERROR,
     797              :                     (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     798              :                      errmsg("cannot access temporary tables of other sessions")));
     799              : 
     800              :         /* pass it off to localbuf.c */
     801         1357 :         return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
     802              :     }
     803              :     else
     804              :     {
     805              :         /* pass it to the shared buffer version */
     806        27275 :         return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
     807              :     }
     808              : }
     809              : 
     810              : /*
     811              :  * ReadRecentBuffer -- try to pin a block in a recently observed buffer
     812              :  *
     813              :  * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
     814              :  * successful.  Return true if the buffer is valid and still has the expected
     815              :  * tag; the buffer is then pinned and the usage count is bumped.  Else return false.
     816              :  */
     817              : bool
     818         5111 : ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum,
     819              :                  Buffer recent_buffer)
     820              : {
     821              :     BufferDesc *bufHdr;
     822              :     BufferTag   tag;            /* tag the buffer is expected to carry */
     823              :     uint64      buf_state;      /* only used on the local-buffer path */
     824              : 
     825              :     Assert(BufferIsValid(recent_buffer));
     826              : 
     827         5111 :     ResourceOwnerEnlarge(CurrentResourceOwner);
     828         5111 :     ReservePrivateRefCountEntry();
     829         5111 :     InitBufferTag(&tag, &rlocator, forkNum, blockNum);
     830              : 
     831         5111 :     if (BufferIsLocal(recent_buffer))
     832              :     {
     833          140 :         int         b = -recent_buffer - 1; /* negative Buffer -> 0-based local index */
     834              : 
     835          140 :         bufHdr = GetLocalBufferDescriptor(b);
     836          140 :         buf_state = pg_atomic_read_u64(&bufHdr->state);
     837              : 
     838              :         /* Is it still valid and holding the right tag? */
     839          140 :         if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
     840              :         {
     841          140 :             PinLocalBuffer(bufHdr, true);
     842              : 
     843          140 :             pgBufferUsage.local_blks_hit++;
     844              : 
     845          140 :             return true;
     846              :         }
     847              :     }
     848              :     else
     849              :     {
     850         4971 :         bufHdr = GetBufferDescriptor(recent_buffer - 1);    /* 1-based Buffer -> 0-based id */
     851              : 
     852              :         /*
     853              :          * Is it still valid and holding the right tag?  We do an unlocked tag
     854              :          * comparison first, to make it unlikely that we'll increment the
     855              :          * usage counter of the wrong buffer, if someone calls us with a very
     856              :          * out of date recent_buffer.  Then we'll check it again if we get the
     857              :          * pin.
     858              :          */
     859         9906 :         if (BufferTagsEqual(&tag, &bufHdr->tag) &&
     860         4935 :             PinBuffer(bufHdr, NULL, true))
     861              :         {
     862         4929 :             if (BufferTagsEqual(&tag, &bufHdr->tag))
     863              :             {
     864         4929 :                 pgBufferUsage.shared_blks_hit++;
     865         4929 :                 return true;
     866              :             }
     867            0 :             UnpinBuffer(bufHdr);    /* tag no longer matches after pinning: drop the pin */
     868              :         }
     869              :     }
     870              : 
     871           42 :     return false;
     872              : }
     873              : 
     874              : /*
     875              :  * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
     876              :  *      fork with RBM_NORMAL mode and default strategy (strategy = NULL).
     877              :  */
     878              : Buffer
     879     63063435 : ReadBuffer(Relation reln, BlockNumber blockNum)
     880              : {
     881     63063435 :     return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
     882              : }
     883              : 
     884              : /*
     885              :  * ReadBufferExtended -- returns a buffer containing the requested
     886              :  *      block of the requested relation.  If the blknum
     887              :  *      requested is P_NEW, extend the relation file and
     888              :  *      allocate a new block.  (Caller is responsible for
     889              :  *      ensuring that only one backend tries to extend a
     890              :  *      relation at the same time!)
     891              :  *
     892              :  * Returns: the buffer number for the buffer containing
     893              :  *      the block read.  The returned buffer has been pinned.
     894              :  *      Does not return on error --- elog's instead.
     895              :  *
     896              :  * Assumes that reln has already been opened when this function is called.
     897              :  *
     898              :  * In RBM_NORMAL mode, the page is read from disk, and the page header is
     899              :  * validated.  An error is thrown if the page header is not valid.  (But
     900              :  * note that an all-zero page is considered "valid"; see
     901              :  * PageIsVerified().)
     902              :  *
     903              :  * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
     904              :  * valid, the page is zeroed instead of throwing an error. This is intended
     905              :  * for non-critical data, where the caller is prepared to repair errors.
     906              :  *
     907              :  * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
     908              :  * filled with zeros instead of reading it from disk.  Useful when the caller
     909              :  * is going to fill the page from scratch, since this saves I/O and avoids
     910              :  * unnecessary failure if the page-on-disk has corrupt page headers.
     911              :  * The page is returned locked to ensure that the caller has a chance to
     912              :  * initialize the page before it's made visible to others.
     913              :  * Caution: do not use this mode to read a page that is beyond the relation's
     914              :  * current physical EOF; that is likely to cause problems in md.c when
     915              :  * the page is modified and written out. P_NEW is OK, though.
     916              :  *
     917              :  * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
     918              :  * a cleanup-strength lock on the page.
     919              :  *
     920              :  * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
     921              :  *
     922              :  * If strategy is not NULL, a nondefault buffer access strategy is used.
     923              :  * See buffer/README for details.
     924              :  */
     925              : inline Buffer
     926     74489663 : ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
     927              :                    ReadBufferMode mode, BufferAccessStrategy strategy)
     928              : {
     929              :     Buffer      buf;
     930              : 
     931              :     /*
     932              :      * Read the buffer, and update pgstat counters to reflect a cache hit or
     933              :      * miss.  The other-session temp-relation check is enforced by
     934              :      * ReadBuffer_common().
     935              :      */
     936     74489663 :     buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0, /* 0 = no explicit smgr_persistence; presumably derived from reln - confirm */
     937              :                             forkNum, blockNum, mode, strategy);
     938              : 
     939     74489638 :     return buf;
     940              : }
     941              : 
     942              : 
     943              : /*
     944              :  * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
     945              :  *      a relcache entry for the relation; the smgr handle is opened from rlocator.
     946              :  *
     947              :  * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
     948              :  * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
     949              :  * cannot be used for temporary relations (and making that work might be
     950              :  * difficult, unless we only want to read temporary relations for our own
     951              :  * ProcNumber).
     952              :  */
     953              : Buffer
     954      5964036 : ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
     955              :                           BlockNumber blockNum, ReadBufferMode mode,
     956              :                           BufferAccessStrategy strategy, bool permanent)
     957              : {
     958      5964036 :     SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
     959              : 
     960      5964036 :     return ReadBuffer_common(NULL, smgr,    /* rel is NULL: no relcache entry */
     961              :                              permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
     962              :                              forkNum, blockNum,
     963              :                              mode, strategy);
     964              : }
     965              : 
     966              : /*
     967              :  * Convenience wrapper around ExtendBufferedRelBy() extending by one block; returns that block's buffer.
     968              :  */
     969              : Buffer
     970        56978 : ExtendBufferedRel(BufferManagerRelation bmr,
     971              :                   ForkNumber forkNum,
     972              :                   BufferAccessStrategy strategy,
     973              :                   uint32 flags)
     974              : {
     975              :     Buffer      buf;
     976        56978 :     uint32      extend_by = 1;  /* also receives the extended_by output below */
     977              : 
     978        56978 :     ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
     979              :                         &buf, &extend_by);  /* one-element buffers array; block number result unused */
     980              : 
     981        56978 :     return buf;
     982              : }
     983              : 
     984              : /*
     985              :  * Extend relation by multiple blocks.
     986              :  *
     987              :  * Tries to extend the relation by extend_by blocks. Depending on the
     988              :  * availability of resources the relation may end up being extended by a
     989              :  * smaller number of pages (unless an error is thrown, always by at least one
     990              :  * page). *extended_by is updated to the number of pages the relation has been
     991              :  * extended by.
     992              :  *
     993              :  * buffers needs to be an array that is at least extend_by long. Upon
     994              :  * completion, the first extend_by array elements will point to a pinned
     995              :  * buffer.  NOTE(review): presumably only *extended_by elements if fewer pages were added - confirm.
     996              :  *
     997              :  * If EB_LOCK_FIRST is part of flags, the first returned buffer is
     998              :  * locked. This is useful for callers that want a buffer that is guaranteed to
     999              :  * be empty.
    1000              :  */
    1001              : BlockNumber
    1002       210818 : ExtendBufferedRelBy(BufferManagerRelation bmr,
    1003              :                     ForkNumber fork,
    1004              :                     BufferAccessStrategy strategy,
    1005              :                     uint32 flags,
    1006              :                     uint32 extend_by,
    1007              :                     Buffer *buffers,
    1008              :                     uint32 *extended_by)
    1009              : {
    1010              :     Assert((bmr.rel != NULL) != (bmr.smgr != NULL));    /* exactly one of rel / smgr is given */
    1011              :     Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
    1012              :     Assert(extend_by > 0);
    1013              : 
    1014       210818 :     if (bmr.relpersistence == '\0') /* not supplied: take it from the relcache entry */
    1015       210818 :         bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
    1016              : 
    1017       210818 :     return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
    1018              :                                    extend_by, InvalidBlockNumber,   /* extend_upto: presumably "no limit" - confirm */
    1019              :                                    buffers, extended_by);
    1020              : }
    1021              : 
    1022              : /*
    1023              :  * Extend the relation so it is at least extend_to blocks large, return buffer
    1024              :  * (extend_to - 1).
    1025              :  *
    1026              :  * This is useful for callers that want to write a specific page, regardless
    1027              :  * of the current size of the relation (e.g. useful for visibilitymap and for
    1028              :  * crash recovery).
    1029              :  */
    1030              : Buffer
    1031        55722 : ExtendBufferedRelTo(BufferManagerRelation bmr,
    1032              :                     ForkNumber fork,
    1033              :                     BufferAccessStrategy strategy,
    1034              :                     uint32 flags,
    1035              :                     BlockNumber extend_to,
    1036              :                     ReadBufferMode mode)
    1037              : {
    1038              :     BlockNumber current_size;
    1039        55722 :     uint32      extended_by = 0;
    1040        55722 :     Buffer      buffer = InvalidBuffer;
    1041              :     Buffer      buffers[64];
    1042              : 
    1043              :     Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
    1044              :     Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
    1045              :     Assert(extend_to != InvalidBlockNumber && extend_to > 0);
    1046              : 
    1047        55722 :     if (bmr.relpersistence == '\0')
    1048         9453 :         bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
    1049              : 
    1050              :     /*
    1051              :      * If desired, create the file if it doesn't exist.  If
    1052              :      * smgr_cached_nblocks[fork] is positive then it must exist, no need for
    1053              :      * an smgrexists call.
    1054              :      */
    1055        55722 :     if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
    1056         9453 :         (BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == 0 ||
    1057           29 :          BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
    1058         9424 :         !smgrexists(BMR_GET_SMGR(bmr), fork))
    1059              :     {
    1060         9394 :         LockRelationForExtension(bmr.rel, ExclusiveLock);
    1061              : 
    1062              :         /* recheck, fork might have been created concurrently */
    1063         9394 :         if (!smgrexists(BMR_GET_SMGR(bmr), fork))
    1064         9389 :             smgrcreate(BMR_GET_SMGR(bmr), fork, flags & EB_PERFORMING_RECOVERY);
    1065              : 
    1066         9394 :         UnlockRelationForExtension(bmr.rel, ExclusiveLock);
    1067              :     }
    1068              : 
    1069              :     /*
    1070              :      * If requested, invalidate size cache, so that smgrnblocks asks the
    1071              :      * kernel.
    1072              :      */
    1073        55722 :     if (flags & EB_CLEAR_SIZE_CACHE)
    1074         9453 :         BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
    1075              : 
    1076              :     /*
    1077              :      * Estimate how many pages we'll need to extend by. This avoids acquiring
    1078              :      * unnecessarily many victim buffers.
    1079              :      */
    1080        55722 :     current_size = smgrnblocks(BMR_GET_SMGR(bmr), fork);
    1081              : 
    1082              :     /*
    1083              :      * Since no-one else can be looking at the page contents yet, there is no
    1084              :      * difference between an exclusive lock and a cleanup-strength lock. Note
    1085              :      * that we pass the original mode to ReadBuffer_common() below, when
    1086              :      * falling back to reading the buffer to a concurrent relation extension.
    1087              :      */
    1088        55722 :     if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
    1089        45877 :         flags |= EB_LOCK_TARGET;
    1090              : 
    1091       113640 :     while (current_size < extend_to)
    1092              :     {
    1093        57918 :         uint32      num_pages = lengthof(buffers);
    1094              :         BlockNumber first_block;
    1095              : 
    1096        57918 :         if ((uint64) current_size + num_pages > extend_to)
    1097        57852 :             num_pages = extend_to - current_size;
    1098              : 
    1099        57918 :         first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
    1100              :                                               num_pages, extend_to,
    1101              :                                               buffers, &extended_by);
    1102              : 
    1103        57918 :         current_size = first_block + extended_by;
    1104              :         Assert(num_pages != 0 || current_size >= extend_to);
    1105              : 
    1106       124604 :         for (uint32 i = 0; i < extended_by; i++)
    1107              :         {
    1108        66686 :             if (first_block + i != extend_to - 1)
    1109        10972 :                 ReleaseBuffer(buffers[i]);
    1110              :             else
    1111        55714 :                 buffer = buffers[i];
    1112              :         }
    1113              :     }
    1114              : 
    1115              :     /*
    1116              :      * It's possible that another backend concurrently extended the relation.
    1117              :      * In that case read the buffer.
    1118              :      *
    1119              :      * XXX: Should we control this via a flag?
    1120              :      */
    1121        55722 :     if (buffer == InvalidBuffer)
    1122              :     {
    1123              :         Assert(extended_by == 0);
    1124            8 :         buffer = ReadBuffer_common(bmr.rel, BMR_GET_SMGR(bmr), bmr.relpersistence,
    1125              :                                    fork, extend_to - 1, mode, strategy);
    1126              :     }
    1127              : 
    1128        55722 :     return buffer;
    1129              : }
    1130              : 
    1131              : /*
    1132              :  * Lock and optionally zero a buffer, as part of the implementation of
    1133              :  * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK.  The buffer must be already
    1134              :  * pinned.  If the buffer is not already valid, it is zeroed and made valid.
    1135              :  */
    1136              : static void
    1137       355704 : ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
    1138              : {
    1139              :     BufferDesc *bufHdr;
    1140              :     bool        need_to_zero;
    1141       355704 :     bool        isLocalBuf = BufferIsLocal(buffer);
    1142              :     StartBufferIOResult sbres;
    1143              : 
    1144              :     Assert(mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);
    1145              : 
    1146       355704 :     if (already_valid)
    1147              :     {
    1148              :         /*
    1149              :          * If the caller already knew the buffer was valid, we can skip some
    1150              :          * header interaction.  The caller just wants to lock the buffer.
    1151              :          */
    1152        38267 :         need_to_zero = false;
    1153              :     }
    1154              :     else
    1155              :     {
    1156       317437 :         if (isLocalBuf)
    1157              :         {
    1158              :             /* Simple case for non-shared buffers. */
    1159           30 :             bufHdr = GetLocalBufferDescriptor(-buffer - 1);
    1160           30 :             sbres = StartLocalBufferIO(bufHdr, true, true, NULL);
    1161              :         }
    1162              :         else
    1163              :         {
    1164              :             /*
    1165              :              * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
    1166              :              * concurrently.  Even though we aren't doing I/O, that ensures
    1167              :              * that we don't zero a page that someone else has pinned.  An
    1168              :              * exclusive content lock wouldn't be enough, because readers are
    1169              :              * allowed to drop the content lock after determining that a tuple
    1170              :              * is visible (see buffer access rules in README).
    1171              :              */
    1172       317407 :             bufHdr = GetBufferDescriptor(buffer - 1);
    1173       317407 :             sbres = StartSharedBufferIO(bufHdr, true, true, NULL);
    1174              :         }
    1175              : 
    1176              :         Assert(sbres != BUFFER_IO_IN_PROGRESS);
    1177       317437 :         need_to_zero = sbres == BUFFER_IO_READY_FOR_IO;
    1178              :     }
    1179              : 
    1180       355704 :     if (need_to_zero)
    1181              :     {
    1182       317437 :         memset(BufferGetPage(buffer), 0, BLCKSZ);
    1183              : 
    1184              :         /*
    1185              :          * Grab the buffer content lock before marking the page as valid, to
    1186              :          * make sure that no other backend sees the zeroed page before the
    1187              :          * caller has had a chance to initialize it.
    1188              :          *
    1189              :          * Since no-one else can be looking at the page contents yet, there is
    1190              :          * no difference between an exclusive lock and a cleanup-strength
    1191              :          * lock. (Note that we cannot use LockBuffer() or
    1192              :          * LockBufferForCleanup() here, because they assert that the buffer is
    1193              :          * already valid.)
    1194              :          */
    1195       317437 :         if (!isLocalBuf)
    1196       317407 :             LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    1197              : 
    1198              :         /* Set BM_VALID, terminate IO, and wake up any waiters */
    1199       317437 :         if (isLocalBuf)
    1200           30 :             TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
    1201              :         else
    1202       317407 :             TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
    1203              :     }
    1204        38267 :     else if (!isLocalBuf)
    1205              :     {
    1206              :         /*
    1207              :          * The buffer is valid, so we can't zero it.  The caller still expects
    1208              :          * the page to be locked on return.
    1209              :          */
    1210        38247 :         if (mode == RBM_ZERO_AND_LOCK)
    1211        38186 :             LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    1212              :         else
    1213           61 :             LockBufferForCleanup(buffer);
    1214              :     }
    1215       355704 : }
    1216              : 
    1217              : /*
    1218              :  * Pin a buffer for a given block.  *foundPtr is set to true if the block was
    1219              :  * already present, or false if more work is required to either read it in or
    1220              :  * zero it.
    1221              :  */
    1222              : static pg_attribute_always_inline Buffer
    1223     85606283 : PinBufferForBlock(Relation rel,
    1224              :                   SMgrRelation smgr,
    1225              :                   char persistence,
    1226              :                   ForkNumber forkNum,
    1227              :                   BlockNumber blockNum,
    1228              :                   BufferAccessStrategy strategy,
    1229              :                   IOObject io_object,
    1230              :                   IOContext io_context,
    1231              :                   bool *foundPtr)
    1232              : {
    1233              :     BufferDesc *bufHdr;
    1234              : 
    1235              :     Assert(blockNum != P_NEW);
    1236              : 
    1237              :     /* Persistence should be set before */
    1238              :     Assert((persistence == RELPERSISTENCE_TEMP ||
    1239              :             persistence == RELPERSISTENCE_PERMANENT ||
    1240              :             persistence == RELPERSISTENCE_UNLOGGED));
    1241              : 
    1242              :     TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
    1243              :                                        smgr->smgr_rlocator.locator.spcOid,
    1244              :                                        smgr->smgr_rlocator.locator.dbOid,
    1245              :                                        smgr->smgr_rlocator.locator.relNumber,
    1246              :                                        smgr->smgr_rlocator.backend);
    1247              : 
    1248     85606283 :     if (persistence == RELPERSISTENCE_TEMP)
    1249      1648481 :         bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
    1250              :     else
    1251     83957802 :         bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
    1252              :                              strategy, foundPtr, io_context);
    1253              : 
    1254     85606275 :     if (*foundPtr)
    1255     83636654 :         TrackBufferHit(io_object, io_context, rel, persistence, smgr, forkNum, blockNum);
    1256              : 
    1257     85606275 :     if (rel)
    1258              :     {
    1259              :         /*
    1260              :          * While pgBufferUsage's "read" counter isn't bumped unless we reach
    1261              :          * WaitReadBuffers() (so, not for hits, and not for buffers that are
    1262              :          * zeroed instead), the per-relation stats always count them.
    1263              :          */
    1264     79380593 :         pgstat_count_buffer_read(rel);
    1265              :     }
    1266              : 
    1267     85606275 :     return BufferDescriptorGetBuffer(bufHdr);
    1268              : }
    1269              : 
    1270              : /*
    1271              :  * ReadBuffer_common -- common logic for all ReadBuffer variants
    1272              :  *
    1273              :  * smgr is required, rel is optional unless using P_NEW.
    1274              :  */
    1275              : static pg_attribute_always_inline Buffer
    1276     80453707 : ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
    1277              :                   ForkNumber forkNum,
    1278              :                   BlockNumber blockNum, ReadBufferMode mode,
    1279              :                   BufferAccessStrategy strategy)
    1280              : {
    1281              :     ReadBuffersOperation operation;
    1282              :     Buffer      buffer;
    1283              :     int         flags;
    1284              :     char        persistence;
    1285              : 
    1286              :     /*
    1287              :      * Reject attempts to read non-local temporary relations; we would be
    1288              :      * likely to get wrong data since we have no visibility into the owning
    1289              :      * session's local buffers.  This is the canonical place for the check,
    1290              :      * covering the ReadBufferExtended() entry point and any other caller that
    1291              :      * supplies a Relation.
    1292              :      */
    1293     80453707 :     if (rel && RELATION_IS_OTHER_TEMP(rel))
    1294            2 :         ereport(ERROR,
    1295              :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
    1296              :                  errmsg("cannot access temporary tables of other sessions")));
    1297              : 
    1298              :     /*
    1299              :      * Backward compatibility path, most code should use ExtendBufferedRel()
    1300              :      * instead, as acquiring the extension lock inside ExtendBufferedRel()
    1301              :      * scales a lot better.
    1302              :      */
    1303     80453705 :     if (unlikely(blockNum == P_NEW))
    1304              :     {
    1305          322 :         uint32      flags = EB_SKIP_EXTENSION_LOCK;
    1306              : 
    1307              :         /*
    1308              :          * Since no-one else can be looking at the page contents yet, there is
    1309              :          * no difference between an exclusive lock and a cleanup-strength
    1310              :          * lock.
    1311              :          */
    1312          322 :         if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
    1313            0 :             flags |= EB_LOCK_FIRST;
    1314              : 
    1315          322 :         return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
    1316              :     }
    1317              : 
    1318     80453383 :     if (rel)
    1319     74489347 :         persistence = rel->rd_rel->relpersistence;
    1320              :     else
    1321      5964036 :         persistence = smgr_persistence;
    1322              : 
    1323     80453383 :     if (unlikely(mode == RBM_ZERO_AND_CLEANUP_LOCK ||
    1324              :                  mode == RBM_ZERO_AND_LOCK))
    1325              :     {
    1326              :         bool        found;
    1327              :         IOContext   io_context;
    1328              :         IOObject    io_object;
    1329              : 
    1330       355704 :         if (persistence == RELPERSISTENCE_TEMP)
    1331              :         {
    1332           50 :             io_context = IOCONTEXT_NORMAL;
    1333           50 :             io_object = IOOBJECT_TEMP_RELATION;
    1334              :         }
    1335              :         else
    1336              :         {
    1337       355654 :             io_context = IOContextForStrategy(strategy);
    1338       355654 :             io_object = IOOBJECT_RELATION;
    1339              :         }
    1340              : 
    1341       355704 :         buffer = PinBufferForBlock(rel, smgr, persistence,
    1342              :                                    forkNum, blockNum, strategy,
    1343              :                                    io_object, io_context, &found);
    1344       355704 :         ZeroAndLockBuffer(buffer, mode, found);
    1345       355704 :         return buffer;
    1346              :     }
    1347              : 
    1348              :     /*
    1349              :      * Signal that we are going to immediately wait. If we're immediately
    1350              :      * waiting, there is no benefit in actually executing the IO
    1351              :      * asynchronously, it would just add dispatch overhead.
    1352              :      */
    1353     80097679 :     flags = READ_BUFFERS_SYNCHRONOUSLY;
    1354     80097679 :     if (mode == RBM_ZERO_ON_ERROR)
    1355      2003832 :         flags |= READ_BUFFERS_ZERO_ON_ERROR;
    1356     80097679 :     operation.smgr = smgr;
    1357     80097679 :     operation.rel = rel;
    1358     80097679 :     operation.persistence = persistence;
    1359     80097679 :     operation.forknum = forkNum;
    1360     80097679 :     operation.strategy = strategy;
    1361     80097679 :     if (StartReadBuffer(&operation,
    1362              :                         &buffer,
    1363              :                         blockNum,
    1364              :                         flags))
    1365       767401 :         WaitReadBuffers(&operation);
    1366              : 
    1367     80097656 :     return buffer;
    1368              : }
    1369              : 
    1370              : static pg_attribute_always_inline bool
    1371     85066290 : StartReadBuffersImpl(ReadBuffersOperation *operation,
    1372              :                      Buffer *buffers,
    1373              :                      BlockNumber blockNum,
    1374              :                      int *nblocks,
    1375              :                      int flags,
    1376              :                      bool allow_forwarding)
    1377              : {
    1378     85066290 :     int         actual_nblocks = *nblocks;
    1379     85066290 :     int         maxcombine = 0;
    1380              :     bool        did_start_io;
    1381              :     IOContext   io_context;
    1382              :     IOObject    io_object;
    1383              : 
    1384              :     Assert(*nblocks == 1 || allow_forwarding);
    1385              :     Assert(*nblocks > 0);
    1386              :     Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
    1387              : 
    1388              :     /* see comments in ReadBuffer_common */
    1389     85066290 :     if (operation->rel && RELATION_IS_OTHER_TEMP(operation->rel))
    1390            0 :         ereport(ERROR,
    1391              :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
    1392              :                  errmsg("cannot access temporary tables of other sessions")));
    1393              : 
    1394     85066290 :     if (operation->persistence == RELPERSISTENCE_TEMP)
    1395              :     {
    1396      1640235 :         io_context = IOCONTEXT_NORMAL;
    1397      1640235 :         io_object = IOOBJECT_TEMP_RELATION;
    1398              :     }
    1399              :     else
    1400              :     {
    1401     83426055 :         io_context = IOContextForStrategy(operation->strategy);
    1402     83426055 :         io_object = IOOBJECT_RELATION;
    1403              :     }
    1404              : 
    1405     86718488 :     for (int i = 0; i < actual_nblocks; ++i)
    1406              :     {
    1407              :         bool        found;
    1408              : 
    1409     85252809 :         if (allow_forwarding && buffers[i] != InvalidBuffer)
    1410         2230 :         {
    1411              :             BufferDesc *bufHdr;
    1412              : 
    1413              :             /*
    1414              :              * This is a buffer that was pinned by an earlier call to
    1415              :              * StartReadBuffers(), but couldn't be handled in one operation at
    1416              :              * that time.  The operation was split, and the caller has passed
    1417              :              * an already pinned buffer back to us to handle the rest of the
    1418              :              * operation.  It must continue at the expected block number.
    1419              :              */
    1420              :             Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);
    1421              : 
    1422              :             /*
    1423              :              * It might be an already valid buffer (a hit) that followed the
    1424              :              * final contiguous block of an earlier I/O (a miss) marking the
    1425              :              * end of it, or a buffer that some other backend has since made
    1426              :              * valid by performing the I/O for us, in which case we can handle
    1427              :              * it as a hit now.  It is safe to check for a BM_VALID flag with
    1428              :              * a relaxed load, because we got a fresh view of it while pinning
    1429              :              * it in the previous call.
    1430              :              *
    1431              :              * On the other hand if we don't see BM_VALID yet, it must be an
    1432              :              * I/O that was split by the previous call and we need to try to
    1433              :              * start a new I/O from this block.  We're also racing against any
    1434              :              * other backend that might start the I/O or even manage to mark
    1435              :              * it BM_VALID after this check, but StartBufferIO() will handle
    1436              :              * those cases.
    1437              :              */
    1438         2230 :             if (BufferIsLocal(buffers[i]))
    1439           16 :                 bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
    1440              :             else
    1441         2214 :                 bufHdr = GetBufferDescriptor(buffers[i] - 1);
    1442              :             Assert(pg_atomic_read_u64(&bufHdr->state) & BM_TAG_VALID);
    1443         2230 :             found = pg_atomic_read_u64(&bufHdr->state) & BM_VALID;
    1444              :         }
    1445              :         else
    1446              :         {
    1447     85250571 :             buffers[i] = PinBufferForBlock(operation->rel,
    1448              :                                            operation->smgr,
    1449     85250579 :                                            operation->persistence,
    1450              :                                            operation->forknum,
    1451              :                                            blockNum + i,
    1452              :                                            operation->strategy,
    1453              :                                            io_object, io_context,
    1454              :                                            &found);
    1455              :         }
    1456              : 
    1457     85252801 :         if (found)
    1458              :         {
    1459              :             /*
    1460              :              * We have a hit.  If it's the first block in the requested range,
    1461              :              * we can return it immediately and report that WaitReadBuffers()
    1462              :              * does not need to be called.  If the initial value of *nblocks
    1463              :              * was larger, the caller will have to call again for the rest.
    1464              :              */
    1465     83600603 :             if (i == 0)
    1466              :             {
    1467     83598385 :                 *nblocks = 1;
    1468              : 
    1469              : #ifdef USE_ASSERT_CHECKING
    1470              : 
    1471              :                 /*
    1472              :                  * Initialize enough of ReadBuffersOperation to make
    1473              :                  * CheckReadBuffersOperation() work. Outside of assertions
    1474              :                  * that's not necessary when no IO is issued.
    1475              :                  */
    1476              :                 operation->buffers = buffers;
    1477              :                 operation->blocknum = blockNum;
    1478              :                 operation->nblocks = 1;
    1479              :                 operation->nblocks_done = 1;
    1480              :                 CheckReadBuffersOperation(operation, true);
    1481              : #endif
    1482     83598385 :                 return false;
    1483              :             }
    1484              : 
    1485              :             /*
    1486              :              * Otherwise we already have an I/O to perform, but this block
    1487              :              * can't be included as it is already valid.  Split the I/O here.
    1488              :              * There may or may not be more blocks requiring I/O after this
    1489              :              * one, we haven't checked, but they can't be contiguous with this
    1490              :              * one in the way.  We'll leave this buffer pinned, forwarding it
    1491              :              * to the next call, avoiding the need to unpin it here and re-pin
    1492              :              * it in the next call.
    1493              :              */
    1494         2218 :             actual_nblocks = i;
    1495         2218 :             break;
    1496              :         }
    1497              :         else
    1498              :         {
    1499              :             /*
    1500              :              * Check how many blocks we can cover with the same IO. The smgr
    1501              :              * implementation might e.g. be limited due to a segment boundary.
    1502              :              */
    1503      1652198 :             if (i == 0 && actual_nblocks > 1)
    1504              :             {
    1505        37828 :                 maxcombine = smgrmaxcombine(operation->smgr,
    1506              :                                             operation->forknum,
    1507              :                                             blockNum);
    1508        37828 :                 if (unlikely(maxcombine < actual_nblocks))
    1509              :                 {
    1510            0 :                     elog(DEBUG2, "limiting nblocks at %u from %u to %u",
    1511              :                          blockNum, actual_nblocks, maxcombine);
    1512            0 :                     actual_nblocks = maxcombine;
    1513              :                 }
    1514              :             }
    1515              :         }
    1516              :     }
    1517      1467897 :     *nblocks = actual_nblocks;
    1518              : 
    1519              :     /* Populate information needed for I/O. */
    1520      1467897 :     operation->buffers = buffers;
    1521      1467897 :     operation->blocknum = blockNum;
    1522      1467897 :     operation->flags = flags;
    1523      1467897 :     operation->nblocks = actual_nblocks;
    1524      1467897 :     operation->nblocks_done = 0;
    1525      1467897 :     pgaio_wref_clear(&operation->io_wref);
    1526              : 
    1527              :     /*
    1528              :      * When using AIO, start the IO in the background. If not, issue prefetch
    1529              :      * requests if desired by the caller.
    1530              :      *
    1531              :      * The reason we have a dedicated path for IOMETHOD_SYNC here is to
    1532              :      * de-risk the introduction of AIO somewhat. It's a large architectural
    1533              :      * change, with lots of chances for unanticipated performance effects.
    1534              :      *
    1535              :      * Use of IOMETHOD_SYNC already leads to not actually performing IO
    1536              :      * asynchronously, but without the check here we'd execute IO earlier than
    1537              :      * we used to. Eventually this IOMETHOD_SYNC specific path should go away.
    1538              :      */
    1539      1467897 :     if (io_method != IOMETHOD_SYNC)
    1540              :     {
    1541              :         /*
    1542              :          * Try to start IO asynchronously. It's possible that no IO needs to
    1543              :          * be started, if another backend already performed the IO.
    1544              :          *
    1545              :          * Note that if an IO is started, it might not cover the entire
    1546              :          * requested range, e.g. because an intermediary block has been read
    1547              :          * in by another backend.  In that case any "trailing" buffers we
    1548              :          * already pinned above will be "forwarded" by read_stream.c to the
    1549              :          * next call to StartReadBuffers().
    1550              :          *
    1551              :          * This is signalled to the caller by decrementing *nblocks *and*
    1552              :          * reducing operation->nblocks. The latter is done here, but not below
    1553              :          * WaitReadBuffers(), as in WaitReadBuffers() we can't "shorten" the
    1554              :          * overall read size anymore, we need to retry until done in its
    1555              :          * entirety or until failed.
    1556              :          */
    1557      1466522 :         did_start_io = AsyncReadBuffers(operation, nblocks);
    1558              : 
    1559      1466507 :         operation->nblocks = *nblocks;
    1560              :     }
    1561              :     else
    1562              :     {
    1563         1375 :         operation->flags |= READ_BUFFERS_SYNCHRONOUSLY;
    1564              : 
    1565         1375 :         if (flags & READ_BUFFERS_ISSUE_ADVICE)
    1566              :         {
    1567              :             /*
    1568              :              * In theory we should only do this if PinBufferForBlock() had to
    1569              :              * allocate new buffers above.  That way, if two calls to
    1570              :              * StartReadBuffers() were made for the same blocks before
    1571              :              * WaitReadBuffers(), only the first would issue the advice.
    1572              :              * That'd be a better simulation of true asynchronous I/O, which
    1573              :              * would only start the I/O once, but isn't done here for
    1574              :              * simplicity.
    1575              :              */
    1576           19 :             smgrprefetch(operation->smgr,
    1577              :                          operation->forknum,
    1578              :                          blockNum,
    1579              :                          actual_nblocks);
    1580              :         }
    1581              : 
    1582              :         /*
    1583              :          * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
    1584              :          * will initiate the necessary IO.
    1585              :          */
    1586         1375 :         did_start_io = true;
    1587              :     }
    1588              : 
    1589      1467882 :     CheckReadBuffersOperation(operation, !did_start_io);
    1590              : 
    1591      1467882 :     return did_start_io;
    1592              : }
    1593              : 
    1594              : /*
    1595              :  * Begin reading a range of blocks beginning at blockNum and extending for
    1596              :  * *nblocks.  *nblocks and the buffers array are in/out parameters.  On entry,
    1597              :  * the buffers elements covered by *nblocks must hold either InvalidBuffer or
    1598              :  * buffers forwarded by an earlier call to StartReadBuffers() that was split
    1599              :  * and is now being continued.  On return, *nblocks holds the number of blocks
    1600              :  * accepted by this operation.  If it is less than the original number then
    1601              :  * this operation has been split, but buffer elements up to the original
    1602              :  * requested size may hold forwarded buffers to be used for a continuing
    1603              :  * operation.  The caller must either start a new I/O beginning at the block
    1604              :  * immediately following the blocks accepted by this call and pass those
    1605              :  * buffers back in, or release them if it chooses not to.  It shouldn't make
    1606              :  * any other use of or assumptions about forwarded buffers.
    1607              :  *
    1608              :  * If false is returned, no I/O is necessary and the buffers covered by
    1609              :  * *nblocks on exit are valid and ready to be accessed.  If true is returned,
    1610              :  * an I/O has been started, and WaitReadBuffers() must be called with the same
    1611              :  * operation object before the buffers covered by *nblocks on exit can be
    1612              :  * accessed.  Along with the operation object, the caller-supplied array of
    1613              :  * buffers must remain valid until WaitReadBuffers() is called, and any
    1614              :  * forwarded buffers must also be preserved for a continuing call unless
    1615              :  * they are explicitly released.
    1616              :  */
    1617              : bool
    1618      2172515 : StartReadBuffers(ReadBuffersOperation *operation,
    1619              :                  Buffer *buffers,
    1620              :                  BlockNumber blockNum,
    1621              :                  int *nblocks,
    1622              :                  int flags)
    1623              : {
    1624      2172515 :     return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
    1625              :                                 true /* expect forwarded buffers */ );
    1626              : }
    1627              : 
    1628              : /*
    1629              :  * Single block version of StartReadBuffers().  This might save a few
    1630              :  * instructions when called from another translation unit, because it is
    1631              :  * specialized for nblocks == 1.
    1632              :  *
    1633              :  * This version does not support "forwarded" buffers: they cannot be created
    1634              :  * by reading only one block and *buffer is ignored on entry.
    1635              :  */
    1636              : bool
    1637     82893775 : StartReadBuffer(ReadBuffersOperation *operation,
    1638              :                 Buffer *buffer,
    1639              :                 BlockNumber blocknum,
    1640              :                 int flags)
    1641              : {
    1642     82893775 :     int         nblocks = 1;
    1643              :     bool        result;
    1644              : 
    1645     82893775 :     result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
    1646              :                                   false /* single block, no forwarding */ );
    1647              :     Assert(nblocks == 1);       /* single block can't be short */
    1648              : 
    1649     82893760 :     return result;
    1650              : }
    1651              : 
    1652              : /*
    1653              :  * Perform sanity checks on the ReadBuffersOperation.
    1654              :  */
    1655              : static void
    1656      4403419 : CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
    1657              : {
    1658              : #ifdef USE_ASSERT_CHECKING
    1659              :     Assert(operation->nblocks_done <= operation->nblocks);
    1660              :     Assert(!is_complete || operation->nblocks == operation->nblocks_done);
    1661              : 
    1662              :     for (int i = 0; i < operation->nblocks; i++)
    1663              :     {
    1664              :         Buffer      buffer = operation->buffers[i];
    1665              :         BufferDesc *buf_hdr = BufferIsLocal(buffer) ?
    1666              :             GetLocalBufferDescriptor(-buffer - 1) :
    1667              :             GetBufferDescriptor(buffer - 1);
    1668              : 
    1669              :         Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
    1670              :         Assert(pg_atomic_read_u64(&buf_hdr->state) & BM_TAG_VALID);
    1671              : 
    1672              :         if (i < operation->nblocks_done)
    1673              :             Assert(pg_atomic_read_u64(&buf_hdr->state) & BM_VALID);
    1674              :     }
    1675              : #endif
    1676      4403419 : }
    1677              : 
    1678              : /*
    1679              :  * We track various stats related to buffer hits. Because this is done in a
    1680              :  * few separate places, this helper exists for convenience.
    1681              :  */
    1682              : static pg_attribute_always_inline void
    1683     83639815 : TrackBufferHit(IOObject io_object, IOContext io_context,
    1684              :                Relation rel, char persistence, SMgrRelation smgr,
    1685              :                ForkNumber forknum, BlockNumber blocknum)
    1686              : {
    1687              :     TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum,
    1688              :                                       blocknum,
    1689              :                                       smgr->smgr_rlocator.locator.spcOid,
    1690              :                                       smgr->smgr_rlocator.locator.dbOid,
    1691              :                                       smgr->smgr_rlocator.locator.relNumber,
    1692              :                                       smgr->smgr_rlocator.backend,
    1693              :                                       true);
    1694              : 
    1695     83639815 :     if (persistence == RELPERSISTENCE_TEMP)
    1696      1637432 :         pgBufferUsage.local_blks_hit += 1;
    1697              :     else
    1698     82002383 :         pgBufferUsage.shared_blks_hit += 1;
    1699              : 
    1700     83639815 :     pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
    1701              : 
    1702     83639815 :     if (VacuumCostActive)
    1703      3041845 :         VacuumCostBalance += VacuumCostPageHit;
    1704              : 
    1705     83639815 :     if (rel)
    1706     77929290 :         pgstat_count_buffer_hit(rel);
    1707     83639815 : }
    1708              : 
    1709              : /*
    1710              :  * Helper for WaitReadBuffers() that processes the results of a readv
    1711              :  * operation, raising an error if necessary.
    1712              :  */
    1713              : static void
    1714      1464437 : ProcessReadBuffersResult(ReadBuffersOperation *operation)
    1715              : {
    1716      1464437 :     PgAioReturn *aio_ret = &operation->io_return;
    1717      1464437 :     PgAioResultStatus rs = aio_ret->result.status;
    1718      1464437 :     int         newly_read_blocks = 0;
    1719              : 
    1720              :     Assert(pgaio_wref_valid(&operation->io_wref));
    1721              :     Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
    1722              : 
    1723              :     /*
    1724              :      * SMGR reports the number of blocks successfully read as the result of
    1725              :      * the IO operation. Thus we can simply add that to ->nblocks_done.
    1726              :      */
    1727              : 
    1728      1464437 :     if (likely(rs != PGAIO_RS_ERROR))
    1729      1464408 :         newly_read_blocks = aio_ret->result.result;
    1730              : 
    1731      1464437 :     if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
    1732           46 :         pgaio_result_report(aio_ret->result, &aio_ret->target_data,
    1733              :                             rs == PGAIO_RS_ERROR ? ERROR : WARNING);
    1734      1464391 :     else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
    1735              :     {
    1736              :         /*
    1737              :          * We'll retry, so we just emit a debug message to the server log (or
    1738              :          * not even that in prod scenarios).
    1739              :          */
    1740          109 :         pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
    1741          109 :         elog(DEBUG3, "partial read, will retry");
    1742              :     }
    1743              : 
    1744              :     Assert(newly_read_blocks > 0);
    1745              :     Assert(newly_read_blocks <= MAX_IO_COMBINE_LIMIT);
    1746              : 
    1747      1464408 :     operation->nblocks_done += newly_read_blocks;
    1748              : 
    1749              :     Assert(operation->nblocks_done <= operation->nblocks);
    1750      1464408 : }
    1751              : 
    1752              : /*
    1753              :  * Wait for the IO operation initiated by StartReadBuffers() et al to
    1754              :  * complete.
    1755              :  *
    1756              :  * Returns true if we needed to wait for the IO operation, false otherwise.
    1757              :  */
    1758              : bool
    1759      1467039 : WaitReadBuffers(ReadBuffersOperation *operation)
    1760              : {
    1761      1467039 :     PgAioReturn *aio_ret = &operation->io_return;
    1762              :     IOContext   io_context;
    1763              :     IOObject    io_object;
    1764      1467039 :     bool        needed_wait = false;
    1765              : 
    1766      1467039 :     if (operation->persistence == RELPERSISTENCE_TEMP)
    1767              :     {
    1768         2425 :         io_context = IOCONTEXT_NORMAL;
    1769         2425 :         io_object = IOOBJECT_TEMP_RELATION;
    1770              :     }
    1771              :     else
    1772              :     {
    1773      1464614 :         io_context = IOContextForStrategy(operation->strategy);
    1774      1464614 :         io_object = IOOBJECT_RELATION;
    1775              :     }
    1776              : 
    1777              :     /*
    1778              :      * If we get here without an IO operation having been issued, the
    1779              :      * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
    1780              :      * caller should not have called WaitReadBuffers().
    1781              :      *
    1782              :      * In the case of IOMETHOD_SYNC, we start - as we used to before the
    1783              :      * introduction of AIO - the IO in WaitReadBuffers(). This is done as part
    1784              :      * of the retry logic below, no extra code is required.
    1785              :      *
    1786              :      * This path is expected to eventually go away.
    1787              :      */
    1788      1467039 :     if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
    1789            0 :         elog(ERROR, "waiting for read operation that didn't read");
    1790              : 
    1791              :     /*
    1792              :      * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
    1793              :      * done. We may need multiple retries, not just because we could get
    1794              :      * multiple partial reads, but also because some of the remaining
    1795              :      * to-be-read buffers may have been read in by other backends, limiting
    1796              :      * the IO size.
    1797              :      */
    1798              :     while (true)
    1799         1489 :     {
    1800              :         int         ignored_nblocks_progress;
    1801              : 
    1802      1468528 :         CheckReadBuffersOperation(operation, false);
    1803              : 
    1804              :         /*
    1805              :          * If there is an IO associated with the operation, we may need to
    1806              :          * wait for it.
    1807              :          */
    1808      1468528 :         if (pgaio_wref_valid(&operation->io_wref))
    1809              :         {
    1810              :             /*
    1811              :              * Track the time spent waiting for the IO to complete. As
    1812              :              * tracking a wait even if we don't actually need to wait
    1813              :              *
    1814              :              * a) is not cheap, due to the timestamping overhead
    1815              :              *
    1816              :              * b) reports some time as waiting, even if we never waited
    1817              :              *
    1818              :              * we first check if we already know the IO is complete.
    1819              :              *
    1820              :              * Note that operation->io_return is uninitialized for foreign IO,
    1821              :              * so we cannot use the cheaper PGAIO_RS_UNKNOWN pre-check.
    1822              :              */
    1823      1467144 :             if ((operation->foreign_io || aio_ret->result.status == PGAIO_RS_UNKNOWN) &&
    1824       675165 :                 !pgaio_wref_check_done(&operation->io_wref))
    1825              :             {
    1826       331657 :                 instr_time  io_start = pgstat_prepare_io_time(track_io_timing);
    1827              : 
    1828       331657 :                 pgaio_wref_wait(&operation->io_wref);
    1829       331656 :                 needed_wait = true;
    1830              : 
    1831              :                 /*
    1832              :                  * The IO operation itself was already counted earlier, in
    1833              :                  * AsyncReadBuffers(), this just accounts for the wait time.
    1834              :                  */
    1835       331656 :                 pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
    1836              :                                         io_start, 0, 0);
    1837              :             }
    1838              :             else
    1839              :             {
    1840              :                 Assert(pgaio_wref_check_done(&operation->io_wref));
    1841              :             }
    1842              : 
    1843      1467143 :             if (unlikely(operation->foreign_io))
    1844              :             {
    1845         2706 :                 Buffer      buffer = operation->buffers[operation->nblocks_done];
    1846         2706 :                 BufferDesc *desc = BufferIsLocal(buffer) ?
    1847         2706 :                     GetLocalBufferDescriptor(-buffer - 1) :
    1848         2706 :                     GetBufferDescriptor(buffer - 1);
    1849         2706 :                 uint64      buf_state = pg_atomic_read_u64(&desc->state);
    1850              : 
    1851         2706 :                 if (buf_state & BM_VALID)
    1852              :                 {
    1853         2704 :                     BlockNumber blocknum = operation->blocknum + operation->nblocks_done;
    1854              : 
    1855         2704 :                     operation->nblocks_done += 1;
    1856              :                     Assert(operation->nblocks_done <= operation->nblocks);
    1857              : 
    1858              :                     /*
    1859              :                      * Track this as a 'hit' for this backend. The backend
    1860              :                      * performing the IO will track it as a 'read'.
    1861              :                      */
    1862         2704 :                     TrackBufferHit(io_object, io_context,
    1863         2704 :                                    operation->rel, operation->persistence,
    1864              :                                    operation->smgr, operation->forknum,
    1865              :                                    blocknum);
    1866              :                 }
    1867              : 
    1868              :                 /*
    1869              :                  * If the foreign IO failed and left the buffer invalid,
    1870              :                  * nblocks_done is not incremented. The retry loop below will
    1871              :                  * call AsyncReadBuffers() which will attempt the IO itself.
    1872              :                  */
    1873              :             }
    1874              :             else
    1875              :             {
    1876              :                 /*
    1877              :                  * We now are sure the IO completed. Check the results. This
    1878              :                  * includes reporting on errors if there were any.
    1879              :                  */
    1880      1464437 :                 ProcessReadBuffersResult(operation);
    1881              :             }
    1882              :         }
    1883              : 
    1884              :         /*
    1885              :          * Most of the time, the one IO we already started will read in
    1886              :          * everything.  But we need to deal with partial reads and buffers not
    1887              :          * needing IO anymore.
    1888              :          */
    1889      1468498 :         if (operation->nblocks_done == operation->nblocks)
    1890      1467009 :             break;
    1891              : 
    1892         1489 :         CHECK_FOR_INTERRUPTS();
    1893              : 
    1894              :         /*
    1895              :          * If the IO completed only partially, we need to perform additional
    1896              :          * work; consider that a form of having had to wait.
    1897              :          */
    1898         1489 :         needed_wait = true;
    1899              : 
    1900              :         /*
    1901              :          * This may only complete the IO partially, either because some
    1902              :          * buffers were already valid, or because of a partial read.
    1903              :          *
    1904              :          * NB: In contrast to after the AsyncReadBuffers() call in
    1905              :          * StartReadBuffers(), we do *not* reduce
    1906              :          * ReadBuffersOperation->nblocks here, callers expect the full
    1907              :          * operation to be completed at this point (as more operations may
    1908              :          * have been queued).
    1909              :          */
    1910         1489 :         AsyncReadBuffers(operation, &ignored_nblocks_progress);
    1911              :     }
    1912              : 
    1913      1467009 :     CheckReadBuffersOperation(operation, true);
    1914              : 
    1915              :     /* NB: READ_DONE tracepoint was already executed in completion callback */
    1916      1467009 :     return needed_wait;
    1917              : }
    1918              : 
    1919              : /*
    1920              :  * Initiate IO for the ReadBuffersOperation
    1921              :  *
    1922              :  * This function only starts a single IO at a time. The IO may cover fewer
    1923              :  * blocks than remain to be read, if one of the buffers has
    1924              :  * concurrently been read in. If the first to-be-read buffer is already valid,
    1925              :  * no IO will be issued.
    1926              :  *
    1927              :  * To support retries after partial reads, the first operation->nblocks_done
    1928              :  * buffers are skipped.
    1929              :  *
    1930              :  * On return *nblocks_progress is updated to reflect the number of buffers
    1931              :  * affected by the call. If the first buffer is valid, *nblocks_progress is
    1932              :  * set to 1 and operation->nblocks_done is incremented.
    1933              :  *
    1934              :  * Returns true if IO was initiated or is already in progress (foreign IO),
    1935              :  * false if the buffer was already valid.
    1936              :  */
    1937              : static bool
    1938      1468011 : AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
    1939              : {
    1940      1468011 :     Buffer     *buffers = &operation->buffers[0];
    1941      1468011 :     int         flags = operation->flags;
    1942      1468011 :     ForkNumber  forknum = operation->forknum;
    1943      1468011 :     char        persistence = operation->persistence;
    1944      1468011 :     int16       nblocks_done = operation->nblocks_done;
    1945      1468011 :     BlockNumber blocknum = operation->blocknum + nblocks_done;
    1946      1468011 :     Buffer     *io_buffers = &operation->buffers[nblocks_done];
    1947      1468011 :     int         io_buffers_len = 0;
    1948              :     PgAioHandle *ioh;
    1949      1468011 :     uint32      ioh_flags = 0;
    1950              :     void       *io_pages[MAX_IO_COMBINE_LIMIT];
    1951              :     IOContext   io_context;
    1952              :     IOObject    io_object;
    1953              :     instr_time  io_start;
    1954              :     StartBufferIOResult status;
    1955              : 
    1956      1468011 :     if (persistence == RELPERSISTENCE_TEMP)
    1957              :     {
    1958         2817 :         io_context = IOCONTEXT_NORMAL;
    1959         2817 :         io_object = IOOBJECT_TEMP_RELATION;
    1960              :     }
    1961              :     else
    1962              :     {
    1963      1465194 :         io_context = IOContextForStrategy(operation->strategy);
    1964      1465194 :         io_object = IOOBJECT_RELATION;
    1965              :     }
    1966              : 
    1967              :     /*
    1968              :      * When this IO is executed synchronously, either because the caller will
    1969              :      * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
    1970              :      * the AIO subsystem needs to know.
    1971              :      */
    1972      1468011 :     if (flags & READ_BUFFERS_SYNCHRONOUSLY)
    1973       783871 :         ioh_flags |= PGAIO_HF_SYNCHRONOUS;
    1974              : 
    1975      1468011 :     if (persistence == RELPERSISTENCE_TEMP)
    1976         2817 :         ioh_flags |= PGAIO_HF_REFERENCES_LOCAL;
    1977              : 
    1978              :     /*
    1979              :      * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
    1980              :      * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
    1981              :      * set globally, but on a per-session basis. The completion callback,
    1982              :      * which may be run in other processes, e.g. in IO workers, may have a
    1983              :      * different value of the zero_damaged_pages GUC.
    1984              :      *
    1985              :      * XXX: We probably should eventually use a different flag for
    1986              :      * zero_damaged_pages, so we can report different log levels / error codes
    1987              :      * for zero_damaged_pages and ZERO_ON_ERROR.
    1988              :      */
    1989      1468011 :     if (zero_damaged_pages)
    1990           16 :         flags |= READ_BUFFERS_ZERO_ON_ERROR;
    1991              : 
    1992              :     /*
    1993              :      * For the same reason as with zero_damaged_pages we need to use this
    1994              :      * backend's ignore_checksum_failure value.
    1995              :      */
    1996      1468011 :     if (ignore_checksum_failure)
    1997            8 :         flags |= READ_BUFFERS_IGNORE_CHECKSUM_FAILURES;
    1998              : 
    1999              : 
    2000              :     /*
    2001              :      * To be allowed to report stats in the local completion callback we need
    2002              :      * to prepare to report stats now. This ensures we can safely report the
    2003              :      * checksum failure even in a critical section.
    2004              :      */
    2005      1468011 :     pgstat_prepare_report_checksum_failure(operation->smgr->smgr_rlocator.locator.dbOid);
    2006              : 
    2007              :     /*
    2008              :      * We must get an IO handle before StartBufferIO(), as pgaio_io_acquire()
    2009              :      * might block, which we don't want after setting IO_IN_PROGRESS. If we
    2010              :      * don't need to do the IO, we'll release the handle.
    2011              :      *
    2012              :      * If we need to wait for IO before we can get a handle, submit
    2013              :      * already-staged IO first, so that other backends don't need to wait.
    2014              :      * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
    2015              :      * wait for already submitted IO, which doesn't require additional locks,
    2016              :      * but it could still cause undesirable waits.
    2017              :      *
    2018              :      * A secondary benefit is that this would allow us to measure the time in
    2019              :      * pgaio_io_acquire() without causing undue timer overhead in the common,
    2020              :      * non-blocking, case.  However, currently the pgstats infrastructure
    2021              :      * doesn't really allow that, as it a) asserts that an operation can't
    2022              :      * have time without operations b) doesn't have an API to report
    2023              :      * "accumulated" time.
    2024              :      */
    2025      1468011 :     ioh = pgaio_io_acquire_nb(CurrentResourceOwner, &operation->io_return);
    2026      1468011 :     if (unlikely(!ioh))
    2027              :     {
    2028         3289 :         pgaio_submit_staged();
    2029         3289 :         ioh = pgaio_io_acquire(CurrentResourceOwner, &operation->io_return);
    2030              :     }
    2031              : 
    2032      1468011 :     operation->foreign_io = false;
    2033      1468011 :     pgaio_wref_clear(&operation->io_wref);
    2034              : 
    2035              :     /*
    2036              :      * Try to start IO on the first buffer in a new run of blocks. If AIO is
    2037              :      * in progress, be it in this backend or another backend, we just
    2038              :      * associate the wait reference with the operation and wait in
    2039              :      * WaitReadBuffers(). This turns out to be important for performance in
    2040              :      * two workloads:
    2041              :      *
    2042              :      * 1) A read stream that has to read the same block multiple times within
    2043              :      * the readahead distance. This can happen e.g. for the table accesses of
    2044              :      * an index scan.
    2045              :      *
    2046              :      * 2) Concurrent scans by multiple backends on the same relation.
    2047              :      *
    2048              :      * If we were to synchronously wait for the in-progress IO, we'd not be
    2049              :      * able to keep enough I/O in flight.
    2050              :      *
    2051              :      * If we do find there is ongoing I/O for the buffer, we set up a 1-block
    2052              :      * ReadBuffersOperation that WaitReadBuffers then can wait on.
    2053              :      *
    2054              :      * It's possible that another backend has started IO on the buffer but not
    2055              :      * yet set its wait reference. In this case, we have no choice but to wait
    2056              :      * for either the wait reference to be valid or the IO to be done.
    2057              :      */
    2058      1468011 :     status = StartBufferIO(buffers[nblocks_done], true, true,
    2059              :                            &operation->io_wref);
    2060      1468011 :     if (status != BUFFER_IO_READY_FOR_IO)
    2061              :     {
    2062         3163 :         pgaio_io_release(ioh);
    2063         3163 :         *nblocks_progress = 1;
    2064         3163 :         if (status == BUFFER_IO_ALREADY_DONE)
    2065              :         {
    2066              :             /*
    2067              :              * Someone has already completed this block, we're done.
    2068              :              *
    2069              :              * When IO is necessary, ->nblocks_done is updated in
    2070              :              * ProcessReadBuffersResult(), but that is not called if no IO is
    2071              :              * necessary. Thus update here.
    2072              :              */
    2073          457 :             operation->nblocks_done += 1;
    2074              :             Assert(operation->nblocks_done <= operation->nblocks);
    2075              : 
    2076              :             Assert(!pgaio_wref_valid(&operation->io_wref));
    2077              : 
    2078              :             /*
    2079              :              * Report and track this as a 'hit' for this backend, even though
    2080              :              * it must have started out as a miss in PinBufferForBlock(). The
    2081              :              * other backend will track this as a 'read'.
    2082              :              */
    2083          457 :             TrackBufferHit(io_object, io_context,
    2084          457 :                            operation->rel, operation->persistence,
    2085              :                            operation->smgr, operation->forknum,
    2086              :                            blocknum);
    2087          457 :             return false;
    2088              :         }
    2089              : 
    2090              :         /* The IO is already in-progress */
    2091              :         Assert(status == BUFFER_IO_IN_PROGRESS);
    2092              :         Assert(pgaio_wref_valid(&operation->io_wref));
    2093         2706 :         operation->foreign_io = true;
    2094              : 
    2095         2706 :         return true;
    2096              :     }
    2097              : 
    2098              :     Assert(io_buffers[0] == buffers[nblocks_done]);
    2099      1464848 :     io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
    2100      1464848 :     io_buffers_len = 1;
    2101              : 
    2102              :     /*
    2103              :      * NB: As little code as possible should be added between the
    2104              :      * StartBufferIO() above, the further StartBufferIO()s below and the
    2105              :      * smgrstartreadv(), as some of the buffers are now marked as
    2106              :      * IO_IN_PROGRESS and will thus cause other backends to wait.
    2107              :      */
    2108              : 
    2109              :     /*
    2110              :      * How many neighboring-on-disk blocks can we scatter-read into other
    2111              :      * buffers at the same time?  In this case we don't wait if we see an I/O
    2112              :      * already in progress (see comment above).
    2113              :      */
    2114      1649782 :     for (int i = nblocks_done + 1; i < operation->nblocks; i++)
    2115              :     {
    2116              :         /* Must be consecutive block numbers. */
    2117              :         Assert(BufferGetBlockNumber(buffers[i - 1]) ==
    2118              :                BufferGetBlockNumber(buffers[i]) - 1);
    2119              : 
    2120       184938 :         status = StartBufferIO(buffers[i], true, false, NULL);
    2121       184938 :         if (status != BUFFER_IO_READY_FOR_IO)
    2122            4 :             break;
    2123              : 
    2124              :         Assert(io_buffers[io_buffers_len] == buffers[i]);
    2125              : 
    2126       184934 :         io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
    2127              :     }
    2128              : 
    2129              :     /* get a reference to wait for in WaitReadBuffers() */
    2130      1464848 :     pgaio_io_get_wref(ioh, &operation->io_wref);
    2131              : 
    2132              :     /* provide the list of buffers to the completion callbacks */
    2133      1464848 :     pgaio_io_set_handle_data_32(ioh, (uint32 *) io_buffers, io_buffers_len);
    2134              : 
    2135      1464848 :     pgaio_io_register_callbacks(ioh,
    2136              :                                 persistence == RELPERSISTENCE_TEMP ?
    2137              :                                 PGAIO_HCB_LOCAL_BUFFER_READV :
    2138              :                                 PGAIO_HCB_SHARED_BUFFER_READV,
    2139              :                                 flags);
    2140              : 
    2141      1464848 :     pgaio_io_set_flag(ioh, ioh_flags);
    2142              : 
    2143              :     /* ---
    2144              :      * Even though we're trying to issue IO asynchronously, track the time
    2145              :      * in smgrstartreadv():
    2146              :      * - if io_method == IOMETHOD_SYNC, we will always perform the IO
    2147              :      *   immediately
    2148              :      * - the io method might not support the IO (e.g. worker IO for a temp
    2149              :      *   table)
    2150              :      * ---
    2151              :      */
    2152      1464848 :     io_start = pgstat_prepare_io_time(track_io_timing);
    2153      1464848 :     smgrstartreadv(ioh, operation->smgr, forknum,
    2154              :                    blocknum,
    2155              :                    io_pages, io_buffers_len);
    2156      1464833 :     pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
    2157      1464833 :                             io_start, 1, io_buffers_len * BLCKSZ);
    2158              : 
    2159      1464833 :     if (persistence == RELPERSISTENCE_TEMP)
    2160         2815 :         pgBufferUsage.local_blks_read += io_buffers_len;
    2161              :     else
    2162      1462018 :         pgBufferUsage.shared_blks_read += io_buffers_len;
    2163              : 
    2164              :     /*
    2165              :      * Track vacuum cost when issuing IO, not after waiting for it. Otherwise
    2166              :      * we could end up issuing a lot of IO in a short timespan, despite a low
    2167              :      * cost limit.
    2168              :      */
    2169      1464833 :     if (VacuumCostActive)
    2170        20582 :         VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;
    2171              : 
    2172      1464833 :     *nblocks_progress = io_buffers_len;
    2173              : 
    2174      1464833 :     return true;
    2175              : }
    2176              : 
    2177              : /*
    2178              :  * BufferAlloc -- subroutine for PinBufferForBlock.  Handles lookup of a shared
    2179              :  *      buffer.  If no buffer exists already, selects a replacement victim and
    2180              :  *      evicts the old page, but does NOT read in new page.
    2181              :  *
    2182              :  * "strategy" can be a buffer replacement strategy object, or NULL for
    2183              :  * the default strategy.  The selected buffer's usage_count is advanced when
    2184              :  * using the default strategy, but otherwise possibly not (see PinBuffer).
    2185              :  *
    2186              :  * The returned buffer is pinned and is already marked as holding the
    2187              :  * desired page.  If it already did have the desired page, *foundPtr is
    2188              :  * set true.  Otherwise, *foundPtr is set false.
    2189              :  *
    2190              :  * io_context is supplied by the caller and is only passed on to
    2191              :  * GetVictimBuffer(), so that IO done while claiming a victim buffer is
    2192              :  * attributed to that context.
    2193              :  *
    2194              :  * No locks are held either at entry or exit.
    2195              :  */
    2196              : static pg_attribute_always_inline BufferDesc *
    2197     83957802 : BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
    2198              :             BlockNumber blockNum,
    2199              :             BufferAccessStrategy strategy,
    2200              :             bool *foundPtr, IOContext io_context)
    2201              : {
    2202              :     BufferTag   newTag;         /* identity of requested block */
    2203              :     uint32      newHash;        /* hash value for newTag */
    2204              :     LWLock     *newPartitionLock;   /* buffer partition lock for it */
    2205              :     int         existing_buf_id;
    2206              :     Buffer      victim_buffer;
    2207              :     BufferDesc *victim_buf_hdr;
    2208              :     uint64      victim_buf_state;
    2209     83957802 :     uint64      set_bits = 0;   /* bits passed to UnlockBufHdrExt() for the victim */
    2210              : 
    2211              :     /* Make sure we will have room to remember the buffer pin */
    2212     83957802 :     ResourceOwnerEnlarge(CurrentResourceOwner);
    2213     83957802 :     ReservePrivateRefCountEntry();
    2214              : 
    2215              :     /* create a tag so we can lookup the buffer */
    2216     83957802 :     InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
    2217              : 
    2218              :     /* determine its hash code and partition lock ID */
    2219     83957802 :     newHash = BufTableHashCode(&newTag);
    2220     83957802 :     newPartitionLock = BufMappingPartitionLock(newHash);
    2221              : 
    2222              :     /* see if the block is in the buffer pool already */
    2223     83957802 :     LWLockAcquire(newPartitionLock, LW_SHARED);
    2224     83957802 :     existing_buf_id = BufTableLookup(&newTag, newHash);
    2225     83957802 :     if (existing_buf_id >= 0)
    2226              :     {
    2227              :         BufferDesc *buf;
    2228              :         bool        valid;
    2229              : 
    2230              :         /*
    2231              :          * Found it.  Now, pin the buffer so no one can steal it from the
    2232              :          * buffer pool, and check to see if the correct data has been loaded
    2233              :          * into the buffer.
    2234              :          */
    2235     82001586 :         buf = GetBufferDescriptor(existing_buf_id);
    2236              : 
    2237     82001586 :         valid = PinBuffer(buf, strategy, false);
    2238              : 
    2239              :         /* Can release the mapping lock as soon as we've pinned it */
    2240     82001586 :         LWLockRelease(newPartitionLock);
    2241              : 
    2242     82001586 :         *foundPtr = true;
    2243              : 
    2244     82001586 :         if (!valid)
    2245              :         {
    2246              :             /*
    2247              :              * We can only get here if (a) someone else is still reading in
    2248              :              * the page, (b) a previous read attempt failed, or (c) someone
    2249              :              * called StartReadBuffers() but not yet WaitReadBuffers().
    2250              :              */
    2251         2651 :             *foundPtr = false;
    2252              :         }
    2253              : 
    2254     82001586 :         return buf;
    2255              :     }
    2256              : 
    2257              :     /*
    2258              :      * Didn't find it in the buffer pool.  We'll have to initialize a new
    2259              :      * buffer.  Remember to unlock the mapping lock while doing the work.
    2260              :      */
    2261      1956216 :     LWLockRelease(newPartitionLock);
    2262              : 
    2263              :     /*
    2264              :      * Acquire a victim buffer. Somebody else might try to do the same, as we
    2265              :      * don't hold any conflicting locks. If so we'll have to undo our work
    2266              :      * later.
    2267              :      */
    2268      1956216 :     victim_buffer = GetVictimBuffer(strategy, io_context);
    2269      1956216 :     victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
    2270              : 
    2271              :     /*
    2272              :      * Try to make a hashtable entry for the buffer under its new tag. If
    2273              :      * somebody else inserted another buffer for the tag, we'll release the
    2274              :      * victim buffer we acquired and use the already inserted one.
    2275              :      */
    2276      1956216 :     LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
    2277      1956216 :     existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
    2278      1956216 :     if (existing_buf_id >= 0)
    2279              :     {
    2280              :         BufferDesc *existing_buf_hdr;
    2281              :         bool        valid;
    2282              : 
    2283              :         /*
    2284              :          * Got a collision. Someone has already done what we were about to do.
    2285              :          * We'll just handle this as if it were found in the buffer pool in
    2286              :          * the first place.  First, give up the buffer we were planning to
    2287              :          * use.
    2288              :          *
    2289              :          * We could do this after releasing the partition lock, but then we'd
    2290              :          * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
    2291              :          * before acquiring the lock, for the rare case of such a collision.
    2292              :          */
    2293          849 :         UnpinBuffer(victim_buf_hdr);
    2294              : 
    2295              :         /* remaining code should match code at top of routine */
    2296              : 
    2297          849 :         existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
    2298              : 
    2299          849 :         valid = PinBuffer(existing_buf_hdr, strategy, false);
    2300              : 
    2301              :         /* Can release the mapping lock as soon as we've pinned it */
    2302          849 :         LWLockRelease(newPartitionLock);
    2303              : 
    2304          849 :         *foundPtr = true;
    2305              : 
    2306          849 :         if (!valid)
    2307              :         {
    2308              :             /*
    2309              :              * We can only get here if (a) someone else is still reading in
    2310              :              * the page, (b) a previous read attempt failed, or (c) someone
    2311              :              * called StartReadBuffers() but not yet WaitReadBuffers().
    2312              :              */
    2313          560 :             *foundPtr = false;
    2314              :         }
    2315              : 
    2316          849 :         return existing_buf_hdr;
    2317              :     }
    2318              : 
    2319              :     /*
    2320              :      * Need to lock the buffer header too in order to change its tag.
    2321              :      */
    2322      1955367 :     victim_buf_state = LockBufHdr(victim_buf_hdr);
    2323              : 
    2324              :     /* some sanity checks while we hold the buffer header lock */
    2325              :     Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
    2326              :     Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
    2327              : 
    2328      1955367 :     victim_buf_hdr->tag = newTag;
    2329              : 
    2330              :     /*
    2331              :      * Make sure BM_PERMANENT is set for buffers that must be written at every
    2332              :      * checkpoint.  Unlogged buffers only need to be written at shutdown
    2333              :      * checkpoints, except for their "init" forks, which need to be treated
    2334              :      * just like permanent relations.
    2335              :      */
    2336      1955367 :     set_bits |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
    2337      1955367 :     if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
    2338      1954986 :         set_bits |= BM_PERMANENT;
    2339              : 
    2340      1955367 :     UnlockBufHdrExt(victim_buf_hdr, victim_buf_state,
    2341              :                     set_bits, 0, 0);
    2342              : 
    2343      1955367 :     LWLockRelease(newPartitionLock);
    2344              : 
    2345              :     /*
    2346              :      * Buffer contents are currently invalid.
    2347              :      */
    2348      1955367 :     *foundPtr = false;
    2349              : 
    2350      1955367 :     return victim_buf_hdr;
    2351              : }
    2352              : 
    2353              : /*
    2354              :  * InvalidateBuffer -- mark a shared buffer invalid.
    2355              :  *
    2356              :  * The buffer header spinlock must be held at entry.  We drop it before
    2357              :  * returning.  (This is sane because the caller must have locked the
    2358              :  * buffer in order to be sure it should be dropped.)
    2359              :  *
    2360              :  * This is used only in contexts such as dropping a relation.  We assume
    2361              :  * that no other backend could possibly be interested in using the page,
    2362              :  * so the only reason the buffer might be pinned is if someone else is
    2363              :  * trying to write it out.  We have to let them finish before we can
    2364              :  * reclaim the buffer.
    2365              :  *
    2366              :  * The buffer could get reclaimed by someone else while we are waiting
    2367              :  * to acquire the necessary locks; if so, don't mess it up.
    2368              :  */
    2369              : static void
    2370       127003 : InvalidateBuffer(BufferDesc *buf)
    2371              : {
    2372              :     BufferTag   oldTag;
    2373              :     uint32      oldHash;        /* hash value for oldTag */
    2374              :     LWLock     *oldPartitionLock;   /* buffer partition lock for it */
    2375              :     uint32      oldFlags;       /* XXX narrower than uint64 buf_state; assumes flag bits fit -- confirm */
    2376              :     uint64      buf_state;
    2377              : 
    2378              :     /* Save the original buffer tag before dropping the spinlock */
    2379       127003 :     oldTag = buf->tag;
    2380              : 
    2381       127003 :     UnlockBufHdr(buf);
    2382              : 
    2383              :     /*
    2384              :      * Need to compute the old tag's hashcode and partition lock ID. XXX is it
    2385              :      * worth storing the hashcode in BufferDesc so we need not recompute it
    2386              :      * here?  Probably not.
    2387              :      */
    2388       127003 :     oldHash = BufTableHashCode(&oldTag);
    2389       127003 :     oldPartitionLock = BufMappingPartitionLock(oldHash);
    2390              : 
    2391       127004 : retry:
    2392              : 
    2393              :     /*
    2394              :      * Acquire exclusive mapping lock in preparation for changing the buffer's
    2395              :      * association.
    2396              :      */
    2397       127004 :     LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
    2398              : 
    2399              :     /* Re-lock the buffer header */
    2400       127004 :     buf_state = LockBufHdr(buf);
    2401              : 
    2402              :     /* If it's changed while we were waiting for lock, do nothing */
    2403       127004 :     if (!BufferTagsEqual(&buf->tag, &oldTag))
    2404              :     {
    2405            1 :         UnlockBufHdr(buf);
    2406            1 :         LWLockRelease(oldPartitionLock);
    2407            1 :         return;
    2408              :     }
    2409              : 
    2410              :     /*
    2411              :      * We assume the reason for it to be pinned is that either we were
    2412              :      * asynchronously reading the page in before erroring out or someone else
    2413              :      * is flushing the page out.  Wait for the IO to finish.  (This could be
    2414              :      * an infinite loop if the refcount is messed up... it would be nice to
    2415              :      * time out after a while, but there seems no way to be sure how many loops
    2416              :      * may be needed.  Note that if the other guy has pinned the buffer but
    2417              :      * not yet done StartBufferIO, WaitIO will fall through and we'll
    2418              :      * effectively be busy-looping here.)
    2419              :      */
    2420       127003 :     if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
    2421              :     {
    2422            1 :         UnlockBufHdr(buf);
    2423            1 :         LWLockRelease(oldPartitionLock);
    2424              :         /* safety check: should definitely not be our *own* pin */
    2425            1 :         if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
    2426            0 :             elog(ERROR, "buffer is pinned in InvalidateBuffer");
    2427            1 :         WaitIO(buf);
    2428            1 :         goto retry;
    2429              :     }
    2430              : 
    2431              :     /*
    2432              :      * An invalidated buffer should not have any backends waiting to lock the
    2433              :      * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
    2434              :      */
    2435              :     Assert(!(buf_state & BM_LOCK_WAKE_IN_PROGRESS));
    2436              : 
    2437              :     /*
    2438              :      * Clear out the buffer's tag and flags.  We must do this to ensure that
    2439              :      * linear scans of the buffer array don't think the buffer is valid.
    2440              :      */
    2441       127002 :     oldFlags = buf_state & BUF_FLAG_MASK;
    2442       127002 :     ClearBufferTag(&buf->tag);
    2443              : 
    2444       127002 :     UnlockBufHdrExt(buf, buf_state,
    2445              :                     0,
    2446              :                     BUF_FLAG_MASK | BUF_USAGECOUNT_MASK,
    2447              :                     0);
    2448              : 
    2449              :     /*
    2450              :      * Remove the buffer from the lookup hashtable, if it was in there.
    2451              :      */
    2452       127002 :     if (oldFlags & BM_TAG_VALID)
    2453       127002 :         BufTableDelete(&oldTag, oldHash);
    2454              : 
    2455              :     /*
    2456              :      * Done with mapping lock.
    2457              :      */
    2458       127002 :     LWLockRelease(oldPartitionLock);
    2459              : }
    2460              : 
    2461              : /*
    2462              :  * Helper routine for GetVictimBuffer()
    2463              :  *
    2464              :  * Needs to be called on a buffer with a valid tag, pinned, but without the
    2465              :  * buffer header spinlock held.
    2466              :  *
    2467              :  * Returns true if the buffer can be reused, in which case the buffer is only
    2468              :  * pinned by this backend and marked as invalid, false otherwise.
    2469              :  */
    2470              : static bool
    2471      1394829 : InvalidateVictimBuffer(BufferDesc *buf_hdr)
    2472              : {
    2473              :     uint64      buf_state;
    2474              :     uint32      hash;
    2475              :     LWLock     *partition_lock;
    2476              :     BufferTag   tag;
    2477              : 
    2478              :     Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) == 1);
    2479              : 
    2480              :     /* have buffer pinned, so it's safe to read tag without lock */
    2481      1394829 :     tag = buf_hdr->tag;
    2482              : 
    2483      1394829 :     hash = BufTableHashCode(&tag);
    2484      1394829 :     partition_lock = BufMappingPartitionLock(hash);
    2485              : 
    2486      1394829 :     LWLockAcquire(partition_lock, LW_EXCLUSIVE);
    2487              : 
    2488              :     /* lock the buffer header */
    2489      1394829 :     buf_state = LockBufHdr(buf_hdr);
    2490              : 
    2491              :     /*
    2492              :      * We have the buffer pinned, so nobody else should have been able to
    2493              :      * unset this concurrently.
    2494              :      */
    2495              :     Assert(buf_state & BM_TAG_VALID);
    2496              :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    2497              :     Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
    2498              : 
    2499              :     /*
    2500              :      * If somebody else pinned the buffer since, or even worse, dirtied it,
    2501              :      * give up on this buffer: It's clearly in use.
    2502              :      */
    2503      1394829 :     if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
    2504              :     {
    2505              :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    2506              : 
    2507          395 :         UnlockBufHdr(buf_hdr);
    2508          395 :         LWLockRelease(partition_lock);
    2509              : 
    2510          395 :         return false;
    2511              :     }
    2512              : 
    2513              :     /*
    2514              :      * An invalidated buffer should not have any backends waiting to lock the
    2515              :      * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
    2516              :      */
    2517              :     Assert(!(buf_state & BM_LOCK_WAKE_IN_PROGRESS));
    2518              : 
    2519              :     /*
    2520              :      * Clear out the buffer's tag and flags and usagecount.  This is not
    2521              :      * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
    2522              :      * doing anything with the buffer. But currently it's beneficial, as the
    2523              :      * cheaper pre-checks in several linear scans of shared buffers use the
    2524              :      * tag (see e.g. FlushDatabaseBuffers()).
    2525              :      */
    2526      1394434 :     ClearBufferTag(&buf_hdr->tag);
    2527      1394434 :     UnlockBufHdrExt(buf_hdr, buf_state,
    2528              :                     0,
    2529              :                     BUF_FLAG_MASK | BUF_USAGECOUNT_MASK,
    2530              :                     0);
    2531              : 
    2532              :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    2533              : 
    2534              :     /* finally delete buffer from the buffer mapping table */
    2535      1394434 :     BufTableDelete(&tag, hash);
    2536              : 
    2537      1394434 :     LWLockRelease(partition_lock);
    2538              : 
    2539      1394434 :     buf_state = pg_atomic_read_u64(&buf_hdr->state);
    2540              :     Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
    2541              :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    2542              :     Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u64(&buf_hdr->state)) > 0);
    2543              : 
    2544      1394434 :     return true;
    2545              : }
    2546              : 
    2547              : static Buffer
    2548      2236828 : GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
    2549              : {
    2550              :     BufferDesc *buf_hdr;
    2551              :     Buffer      buf;
    2552              :     uint64      buf_state;
    2553              :     bool        from_ring;
    2554              : 
    2555              :     /*
    2556              :      * Ensure, before we pin a victim buffer, that there's a free refcount
    2557              :      * entry and resource owner slot for the pin.
    2558              :      */
    2559      2236828 :     ReservePrivateRefCountEntry();
    2560      2236828 :     ResourceOwnerEnlarge(CurrentResourceOwner);
    2561              : 
    2562              :     /* we return here if a prospective victim buffer gets used concurrently */
    2563        22491 : again:
    2564              : 
    2565              :     /*
    2566              :      * Select a victim buffer.  The buffer is returned pinned and owned by
    2567              :      * this backend.
    2568              :      */
    2569      2259319 :     buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
    2570      2259319 :     buf = BufferDescriptorGetBuffer(buf_hdr);
    2571              : 
    2572              :     /*
    2573              :      * We shouldn't have any other pins for this buffer.
    2574              :      */
    2575      2259319 :     CheckBufferIsPinnedOnce(buf);
    2576              : 
    2577              :     /*
    2578              :      * If the buffer was dirty, try to write it out.  There is a race
    2579              :      * condition here, another backend could dirty the buffer between
    2580              :      * StrategyGetBuffer() checking that it is not in use and invalidating the
    2581              :      * buffer below. That's addressed by InvalidateVictimBuffer() verifying
    2582              :      * that the buffer is not dirty.
    2583              :      */
    2584      2259319 :     if (buf_state & BM_DIRTY)
    2585              :     {
    2586              :         Assert(buf_state & BM_TAG_VALID);
    2587              :         Assert(buf_state & BM_VALID);
    2588              : 
    2589              :         /*
    2590              :          * We need a share-exclusive lock on the buffer contents to write it
    2591              :          * out (else we might write invalid data, eg because someone else is
    2592              :          * compacting the page contents while we write).  We must use a
    2593              :          * conditional lock acquisition here to avoid deadlock.  Even though
    2594              :          * the buffer was not pinned (and therefore surely not locked) when
    2595              :          * StrategyGetBuffer returned it, someone else could have pinned and
    2596              :          * (share-)exclusive-locked it by the time we get here. If we try to
    2597              :          * get the lock unconditionally, we'd block waiting for them; if they
    2598              :          * later block waiting for us, deadlock ensues. (This has been
    2599              :          * observed to happen when two backends are both trying to split btree
    2600              :          * index pages, and the second one just happens to be trying to split
    2601              :          * the page the first one got from StrategyGetBuffer.)
    2602              :          */
    2603       374520 :         if (!BufferLockConditional(buf, buf_hdr, BUFFER_LOCK_SHARE_EXCLUSIVE))
    2604              :         {
    2605              :             /*
    2606              :              * Someone else has locked the buffer, so give it up and loop back
    2607              :              * to get another one.
    2608              :              */
    2609            0 :             UnpinBuffer(buf_hdr);
    2610            0 :             goto again;
    2611              :         }
    2612              : 
    2613              :         /*
    2614              :          * If using a nondefault strategy, and this victim came from the
    2615              :          * strategy ring, let the strategy decide whether to reject it when
    2616              :          * reusing it would require a WAL flush.  This only applies to
    2617              :          * permanent buffers; unlogged buffers can have fake LSNs, so
    2618              :          * XLogNeedsFlush() is not meaningful for them.
    2619              :          *
    2620              :          * We need to hold the content lock in at least share-exclusive mode
    2621              :          * to safely inspect the page LSN, so this couldn't have been done
    2622              :          * inside StrategyGetBuffer().
    2623              :          */
    2624       374520 :         if (strategy && from_ring &&
    2625       194760 :             buf_state & BM_PERMANENT &&
    2626       124108 :             XLogNeedsFlush(BufferGetLSN(buf_hdr)) &&
    2627        26736 :             StrategyRejectBuffer(strategy, buf_hdr, from_ring))
    2628              :         {
    2629        22096 :             UnlockReleaseBuffer(buf);
    2630        22096 :             goto again;
    2631              :         }
    2632              : 
    2633              :         /* OK, do the I/O */
    2634       352424 :         FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
    2635       352424 :         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    2636              : 
    2637       352424 :         ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
    2638              :                                       &buf_hdr->tag);
    2639              :     }
    2640              : 
    2641              : 
    2642      2237223 :     if (buf_state & BM_VALID)
    2643              :     {
    2644              :         /*
    2645              :          * When a BufferAccessStrategy is in use, blocks evicted from shared
    2646              :          * buffers are counted as IOOP_EVICT in the corresponding context
    2647              :          * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
    2648              :          * strategy in two cases: 1) while initially claiming buffers for the
    2649              :          * strategy ring 2) to replace an existing strategy ring buffer
    2650              :          * because it is pinned or in use and cannot be reused.
    2651              :          *
    2652              :          * Blocks evicted from buffers already in the strategy ring are
    2653              :          * counted as IOOP_REUSE in the corresponding strategy context.
    2654              :          *
    2655              :          * At this point, we can accurately count evictions and reuses,
    2656              :          * because we have successfully claimed the valid buffer. Previously,
    2657              :          * we may have been forced to release the buffer due to concurrent
    2658              :          * pinners or erroring out.
    2659              :          */
    2660      1392320 :         pgstat_count_io_op(IOOBJECT_RELATION, io_context,
    2661      1392320 :                            from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
    2662              :     }
    2663              : 
    2664              :     /*
    2665              :      * If the buffer has an entry in the buffer mapping table, delete it. This
    2666              :      * can fail because another backend could have pinned or dirtied the
    2667              :      * buffer.
    2668              :      */
    2669      2237223 :     if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
    2670              :     {
    2671          395 :         UnpinBuffer(buf_hdr);
    2672          395 :         goto again;
    2673              :     }
    2674              : 
    2675              :     /* a final set of sanity checks */
    2676              : #ifdef USE_ASSERT_CHECKING
    2677              :     buf_state = pg_atomic_read_u64(&buf_hdr->state);
    2678              : 
    2679              :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
    2680              :     Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
    2681              : 
    2682              :     CheckBufferIsPinnedOnce(buf);
    2683              : #endif
    2684              : 
    2685      2236828 :     return buf;
    2686              : }
    2687              : 
    2688              : /*
    2689              :  * Return the maximum number of buffers that a backend should try to pin at once,
    2690              :  * to avoid exceeding its fair share.  This is the highest value that
    2691              :  * GetAdditionalPinLimit() could ever return.  Note that it may be zero on a
    2692              :  * system with a very small buffer pool relative to max_connections.
    2693              :  */
    2694              : uint32
    2695       779113 : GetPinLimit(void)
    2696              : {
    2697       779113 :     return MaxProportionalPins;
    2698              : }
    2699              : 
    2700              : /*
    2701              :  * Return the maximum number of additional buffers that this backend should
    2702              :  * pin if it wants to stay under the per-backend limit, considering the number
    2703              :  * of buffers it has already pinned.  Unlike LimitAdditionalPins(), the limit
    2704              :  * returned by this function can be zero.
    2705              :  */
    2706              : uint32
    2707      4327648 : GetAdditionalPinLimit(void)
    2708              : {
    2709              :     uint32      estimated_pins_held;
    2710              : 
    2711              :     /*
    2712              :      * We get the number of "overflowed" pins for free, but don't know the
    2713              :      * number of pins in PrivateRefCountArray.  The cost of calculating that
    2714              :      * exactly doesn't seem worth it, so just assume the max.
    2715              :      */
    2716      4327648 :     estimated_pins_held = PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
    2717              : 
    2718              :     /* Is this backend already holding more than its fair share? */
    2719      4327648 :     if (estimated_pins_held > MaxProportionalPins)
    2720      1484796 :         return 0;
    2721              : 
    2722      2842852 :     return MaxProportionalPins - estimated_pins_held;
    2723              : }
    2724              : 
    2725              : /*
    2726              :  * Limit the number of pins a batch operation may additionally acquire, to
    2727              :  * avoid running out of pinnable buffers.
    2728              :  *
    2729              :  * One additional pin is always allowed, on the assumption that the operation
    2730              :  * requires at least one to make progress.
    2731              :  */
    2732              : void
    2733       253801 : LimitAdditionalPins(uint32 *additional_pins)
    2734              : {
    2735              :     uint32      limit;
    2736              : 
    2737       253801 :     if (*additional_pins <= 1)
    2738       238289 :         return;
    2739              : 
    2740        15512 :     limit = GetAdditionalPinLimit();
    2741        15512 :     limit = Max(limit, 1);
    2742        15512 :     if (limit < *additional_pins)
    2743        10042 :         *additional_pins = limit;
    2744              : }
    2745              : 
    2746              : /*
    2747              :  * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to
    2748              :  * avoid duplicating the tracing and relpersistence related logic.
    2749              :  */
    2750              : static BlockNumber
    2751       268736 : ExtendBufferedRelCommon(BufferManagerRelation bmr,
    2752              :                         ForkNumber fork,
    2753              :                         BufferAccessStrategy strategy,
    2754              :                         uint32 flags,
    2755              :                         uint32 extend_by,
    2756              :                         BlockNumber extend_upto,
    2757              :                         Buffer *buffers,
    2758              :                         uint32 *extended_by)
    2759              : {
    2760              :     BlockNumber first_block;
    2761              : 
    2762              :     TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
    2763              :                                          BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
    2764              :                                          BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
    2765              :                                          BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
    2766              :                                          BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
    2767              :                                          extend_by);
    2768              : 
    2769       268736 :     if (bmr.relpersistence == RELPERSISTENCE_TEMP)
    2770        14935 :         first_block = ExtendBufferedRelLocal(bmr, fork, flags,
    2771              :                                              extend_by, extend_upto,
    2772              :                                              buffers, &extend_by);
    2773              :     else
    2774       253801 :         first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
    2775              :                                               extend_by, extend_upto,
    2776              :                                               buffers, &extend_by);
    2777       268736 :     *extended_by = extend_by;
    2778              : 
    2779              :     TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
    2780              :                                         BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
    2781              :                                         BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
    2782              :                                         BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
    2783              :                                         BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
    2784              :                                         *extended_by,
    2785              :                                         first_block);
    2786              : 
    2787       268736 :     return first_block;
    2788              : }
    2789              : 
    2790              : /*
    2791              :  * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
    2792              :  * shared buffers.
    2793              :  */
    2794              : static BlockNumber
    2795       253801 : ExtendBufferedRelShared(BufferManagerRelation bmr,
    2796              :                         ForkNumber fork,
    2797              :                         BufferAccessStrategy strategy,
    2798              :                         uint32 flags,
    2799              :                         uint32 extend_by,
    2800              :                         BlockNumber extend_upto,
    2801              :                         Buffer *buffers,
    2802              :                         uint32 *extended_by)
    2803              : {
    2804              :     BlockNumber first_block;
    2805       253801 :     IOContext   io_context = IOContextForStrategy(strategy);
    2806              :     instr_time  io_start;
    2807              : 
    2808       253801 :     LimitAdditionalPins(&extend_by);
    2809              : 
    2810              :     /*
    2811              :      * Acquire victim buffers for extension without holding extension lock.
    2812              :      * Writing out victim buffers is the most expensive part of extending the
    2813              :      * relation, particularly when doing so requires WAL flushes. Zeroing out
    2814              :      * the buffers is also quite expensive, so do that before holding the
    2815              :      * extension lock as well.
    2816              :      *
    2817              :      * These pages are pinned by us and not valid. While we hold the pin they
    2818              :      * can't be acquired as victim buffers by another backend.
    2819              :      */
    2820       534413 :     for (uint32 i = 0; i < extend_by; i++)
    2821              :     {
    2822              :         Block       buf_block;
    2823              : 
    2824       280612 :         buffers[i] = GetVictimBuffer(strategy, io_context);
    2825       280612 :         buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
    2826              : 
    2827              :         /* new buffers are zero-filled */
    2828       280612 :         MemSet(buf_block, 0, BLCKSZ);
    2829              :     }
    2830              : 
    2831              :     /*
    2832              :      * Lock relation against concurrent extensions, unless requested not to.
    2833              :      *
    2834              :      * We use the same extension lock for all forks. That's unnecessarily
    2835              :      * restrictive, but currently extensions for forks don't happen often
    2836              :      * enough to make it worth locking more granularly.
    2837              :      *
    2838              :      * Note that another backend might have extended the relation by the time
    2839              :      * we get the lock.
    2840              :      */
    2841       253801 :     if (!(flags & EB_SKIP_EXTENSION_LOCK))
    2842       198419 :         LockRelationForExtension(bmr.rel, ExclusiveLock);
    2843              : 
    2844              :     /*
    2845              :      * If requested, invalidate size cache, so that smgrnblocks asks the
    2846              :      * kernel.
    2847              :      */
    2848       253801 :     if (flags & EB_CLEAR_SIZE_CACHE)
    2849         9966 :         BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
    2850              : 
    2851       253801 :     first_block = smgrnblocks(BMR_GET_SMGR(bmr), fork);
    2852              : 
    2853              :     /*
    2854              :      * Now that we have the accurate relation size, check if the caller wants
    2855              :      * us to extend to only up to a specific size. If there were concurrent
    2856              :      * extensions, we might have acquired too many buffers and need to release
    2857              :      * them.
    2858              :      */
    2859       253801 :     if (extend_upto != InvalidBlockNumber)
    2860              :     {
    2861        57546 :         uint32      orig_extend_by = extend_by;
    2862              : 
    2863        57546 :         if (first_block > extend_upto)
    2864            0 :             extend_by = 0;
    2865        57546 :         else if ((uint64) first_block + extend_by > extend_upto)
    2866            8 :             extend_by = extend_upto - first_block;
    2867              : 
    2868        57564 :         for (uint32 i = extend_by; i < orig_extend_by; i++)
    2869              :         {
    2870           18 :             BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
    2871              : 
    2872           18 :             UnpinBuffer(buf_hdr);
    2873              :         }
    2874              : 
    2875        57546 :         if (extend_by == 0)
    2876              :         {
    2877            8 :             if (!(flags & EB_SKIP_EXTENSION_LOCK))
    2878            8 :                 UnlockRelationForExtension(bmr.rel, ExclusiveLock);
    2879            8 :             *extended_by = extend_by;
    2880            8 :             return first_block;
    2881              :         }
    2882              :     }
    2883              : 
    2884              :     /* Fail if relation is already at maximum possible length */
    2885       253793 :     if ((uint64) first_block + extend_by >= MaxBlockNumber)
    2886            0 :         ereport(ERROR,
    2887              :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
    2888              :                  errmsg("cannot extend relation %s beyond %u blocks",
    2889              :                         relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str,
    2890              :                         MaxBlockNumber)));
    2891              : 
    2892              :     /*
    2893              :      * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
    2894              :      *
    2895              :      * This needs to happen before we extend the relation, because as soon as
    2896              :      * we do, other backends can start to read in those pages.
    2897              :      */
    2898       534387 :     for (uint32 i = 0; i < extend_by; i++)
    2899              :     {
    2900       280594 :         Buffer      victim_buf = buffers[i];
    2901       280594 :         BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
    2902              :         BufferTag   tag;
    2903              :         uint32      hash;
    2904              :         LWLock     *partition_lock;
    2905              :         int         existing_id;
    2906              : 
    2907              :         /* in case we need to pin an existing buffer below */
    2908       280594 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    2909       280594 :         ReservePrivateRefCountEntry();
    2910              : 
    2911       280594 :         InitBufferTag(&tag, &BMR_GET_SMGR(bmr)->smgr_rlocator.locator, fork,
    2912              :                       first_block + i);
    2913       280594 :         hash = BufTableHashCode(&tag);
    2914       280594 :         partition_lock = BufMappingPartitionLock(hash);
    2915              : 
    2916       280594 :         LWLockAcquire(partition_lock, LW_EXCLUSIVE);
    2917              : 
    2918       280594 :         existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
    2919              : 
    2920              :         /*
    2921              :          * We get here only in the corner case where we are trying to extend
    2922              :          * the relation but we found a pre-existing buffer. This can happen
    2923              :          * because a prior attempt at extending the relation failed, and
    2924              :          * because mdread doesn't complain about reads beyond EOF (when
    2925              :          * zero_damaged_pages is ON) and so a previous attempt to read a block
    2926              :          * beyond EOF could have left a "valid" zero-filled buffer.
    2927              :          *
    2928              :          * This has also been observed when relation was overwritten by
    2929              :          * external process. Since the legitimate cases should always have
    2930              :          * left a zero-filled buffer, complain if not PageIsNew.
    2931              :          */
    2932       280594 :         if (existing_id >= 0)
    2933              :         {
    2934            0 :             BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
    2935              :             Block       buf_block;
    2936              :             bool        valid;
    2937              : 
    2938              :             /*
    2939              :              * Pin the existing buffer before releasing the partition lock,
    2940              :              * preventing it from being evicted.
    2941              :              */
    2942            0 :             valid = PinBuffer(existing_hdr, strategy, false);
    2943              : 
    2944            0 :             LWLockRelease(partition_lock);
    2945            0 :             UnpinBuffer(victim_buf_hdr);
    2946              : 
    2947            0 :             buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
    2948            0 :             buf_block = BufHdrGetBlock(existing_hdr);
    2949              : 
    2950            0 :             if (valid && !PageIsNew((Page) buf_block))
    2951            0 :                 ereport(ERROR,
    2952              :                         (errmsg("unexpected data beyond EOF in block %u of relation \"%s\"",
    2953              :                                 existing_hdr->tag.blockNum,
    2954              :                                 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str)));
    2955              : 
    2956              :             /*
    2957              :              * We *must* do smgr[zero]extend before succeeding, else the page
    2958              :              * will not be reserved by the kernel, and the next P_NEW call
    2959              :              * will decide to return the same page.  Clear the BM_VALID bit,
    2960              :              * do StartSharedBufferIO() and proceed.
    2961              :              *
    2962              :              * Loop to handle the very small possibility that someone re-sets
    2963              :              * BM_VALID between our clearing it and StartSharedBufferIO
    2964              :              * inspecting it.
    2965              :              */
    2966              :             while (true)
    2967            0 :             {
    2968              :                 StartBufferIOResult sbres;
    2969              : 
    2970            0 :                 pg_atomic_fetch_and_u64(&existing_hdr->state, ~BM_VALID);
    2971              : 
    2972            0 :                 sbres = StartSharedBufferIO(existing_hdr, true, true, NULL);
    2973              : 
    2974            0 :                 if (sbres != BUFFER_IO_ALREADY_DONE)
    2975            0 :                     break;
    2976              :             }
    2977              :         }
    2978              :         else
    2979              :         {
    2980              :             uint64      buf_state;
    2981       280594 :             uint64      set_bits = 0;
    2982              : 
    2983       280594 :             buf_state = LockBufHdr(victim_buf_hdr);
    2984              : 
    2985              :             /* some sanity checks while we hold the buffer header lock */
    2986              :             Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY)));
    2987              :             Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
    2988              : 
    2989       280594 :             victim_buf_hdr->tag = tag;
    2990              : 
    2991       280594 :             set_bits |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
    2992       280594 :             if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
    2993       275171 :                 set_bits |= BM_PERMANENT;
    2994              : 
    2995       280594 :             UnlockBufHdrExt(victim_buf_hdr, buf_state,
    2996              :                             set_bits, 0,
    2997              :                             0);
    2998              : 
    2999       280594 :             LWLockRelease(partition_lock);
    3000              : 
    3001              :             /* XXX: could combine the locked operations in it with the above */
    3002       280594 :             StartSharedBufferIO(victim_buf_hdr, true, true, NULL);
    3003              :         }
    3004              :     }
    3005              : 
    3006       253793 :     io_start = pgstat_prepare_io_time(track_io_timing);
    3007              : 
    3008              :     /*
    3009              :      * Note: if smgrzeroextend fails, we will end up with buffers that are
    3010              :      * allocated but not marked BM_VALID.  The next relation extension will
    3011              :      * still select the same block number (because the relation didn't get any
    3012              :      * longer on disk) and so future attempts to extend the relation will find
    3013              :      * the same buffers (if they have not been recycled) but come right back
    3014              :      * here to try smgrzeroextend again.
    3015              :      *
    3016              :      * We don't need to set checksum for all-zero pages.
    3017              :      */
    3018       253793 :     smgrzeroextend(BMR_GET_SMGR(bmr), fork, first_block, extend_by, false);
    3019              : 
    3020              :     /*
    3021              :      * Release the file-extension lock; it's now OK for someone else to extend
    3022              :      * the relation some more.
    3023              :      *
    3024              :      * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
    3025              :      * take noticeable time.
    3026              :      */
    3027       253793 :     if (!(flags & EB_SKIP_EXTENSION_LOCK))
    3028       198411 :         UnlockRelationForExtension(bmr.rel, ExclusiveLock);
    3029              : 
    3030       253793 :     pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
    3031       253793 :                             io_start, 1, extend_by * BLCKSZ);
    3032              : 
    3033              :     /* Set BM_VALID, terminate IO, and wake up any waiters */
    3034       534387 :     for (uint32 i = 0; i < extend_by; i++)
    3035              :     {
    3036       280594 :         Buffer      buf = buffers[i];
    3037       280594 :         BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
    3038       280594 :         bool        lock = false;
    3039              : 
    3040       280594 :         if (flags & EB_LOCK_FIRST && i == 0)
    3041       195927 :             lock = true;
    3042        84667 :         else if (flags & EB_LOCK_TARGET)
    3043              :         {
    3044              :             Assert(extend_upto != InvalidBlockNumber);
    3045        46446 :             if (first_block + i + 1 == extend_upto)
    3046        45877 :                 lock = true;
    3047              :         }
    3048              : 
    3049       280594 :         if (lock)
    3050       241804 :             LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
    3051              : 
    3052       280594 :         TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
    3053              :     }
    3054              : 
    3055       253793 :     pgBufferUsage.shared_blks_written += extend_by;
    3056              : 
    3057       253793 :     *extended_by = extend_by;
    3058              : 
    3059       253793 :     return first_block;
    3060              : }
    3061              : 
    3062              : /*
    3063              :  * BufferIsLockedByMe
    3064              :  *
    3065              :  *      Checks if this backend has the buffer locked in any mode.
    3066              :  *
    3067              :  * Buffer must be pinned.
    3068              :  */
    3069              : bool
    3070            0 : BufferIsLockedByMe(Buffer buffer)
    3071              : {
    3072              :     BufferDesc *bufHdr;
    3073              : 
    3074              :     Assert(BufferIsPinned(buffer));
    3075              : 
    3076            0 :     if (BufferIsLocal(buffer))
    3077              :     {
    3078              :         /* Content locks are not maintained for local buffers. */
    3079            0 :         return true;
    3080              :     }
    3081              :     else
    3082              :     {
    3083            0 :         bufHdr = GetBufferDescriptor(buffer - 1);
    3084            0 :         return BufferLockHeldByMe(bufHdr);
    3085              :     }
    3086              : }
    3087              : 
    3088              : /*
    3089              :  * BufferIsLockedByMeInMode
    3090              :  *
    3091              :  *      Checks if this backend has the buffer locked in the specified mode.
    3092              :  *
    3093              :  * Buffer must be pinned.
    3094              :  */
    3095              : bool
    3096            0 : BufferIsLockedByMeInMode(Buffer buffer, BufferLockMode mode)
    3097              : {
    3098              :     BufferDesc *bufHdr;
    3099              : 
    3100              :     Assert(BufferIsPinned(buffer));
    3101              : 
    3102            0 :     if (BufferIsLocal(buffer))
    3103              :     {
    3104              :         /* Content locks are not maintained for local buffers. */
    3105            0 :         return true;
    3106              :     }
    3107              :     else
    3108              :     {
    3109            0 :         bufHdr = GetBufferDescriptor(buffer - 1);
    3110            0 :         return BufferLockHeldByMeInMode(bufHdr, mode);
    3111              :     }
    3112              : }
    3113              : 
    3114              : /*
    3115              :  * BufferIsDirty
    3116              :  *
    3117              :  *      Checks if buffer is already dirty.
    3118              :  *
    3119              :  * Buffer must be pinned and [share-]exclusive-locked.  (Without such a lock,
    3120              :  * the result may be stale before it's returned.)
    3121              :  */
    3122              : bool
    3123        30578 : BufferIsDirty(Buffer buffer)
    3124              : {
    3125              :     BufferDesc *bufHdr;
    3126              : 
    3127              :     Assert(BufferIsPinned(buffer));
    3128              : 
    3129        30578 :     if (BufferIsLocal(buffer))
    3130              :     {
    3131         9523 :         int         bufid = -buffer - 1;
    3132              : 
    3133         9523 :         bufHdr = GetLocalBufferDescriptor(bufid);
    3134              :         /* Content locks are not maintained for local buffers. */
    3135              :     }
    3136              :     else
    3137              :     {
    3138        21055 :         bufHdr = GetBufferDescriptor(buffer - 1);
    3139              :         Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_SHARE_EXCLUSIVE) ||
    3140              :                BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
    3141              :     }
    3142              : 
    3143        30578 :     return pg_atomic_read_u64(&bufHdr->state) & BM_DIRTY;
    3144              : }
    3145              : 
    3146              : /*
    3147              :  * MarkBufferDirty
    3148              :  *
    3149              :  *      Marks buffer contents as dirty (actual write happens later).
    3150              :  *
    3151              :  * Buffer must be pinned and exclusive-locked.  (If caller does not hold
    3152              :  * exclusive lock, then somebody could be in process of writing the buffer,
    3153              :  * leading to risk of bad data written to disk.)
    3154              :  */
    3155              : void
    3156     34554098 : MarkBufferDirty(Buffer buffer)
    3157              : {
    3158              :     BufferDesc *bufHdr;
    3159              :     uint64      buf_state;
    3160              :     uint64      old_buf_state;
    3161              : 
    3162     34554098 :     if (!BufferIsValid(buffer))
    3163            0 :         elog(ERROR, "bad buffer ID: %d", buffer);
    3164              : 
    3165     34554098 :     if (BufferIsLocal(buffer))
    3166              :     {
    3167      1579835 :         MarkLocalBufferDirty(buffer);
    3168      1579835 :         return;
    3169              :     }
    3170              : 
    3171     32974263 :     bufHdr = GetBufferDescriptor(buffer - 1);
    3172              : 
    3173              :     Assert(BufferIsPinned(buffer));
    3174              :     Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
    3175              : 
    3176              :     /*
    3177              :      * NB: We have to wait for the buffer header spinlock to be not held, as
    3178              :      * TerminateBufferIO() relies on the spinlock.
    3179              :      */
    3180     32974263 :     old_buf_state = pg_atomic_read_u64(&bufHdr->state);
    3181              :     for (;;)
    3182              :     {
    3183     32974589 :         if (old_buf_state & BM_LOCKED)
    3184          462 :             old_buf_state = WaitBufHdrUnlocked(bufHdr);
    3185              : 
    3186     32974589 :         buf_state = old_buf_state;
    3187              : 
    3188              :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    3189     32974589 :         buf_state |= BM_DIRTY;
    3190              : 
    3191     32974589 :         if (pg_atomic_compare_exchange_u64(&bufHdr->state, &old_buf_state,
    3192              :                                            buf_state))
    3193     32974263 :             break;
    3194              :     }
    3195              : 
    3196              :     /*
    3197              :      * If the buffer was not dirty already, do vacuum accounting.
    3198              :      */
    3199     32974263 :     if (!(old_buf_state & BM_DIRTY))
    3200              :     {
    3201       801113 :         pgBufferUsage.shared_blks_dirtied++;
    3202       801113 :         if (VacuumCostActive)
    3203         8347 :             VacuumCostBalance += VacuumCostPageDirty;
    3204              :     }
    3205              : }
    3206              : 
    3207              : /*
    3208              :  * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
    3209              :  *
    3210              :  * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
    3211              :  * compared to calling the two routines separately.  Now it's mainly a
    3212              :  * convenience function.  However, if the passed buffer is valid and already
    3213              :  * holds the desired block of the relation's main fork, it is returned as-is,
    3214              :  * pin included, which saves considerable work over a release and reacquire.
    3215              :  *
    3216              :  * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
    3217              :  * buffer actually needs to be released.  This case is the same as ReadBuffer,
    3218              :  * but can save some tests in the caller.
    3219              :  */
    3220              : Buffer
    3221         3646 : ReleaseAndReadBuffer(Buffer buffer,
    3222              :                      Relation relation,
    3223              :                      BlockNumber blockNum)
    3224              : {
    3225         3646 :     ForkNumber  forkNum = MAIN_FORKNUM; /* tag checks below compare against the main fork only */
    3226              :     BufferDesc *bufHdr;
    3227              : 
    3228         3646 :     if (BufferIsValid(buffer))
    3229              :     {
    3230              :         Assert(BufferIsPinned(buffer));
    3231         3646 :         if (BufferIsLocal(buffer))
    3232              :         {
    3233           50 :             bufHdr = GetLocalBufferDescriptor(-buffer - 1);
    3234           50 :             if (bufHdr->tag.blockNum == blockNum &&
    3235            0 :                 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
    3236            0 :                 BufTagGetForkNum(&bufHdr->tag) == forkNum)
    3237            0 :                 return buffer;
    3238           50 :             UnpinLocalBuffer(buffer);   /* different page: drop the old pin */
    3239              :         }
    3240              :         else
    3241              :         {
    3242         3596 :             bufHdr = GetBufferDescriptor(buffer - 1);
    3243              :             /* we hold a pin, so it's OK to examine the tag without the header spinlock */
    3244         3596 :             if (bufHdr->tag.blockNum == blockNum &&
    3245            0 :                 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
    3246            0 :                 BufTagGetForkNum(&bufHdr->tag) == forkNum)
    3247            0 :                 return buffer;
    3248         3596 :             UnpinBuffer(bufHdr);        /* different page: drop the old pin */
    3249              :         }
    3250              :     }
    3251              : 
    3252         3646 :     return ReadBuffer(relation, blockNum);
    3253              : }
    3254              : 
    3255              : /*
    3256              :  * PinBuffer -- make buffer unavailable for replacement.
    3257              :  *
    3258              :  * For the default access strategy, the buffer's usage_count is incremented
    3259              :  * when we first pin it; for other strategies we just make sure the usage_count
    3260              :  * isn't zero.  (The idea of the latter is that we don't want synchronized
    3261              :  * heap scans to inflate the count, but we need it to not be zero to discourage
    3262              :  * other backends from stealing buffers from our ring.  As long as we cycle
    3263              :  * through the ring faster than the global clock-sweep cycles, buffers in
    3264              :  * our ring won't be chosen as victims for replacement by other backends.)
    3265              :  *
    3266              :  * This should be applied only to shared buffers, never local ones.
    3267              :  *
    3268              :  * Since buffers are pinned/unpinned very frequently, pin buffers without
    3269              :  * taking the buffer header lock; instead update the state variable in a loop
    3270              :  * of CAS operations. Hopefully it's just a single CAS.
    3271              :  *
    3272              :  * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
    3273              :  * must have been done already.
    3274              :  *
    3275              :  * Returns true if buffer is BM_VALID, else false.  This provision allows
    3276              :  * some callers to avoid an extra spinlock cycle.  If skip_if_not_valid is
    3277              :  * true, then a false return value also indicates that the buffer was
    3278              :  * (recently) invalid and has not been pinned.
    3279              :  */
    3280              : static bool
    3281     82007370 : PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
    3282              :           bool skip_if_not_valid)
    3283              : {
    3284     82007370 :     Buffer      b = BufferDescriptorGetBuffer(buf);
    3285              :     bool        result;
    3286              :     PrivateRefCountEntry *ref;
    3287              : 
    3288              :     Assert(!BufferIsLocal(b));
    3289              :     Assert(ReservedRefCountSlot != -1);
    3290              : 
    3291     82007370 :     ref = GetPrivateRefCountEntry(b, true);
    3292              : 
    3293     82007370 :     if (ref == NULL)            /* no backend-local entry: take a pin in the shared state */
    3294              :     {
    3295              :         uint64      buf_state;
    3296              :         uint64      old_buf_state;
    3297              : 
    3298     74668572 :         old_buf_state = pg_atomic_read_u64(&buf->state);
    3299              :         for (;;)
    3300              :         {
    3301     74691589 :             if (unlikely(skip_if_not_valid && !(old_buf_state & BM_VALID)))
    3302            6 :                 return false;   /* buffer left unpinned, as the header comment describes */
    3303              : 
    3304              :             /*
    3305              :              * We're not allowed to increase the refcount while the buffer
    3306              :              * header spinlock is held. Wait for the lock to be released.
    3307              :              */
    3308     74691583 :             if (unlikely(old_buf_state & BM_LOCKED))
    3309              :             {
    3310          171 :                 old_buf_state = WaitBufHdrUnlocked(buf);
    3311              : 
    3312              :                 /* perform checks at the top of the loop again */
    3313          171 :                 continue;
    3314              :             }
    3315              : 
    3316     74691412 :             buf_state = old_buf_state;
    3317              : 
    3318              :             /* increase refcount */
    3319     74691412 :             buf_state += BUF_REFCOUNT_ONE;
    3320              : 
    3321     74691412 :             if (strategy == NULL)
    3322              :             {
    3323              :                 /* Default case: increase usagecount unless already max. */
    3324     73866898 :                 if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
    3325      3770885 :                     buf_state += BUF_USAGECOUNT_ONE;
    3326              :             }
    3327              :             else
    3328              :             {
    3329              :                 /*
    3330              :                  * Ring buffers shouldn't evict others from pool.  Thus we
    3331              :                  * don't make usagecount more than 1.
    3332              :                  */
    3333       824514 :                 if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
    3334        38459 :                     buf_state += BUF_USAGECOUNT_ONE;
    3335              :             }
    3336              : 
    3337     74691412 :             if (pg_atomic_compare_exchange_u64(&buf->state, &old_buf_state,
    3338              :                                                buf_state))
    3339              :             {
    3340     74668566 :                 result = (buf_state & BM_VALID) != 0;   /* validity as of the state we just installed */
    3341              : 
    3342     74668566 :                 TrackNewBufferPin(b);
    3343     74668566 :                 break;
    3344              :             }
    3345              :         }
    3346              :     }
    3347              :     else
    3348              :     {
    3349              :         /*
    3350              :          * If we previously pinned the buffer, it is likely to be valid, but
    3351              :          * it may not be if StartReadBuffers() was called and
    3352              :          * WaitReadBuffers() hasn't been called yet.  We'll check by loading
    3353              :          * the flags without locking.  This is racy, but it's OK to return
    3354              :          * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
    3355              :          * it'll see that it's now valid.
    3356              :          *
    3357              :          * Note: We deliberately avoid a Valgrind client request here.
    3358              :          * Individual access methods can optionally superimpose buffer page
    3359              :          * client requests on top of our client requests to enforce that
    3360              :          * buffers are only accessed while locked (and pinned).  It's possible
    3361              :          * that the buffer page is legitimately non-accessible here.  We
    3362              :          * cannot meddle with that.
    3363              :          */
    3364      7338798 :         result = (pg_atomic_read_u64(&buf->state) & BM_VALID) != 0;
    3365              : 
    3366              :         Assert(ref->data.refcount > 0);
    3367      7338798 :         ref->data.refcount++;   /* only the backend-local count changes on this path */
    3368      7338798 :         ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
    3369              :     }
    3370              : 
    3371     82007364 :     return result;
    3372              : }
    3373              : 
    3374              : /*
    3375              :  * PinBuffer_Locked -- as above, but caller already locked the buffer header.
    3376              :  * The spinlock is released before return.
    3377              :  *
    3378              :  * As this function is called with the spinlock held, the caller has to
    3379              :  * previously call ReservePrivateRefCountEntry() and
    3380              :  * ResourceOwnerEnlarge(CurrentResourceOwner);
    3381              :  *
    3382              :  * Currently, no callers of this function want to modify the buffer's
    3383              :  * usage_count at all, so there's no need for a strategy parameter.
    3384              :  * Also we don't bother with a BM_VALID test (the caller could check that for
    3385              :  * itself).
    3386              :  *
    3387              :  * Also all callers only ever use this function when it's known that the
    3388              :  * buffer can't have a preexisting pin by this backend. That allows us to skip
    3389              :  * searching the private refcount array & hash, which is a boon, because the
    3390              :  * spinlock is still held.
    3391              :  *
    3392              :  * Note: use of this routine is frequently mandatory, not just an optimization
    3393              :  * to save a spin lock/unlock cycle, because we need to pin a buffer before
    3394              :  * its state can change under us.
    3395              :  */
    3396              : static void
    3397       351061 : PinBuffer_Locked(BufferDesc *buf)
    3398              : {
    3399              :     uint64      old_buf_state;
    3400              : 
    3401              :     /*
    3402              :      * As explained above, we don't expect any preexisting pins. That allows
    3403              :      * us to manipulate the PrivateRefCount after releasing the spinlock.
    3404              :      */
    3405              :     Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
    3406              : 
    3407              :     /*
    3408              :      * Since we hold the buffer spinlock, we can update the buffer state and
    3409              :      * release the lock in one operation.
    3410              :      */
    3411       351061 :     old_buf_state = pg_atomic_read_u64(&buf->state);
    3412              : 
    3413       351061 :     UnlockBufHdrExt(buf, old_buf_state,
    3414              :                     0, 0, 1);   /* presumably: set no bits, clear no bits, refcount +1 -- TODO confirm */
    3415              : 
    3416       351061 :     TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
    3417       351061 : }
    3418              : 
    3419              : /*
    3420              :  * Support for waking up another backend that is waiting for the cleanup lock
    3421              :  * to be released using BM_PIN_COUNT_WAITER.
    3422              :  *
    3423              :  * See LockBufferForCleanup().
    3424              :  *
    3425              :  * Expected to be called just after releasing a buffer pin (in a BufferDesc,
    3426              :  * not just reducing the backend-local pincount for the buffer).
    3427              :  */
    3428              : static void
    3429           63 : WakePinCountWaiter(BufferDesc *buf)
    3430              : {
    3431              :     /*
    3432              :      * Acquire the buffer header lock, re-check that there's a waiter. Another
    3433              :      * backend could have unpinned this buffer, and already woken up the
    3434              :      * waiter.
    3435              :      *
    3436              :      * There's no danger of the buffer being replaced after our caller
    3437              :      * unpinned it, as it's pinned by the waiter. The waiter removes
    3438              :      * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
    3439              :      * backend waking it up.
    3440              :      */
    3441           63 :     uint64      buf_state = LockBufHdr(buf);
    3442              : 
    3443           63 :     if ((buf_state & BM_PIN_COUNT_WAITER) &&
    3444           63 :         BUF_STATE_GET_REFCOUNT(buf_state) == 1)
    3445           63 :     {
    3446              :         /* we just released the last pin other than the waiter's */
    3447           63 :         int         wait_backend_pgprocno = buf->wait_backend_pgprocno; /* fetched before the header lock is dropped */
    3448              : 
    3449           63 :         UnlockBufHdrExt(buf, buf_state,
    3450              :                         0, BM_PIN_COUNT_WAITER,
    3451              :                         0);
    3452           63 :         ProcSendSignal(wait_backend_pgprocno);
    3453              :     }
    3454              :     else
    3455            0 :         UnlockBufHdr(buf);      /* waiter flag already gone, or other pins remain: nothing to signal */
    3456           63 : }
    3457              : 
    3458              : /*
    3459              :  * UnpinBuffer -- make buffer available for replacement.
    3460              :  *
    3461              :  * This should be applied only to shared buffers, never local ones.  It always
    3462              :  * forgets the pin in CurrentResourceOwner, then calls UnpinBufferNoOwner().
    3463              :  */
    3464              : static void
    3465     50109870 : UnpinBuffer(BufferDesc *buf)
    3466              : {
    3467     50109870 :     Buffer      b = BufferDescriptorGetBuffer(buf);
    3468              : 
    3469     50109870 :     ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
    3470     50109870 :     UnpinBufferNoOwner(buf);
    3471     50109870 : }
    3472              : /* UnpinBufferNoOwner -- drop one pin on a shared buffer; makes no resource owner calls itself. */
    3473              : static void
    3474     50116425 : UnpinBufferNoOwner(BufferDesc *buf)
    3475              : {
    3476              :     PrivateRefCountEntry *ref;
    3477     50116425 :     Buffer      b = BufferDescriptorGetBuffer(buf);
    3478              : 
    3479              :     Assert(!BufferIsLocal(b));
    3480              : 
    3481              :     /* pass false (no move): we're likely deleting the entry soon anyway */
    3482     50116425 :     ref = GetPrivateRefCountEntry(b, false);
    3483              :     Assert(ref != NULL);
    3484              :     Assert(ref->data.refcount > 0);
    3485     50116425 :     ref->data.refcount--;
    3486     50116425 :     if (ref->data.refcount == 0)    /* this backend's last pin: release the shared pin too */
    3487              :     {
    3488              :         uint64      old_buf_state;
    3489              : 
    3490              :         /*
    3491              :          * Mark buffer non-accessible to Valgrind.
    3492              :          *
    3493              :          * Note that the buffer may have already been marked non-accessible
    3494              :          * within access method code that enforces that buffers are only
    3495              :          * accessed while a buffer lock is held.
    3496              :          */
    3497              :         VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
    3498              : 
    3499              :         /*
    3500              :          * I'd better not still hold the buffer content lock. Can't use
    3501              :          * BufferIsLockedByMe(), as that asserts the buffer is pinned.
    3502              :          */
    3503              :         Assert(!BufferLockHeldByMe(buf));
    3504              : 
    3505              :         /* decrement the shared reference count; old_buf_state is the pre-decrement value */
    3506     30311684 :         old_buf_state = pg_atomic_fetch_sub_u64(&buf->state, BUF_REFCOUNT_ONE);
    3507              : 
    3508              :         /* Support LockBufferForCleanup(): a waiter flag was set when we unpinned */
    3509     30311684 :         if (old_buf_state & BM_PIN_COUNT_WAITER)
    3510           62 :             WakePinCountWaiter(buf);
    3511              : 
    3512     30311684 :         ForgetPrivateRefCountEntry(ref);
    3513              :     }
    3514     50116425 : }
    3515              : 
    3516              : /*
    3517              :  * Set up backend-local tracking of a buffer pinned the first time by this
    3518              :  * backend: new private refcount entry, resource owner record, Valgrind state.
    3519              :  */
    3520              : inline void                     /* NOTE(review): plain "inline", no "static"; presumably a prototype elsewhere gives external linkage -- confirm */
    3521     77278946 : TrackNewBufferPin(Buffer buf)
    3522              : {
    3523              :     PrivateRefCountEntry *ref;
    3524              : 
    3525     77278946 :     ref = NewPrivateRefCountEntry(buf);
    3526     77278946 :     ref->data.refcount++;       /* presumably 0 -> 1 for a fresh entry -- TODO confirm */
    3527              : 
    3528     77278946 :     ResourceOwnerRememberBuffer(CurrentResourceOwner, buf);
    3529              : 
    3530              :     /*
    3531              :      * This is the first pin for this page by this backend, mark its page as
    3532              :      * defined to valgrind. While the page contents might not actually be
    3533              :      * valid yet, we don't currently guarantee that such pages are marked
    3534              :      * undefined or non-accessible.
    3535              :      *
    3536              :      * It's not necessarily the prettiest to do this here, but otherwise we'd
    3537              :      * need this block of code in multiple places.
    3538              :      */
    3539              :     VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(GetBufferDescriptor(buf - 1)),
    3540              :                               BLCKSZ);
    3541     77278946 : }
    3542              : /* Instantiate lib/sort_template.h; presumably emits static sort_checkpoint_bufferids() over CkptSortItem using ckpt_buforder_comparator() -- confirm against the template header. */
    3543              : #define ST_SORT sort_checkpoint_bufferids
    3544              : #define ST_ELEMENT_TYPE CkptSortItem
    3545              : #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
    3546              : #define ST_SCOPE static
    3547              : #define ST_DEFINE
    3548              : #include "lib/sort_template.h"
    3549              : 
    3550              : /*
    3551              :  * BufferSync -- Write out all dirty buffers in the pool.
    3552              :  *
    3553              :  * This is called at checkpoint time to write out all dirty shared buffers.
    3554              :  * The checkpoint request flags should be passed in.  If CHECKPOINT_FAST is
    3555              :  * set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
    3556              :  * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_UNLOGGED is set, we write
    3557              :  * even unlogged buffers, which are otherwise skipped.  The remaining flags
    3558              :  * currently have no effect here.
    3559              :  */
    3560              : static void
    3561         1948 : BufferSync(int flags)
    3562              : {
    3563              :     uint64      buf_state;
    3564              :     int         buf_id;
    3565              :     int         num_to_scan;
    3566              :     int         num_spaces;
    3567              :     int         num_processed;
    3568              :     int         num_written;
    3569         1948 :     CkptTsStatus *per_ts_stat = NULL;
    3570              :     Oid         last_tsid;
    3571              :     binaryheap *ts_heap;
    3572              :     int         i;
    3573         1948 :     uint64      mask = BM_DIRTY;
    3574              :     WritebackContext wb_context;
    3575              : 
    3576              :     /*
    3577              :      * Unless this is a shutdown checkpoint or we have been explicitly told,
    3578              :      * we write only permanent, dirty buffers.  But at shutdown or end of
    3579              :      * recovery, we write all dirty buffers.
    3580              :      */
    3581         1948 :     if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
    3582              :                     CHECKPOINT_FLUSH_UNLOGGED))))
    3583         1060 :         mask |= BM_PERMANENT;
    3584              : 
    3585              :     /*
    3586              :      * Loop over all buffers, and mark the ones that need to be written with
    3587              :      * BM_CHECKPOINT_NEEDED.  Count them as we go (num_to_scan), so that we
    3588              :      * can estimate how much work needs to be done.
    3589              :      *
    3590              :      * This allows us to write only those pages that were dirty when the
    3591              :      * checkpoint began, and not those that get dirtied while it proceeds.
    3592              :      * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
    3593              :      * later in this function, or by normal backends or the bgwriter cleaning
    3594              :      * scan, the flag is cleared.  Any buffer dirtied after this point won't
    3595              :      * have the flag set.
    3596              :      *
    3597              :      * Note that if we fail to write some buffer, we may leave buffers with
    3598              :      * BM_CHECKPOINT_NEEDED still set.  This is OK since any such buffer would
    3599              :      * certainly need to be written for the next checkpoint attempt, too.
    3600              :      */
    3601         1948 :     num_to_scan = 0;
    3602     14101100 :     for (buf_id = 0; buf_id < NBuffers; buf_id++)
    3603              :     {
    3604     14099152 :         BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
    3605     14099152 :         uint64      set_bits = 0;
    3606              : 
    3607              :         /*
    3608              :          * Header spinlock is enough to examine BM_DIRTY, see comment in
    3609              :          * SyncOneBuffer.
    3610              :          */
    3611     14099152 :         buf_state = LockBufHdr(bufHdr);
    3612              : 
    3613     14099152 :         if ((buf_state & mask) == mask)
    3614              :         {
    3615              :             CkptSortItem *item;
    3616              : 
    3617       340275 :             set_bits = BM_CHECKPOINT_NEEDED;
    3618              : 
    3619       340275 :             item = &CkptBufferIds[num_to_scan++];
    3620       340275 :             item->buf_id = buf_id;
    3621       340275 :             item->tsId = bufHdr->tag.spcOid;
    3622       340275 :             item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
    3623       340275 :             item->forkNum = BufTagGetForkNum(&bufHdr->tag);
    3624       340275 :             item->blockNum = bufHdr->tag.blockNum;
    3625              :         }
    3626              : 
    3627     14099152 :         UnlockBufHdrExt(bufHdr, buf_state,
    3628              :                         set_bits, 0,
    3629              :                         0);
    3630              : 
    3631              :         /* Check for barrier events in case NBuffers is large. */
    3632     14099152 :         if (ProcSignalBarrierPending)
    3633            0 :             ProcessProcSignalBarrier();
    3634              :     }
    3635              : 
    3636         1948 :     if (num_to_scan == 0)
    3637          739 :         return;                 /* nothing to do */
    3638              : 
    3639         1209 :     WritebackContextInit(&wb_context, &checkpoint_flush_after);
    3640              : 
    3641              :     TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
    3642              : 
    3643              :     /*
    3644              :      * Sort buffers that need to be written to reduce the likelihood of random
    3645              :      * IO. The sorting is also important for the implementation of balancing
    3646              :      * writes between tablespaces. Without balancing writes we'd potentially
    3647              :      * end up writing to the tablespaces one-by-one; possibly overloading the
    3648              :      * underlying system.
    3649              :      */
    3650         1209 :     sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
    3651              : 
    3652         1209 :     num_spaces = 0;
    3653              : 
    3654              :     /*
    3655              :      * Allocate progress status for each tablespace with buffers that need to
    3656              :      * be flushed. This requires the to-be-flushed array to be sorted.
    3657              :      */
    3658         1209 :     last_tsid = InvalidOid;
    3659       341484 :     for (i = 0; i < num_to_scan; i++)
    3660              :     {
    3661              :         CkptTsStatus *s;
    3662              :         Oid         cur_tsid;
    3663              : 
    3664       340275 :         cur_tsid = CkptBufferIds[i].tsId;
    3665              : 
    3666              :         /*
    3667              :          * Grow array of per-tablespace status structs, every time a new
    3668              :          * tablespace is found.
    3669              :          */
    3670       340275 :         if (last_tsid == InvalidOid || last_tsid != cur_tsid)
    3671         1855 :         {
    3672              :             Size        sz;
    3673              : 
    3674         1855 :             num_spaces++;
    3675              : 
    3676              :             /*
    3677              :              * Not worth adding grow-by-power-of-2 logic here - even with a
    3678              :              * few hundred tablespaces this should be fine.
    3679              :              */
    3680         1855 :             sz = sizeof(CkptTsStatus) * num_spaces;
    3681              : 
    3682         1855 :             if (per_ts_stat == NULL)
    3683         1209 :                 per_ts_stat = (CkptTsStatus *) palloc(sz);
    3684              :             else
    3685          646 :                 per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
    3686              : 
    3687         1855 :             s = &per_ts_stat[num_spaces - 1];
    3688         1855 :             memset(s, 0, sizeof(*s));
    3689         1855 :             s->tsId = cur_tsid;
    3690              : 
    3691              :             /*
    3692              :              * The first buffer in this tablespace. As CkptBufferIds is sorted
    3693              :              * by tablespace all (s->num_to_scan) buffers in this tablespace
    3694              :              * will follow afterwards.
    3695              :              */
    3696         1855 :             s->index = i;
    3697              : 
    3698              :             /*
    3699              :              * progress_slice will be determined once we know how many buffers
    3700              :              * are in each tablespace, i.e. after this loop.
    3701              :              */
    3702              : 
    3703         1855 :             last_tsid = cur_tsid;
    3704              :         }
    3705              :         else
    3706              :         {
    3707       338420 :             s = &per_ts_stat[num_spaces - 1];
    3708              :         }
    3709              : 
    3710       340275 :         s->num_to_scan++;
    3711              : 
    3712              :         /* Check for barrier events. */
    3713       340275 :         if (ProcSignalBarrierPending)
    3714            0 :             ProcessProcSignalBarrier();
    3715              :     }
    3716              : 
    3717              :     Assert(num_spaces > 0);
    3718              : 
    3719              :     /*
    3720              :      * Build a min-heap over the write-progress in the individual tablespaces,
    3721              :      * and compute how large a portion of the total progress a single
    3722              :      * processed buffer is.
    3723              :      */
    3724         1209 :     ts_heap = binaryheap_allocate(num_spaces,
    3725              :                                   ts_ckpt_progress_comparator,
    3726              :                                   NULL);
    3727              : 
    3728         3064 :     for (i = 0; i < num_spaces; i++)
    3729              :     {
    3730         1855 :         CkptTsStatus *ts_stat = &per_ts_stat[i];
    3731              : 
    3732         1855 :         ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
    3733              : 
    3734         1855 :         binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
    3735              :     }
    3736              : 
    3737         1209 :     binaryheap_build(ts_heap);
    3738              : 
    3739              :     /*
    3740              :      * Iterate through to-be-checkpointed buffers and write the ones (still)
    3741              :      * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
    3742              :      * tablespaces; otherwise the sorting would lead to only one tablespace
    3743              :      * receiving writes at a time, making inefficient use of the hardware.
    3744              :      */
    3745         1209 :     num_processed = 0;
    3746         1209 :     num_written = 0;
    3747       341484 :     while (!binaryheap_empty(ts_heap))
    3748              :     {
    3749       340275 :         BufferDesc *bufHdr = NULL;
    3750              :         CkptTsStatus *ts_stat = (CkptTsStatus *)
    3751       340275 :             DatumGetPointer(binaryheap_first(ts_heap));
    3752              : 
    3753       340275 :         buf_id = CkptBufferIds[ts_stat->index].buf_id;
    3754              :         Assert(buf_id != -1);
    3755              : 
    3756       340275 :         bufHdr = GetBufferDescriptor(buf_id);
    3757              : 
    3758       340275 :         num_processed++;
    3759              : 
    3760              :         /*
    3761              :          * We don't need to acquire the lock here, because we're only looking
    3762              :          * at a single bit. It's possible that someone else writes the buffer
    3763              :          * and clears the flag right after we check, but that doesn't matter
    3764              :          * since SyncOneBuffer will then do nothing.  However, there is a
    3765              :          * further race condition: it's conceivable that between the time we
    3766              :          * examine the bit here and the time SyncOneBuffer acquires the lock,
    3767              :          * someone else not only wrote the buffer but replaced it with another
    3768              :          * page and dirtied it.  In that improbable case, SyncOneBuffer will
    3769              :          * write the buffer though we didn't need to.  It doesn't seem worth
    3770              :          * guarding against this, though.
    3771              :          */
    3772       340275 :         if (pg_atomic_read_u64(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
    3773              :         {
    3774       318523 :             if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
    3775              :             {
    3776              :                 TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
    3777       318522 :                 PendingCheckpointerStats.buffers_written++;
    3778       318522 :                 num_written++;
    3779              :             }
    3780              :         }
    3781              : 
    3782              :         /*
    3783              :          * Measure progress independent of actually having to flush the buffer
    3784              :          * - otherwise writing becomes unbalanced.
    3785              :          */
    3786       340275 :         ts_stat->progress += ts_stat->progress_slice;
    3787       340275 :         ts_stat->num_scanned++;
    3788       340275 :         ts_stat->index++;
    3789              : 
    3790              :         /* Have all the buffers from the tablespace been processed? */
    3791       340275 :         if (ts_stat->num_scanned == ts_stat->num_to_scan)
    3792              :         {
    3793         1855 :             binaryheap_remove_first(ts_heap);
    3794              :         }
    3795              :         else
    3796              :         {
    3797              :             /* update heap with the new progress */
    3798       338420 :             binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
    3799              :         }
    3800              : 
    3801              :         /*
    3802              :          * Sleep to throttle our I/O rate.
    3803              :          *
    3804              :          * (This will check for barrier events even if it doesn't sleep.)
    3805              :          */
    3806       340275 :         CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
    3807              :     }
    3808              : 
    3809              :     /*
    3810              :      * Issue all pending flushes. Only checkpointer calls BufferSync(), so
    3811              :      * IOContext will always be IOCONTEXT_NORMAL.
    3812              :      */
    3813         1209 :     IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
    3814              : 
    3815         1209 :     pfree(per_ts_stat);
    3816         1209 :     per_ts_stat = NULL;
    3817         1209 :     binaryheap_free(ts_heap);
    3818              : 
    3819              :     /*
    3820              :      * Update checkpoint statistics. As noted above, this doesn't include
    3821              :      * buffers written by other backends or bgwriter scan.
    3822              :      */
    3823         1209 :     CheckpointStats.ckpt_bufs_written += num_written;
    3824              : 
    3825              :     TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
    3826              : }
    3827              : 
    3828              : /*
    3829              :  * BgBufferSync -- Write out some dirty buffers in the pool.
    3830              :  *
    3831              :  * This is called periodically by the background writer process.
    3832              :  *
    3833              :  * Returns true if it's appropriate for the bgwriter process to go into
    3834              :  * low-power hibernation mode.  (This happens if the strategy clock-sweep
    3835              :  * has been "lapped" and no buffer allocations have occurred recently,
    3836              :  * or if the bgwriter has been effectively disabled by setting
    3837              :  * bgwriter_lru_maxpages to 0.)
    3838              :  */
    3839              : bool
    3840        15020 : BgBufferSync(WritebackContext *wb_context)
    3841              : {
    3842              :     /* info obtained from freelist.c */
    3843              :     int         strategy_buf_id;
    3844              :     uint32      strategy_passes;
    3845              :     uint32      recent_alloc;
    3846              : 
    3847              :     /*
    3848              :      * Information saved between calls so we can determine the strategy
    3849              :      * point's advance rate and avoid scanning already-cleaned buffers.
    3850              :      */
    3851              :     static bool saved_info_valid = false;
    3852              :     static int  prev_strategy_buf_id;
    3853              :     static uint32 prev_strategy_passes;
    3854              :     static int  next_to_clean;
    3855              :     static uint32 next_passes;
    3856              : 
    3857              :     /* Moving averages of allocation rate and clean-buffer density */
    3858              :     static float smoothed_alloc = 0;
    3859              :     static float smoothed_density = 10.0;
    3860              : 
    3861              :     /* Potentially these could be tunables, but for now, not */
    3862        15020 :     float       smoothing_samples = 16;
    3863        15020 :     float       scan_whole_pool_milliseconds = 120000.0;
    3864              : 
    3865              :     /* Used to compute how far we scan ahead */
    3866              :     long        strategy_delta;
    3867              :     int         bufs_to_lap;
    3868              :     int         bufs_ahead;
    3869              :     float       scans_per_alloc;
    3870              :     int         reusable_buffers_est;
    3871              :     int         upcoming_alloc_est;
    3872              :     int         min_scan_buffers;
    3873              : 
    3874              :     /* Variables for the scanning loop proper */
    3875              :     int         num_to_scan;
    3876              :     int         num_written;
    3877              :     int         reusable_buffers;
    3878              : 
    3879              :     /* Variables for final smoothed_density update */
    3880              :     long        new_strategy_delta;
    3881              :     uint32      new_recent_alloc;
    3882              : 
    3883              :     /*
    3884              :      * Find out where the clock-sweep currently is, and how many buffer
    3885              :      * allocations have happened since our last call.
    3886              :      */
    3887        15020 :     strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
    3888              : 
    3889              :     /* Report buffer alloc counts to pgstat */
    3890        15020 :     PendingBgWriterStats.buf_alloc += recent_alloc;
    3891              : 
    3892              :     /*
    3893              :      * If we're not running the LRU scan, just stop after doing the stats
    3894              :      * stuff.  We mark the saved state invalid so that we can recover sanely
    3895              :      * if LRU scan is turned back on later.
    3896              :      */
    3897        15020 :     if (bgwriter_lru_maxpages <= 0)
    3898              :     {
    3899           44 :         saved_info_valid = false;
    3900           44 :         return true;
    3901              :     }
    3902              : 
    3903              :     /*
    3904              :      * Compute strategy_delta = how many buffers have been scanned by the
    3905              :      * clock-sweep since last time.  If first time through, assume none. Then
    3906              :      * see if we are still ahead of the clock-sweep, and if so, how many
    3907              :      * buffers we could scan before we'd catch up with it and "lap" it. Note:
    3908              :      * the weird-looking coding of xxx_passes comparisons is to avoid bogus
    3909              :      * behavior when the passes counts wrap around.
    3910              :      */
    3911        14976 :     if (saved_info_valid)
    3912              :     {
    3913        14338 :         int32       passes_delta = strategy_passes - prev_strategy_passes;
    3914              : 
    3915        14338 :         strategy_delta = strategy_buf_id - prev_strategy_buf_id;
    3916        14338 :         strategy_delta += (long) passes_delta * NBuffers;
    3917              : 
    3918              :         Assert(strategy_delta >= 0);
    3919              : 
    3920        14338 :         if ((int32) (next_passes - strategy_passes) > 0)
    3921              :         {
    3922              :             /* we're one pass ahead of the strategy point */
    3923         2497 :             bufs_to_lap = strategy_buf_id - next_to_clean;
    3924              : #ifdef BGW_DEBUG
    3925              :             elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
    3926              :                  next_passes, next_to_clean,
    3927              :                  strategy_passes, strategy_buf_id,
    3928              :                  strategy_delta, bufs_to_lap);
    3929              : #endif
    3930              :         }
    3931        11841 :         else if (next_passes == strategy_passes &&
    3932         9285 :                  next_to_clean >= strategy_buf_id)
    3933              :         {
    3934              :             /* on same pass, but ahead or at least not behind */
    3935         8373 :             bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
    3936              : #ifdef BGW_DEBUG
    3937              :             elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
    3938              :                  next_passes, next_to_clean,
    3939              :                  strategy_passes, strategy_buf_id,
    3940              :                  strategy_delta, bufs_to_lap);
    3941              : #endif
    3942              :         }
    3943              :         else
    3944              :         {
    3945              :             /*
    3946              :              * We're behind, so skip forward to the strategy point and start
    3947              :              * cleaning from there.
    3948              :              */
    3949              : #ifdef BGW_DEBUG
    3950              :             elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
    3951              :                  next_passes, next_to_clean,
    3952              :                  strategy_passes, strategy_buf_id,
    3953              :                  strategy_delta);
    3954              : #endif
    3955         3468 :             next_to_clean = strategy_buf_id;
    3956         3468 :             next_passes = strategy_passes;
    3957         3468 :             bufs_to_lap = NBuffers;
    3958              :         }
    3959              :     }
    3960              :     else
    3961              :     {
    3962              :         /*
    3963              :          * Initializing at startup or after LRU scanning had been off. Always
    3964              :          * start at the strategy point.
    3965              :          */
    3966              : #ifdef BGW_DEBUG
    3967              :         elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
    3968              :              strategy_passes, strategy_buf_id);
    3969              : #endif
    3970          638 :         strategy_delta = 0;
    3971          638 :         next_to_clean = strategy_buf_id;
    3972          638 :         next_passes = strategy_passes;
    3973          638 :         bufs_to_lap = NBuffers;
    3974              :     }
    3975              : 
    3976              :     /* Update saved info for next time */
    3977        14976 :     prev_strategy_buf_id = strategy_buf_id;
    3978        14976 :     prev_strategy_passes = strategy_passes;
    3979        14976 :     saved_info_valid = true;
    3980              : 
    3981              :     /*
    3982              :      * Compute how many buffers had to be scanned for each new allocation, ie,
    3983              :      * 1/density of reusable buffers, and track a moving average of that.
    3984              :      *
    3985              :      * If the strategy point didn't move, we don't update the density estimate.
    3986              :      */
    3987        14976 :     if (strategy_delta > 0 && recent_alloc > 0)
    3988              :     {
    3989         8009 :         scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
    3990         8009 :         smoothed_density += (scans_per_alloc - smoothed_density) /
    3991              :             smoothing_samples;
    3992              :     }
    3993              : 
    3994              :     /*
    3995              :      * Estimate how many reusable buffers there are between the current
    3996              :      * strategy point and where we've scanned ahead to, based on the smoothed
    3997              :      * density estimate.
    3998              :      */
    3999        14976 :     bufs_ahead = NBuffers - bufs_to_lap;
    4000        14976 :     reusable_buffers_est = (float) bufs_ahead / smoothed_density;
    4001              : 
    4002              :     /*
    4003              :      * Track a moving average of recent buffer allocations.  Here, rather than
    4004              :      * a true average we want a fast-attack, slow-decline behavior: we
    4005              :      * immediately follow any increase.
    4006              :      */
    4007        14976 :     if (smoothed_alloc <= (float) recent_alloc)
    4008         4065 :         smoothed_alloc = recent_alloc;
    4009              :     else
    4010        10911 :         smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
    4011              :             smoothing_samples;
    4012              : 
    4013              :     /* Scale the estimate by a GUC to allow more aggressive tuning. */
    4014        14976 :     upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
    4015              : 
    4016              :     /*
    4017              :      * If recent_alloc remains at zero for many cycles, smoothed_alloc will
    4018              :      * eventually underflow to zero, and the underflows produce annoying
    4019              :      * kernel warnings on some platforms.  Once upcoming_alloc_est has gone to
    4020              :      * zero, there's no point in tracking smaller and smaller values of
    4021              :      * smoothed_alloc, so just reset it to exactly zero to avoid this
    4022              :      * syndrome.  It will pop back up as soon as recent_alloc increases.
    4023              :      */
    4024        14976 :     if (upcoming_alloc_est == 0)
    4025         2440 :         smoothed_alloc = 0;
    4026              : 
    4027              :     /*
    4028              :      * Even in cases where there's been little or no buffer allocation
    4029              :      * activity, we want to make a small amount of progress through the buffer
    4030              :      * cache so that as many reusable buffers as possible are clean after an
    4031              :      * idle period.
    4032              :      *
    4033              :      * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
    4034              :      * the BGW will be called during the scan_whole_pool time; slice the
    4035              :      * buffer pool into that many sections.
    4036              :      */
    4037        14976 :     min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
    4038              : 
    4039        14976 :     if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
    4040              :     {
    4041              : #ifdef BGW_DEBUG
    4042              :         elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
    4043              :              upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
    4044              : #endif
    4045         7282 :         upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
    4046              :     }
    4047              : 
    4048              :     /*
    4049              :      * Now write out dirty reusable buffers, working forward from the
    4050              :      * next_to_clean point, until we have lapped the strategy scan, or cleaned
    4051              :      * enough buffers to match our estimate of the next cycle's allocation
    4052              :      * requirements, or hit the bgwriter_lru_maxpages limit.
    4053              :      */
    4054              : 
    4055        14976 :     num_to_scan = bufs_to_lap;
    4056        14976 :     num_written = 0;
    4057        14976 :     reusable_buffers = reusable_buffers_est;
    4058              : 
    4059              :     /* Execute the LRU scan */
    4060      2125384 :     while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
    4061              :     {
    4062      2110409 :         int         sync_state = SyncOneBuffer(next_to_clean, true,
    4063              :                                                wb_context);
    4064              : 
    4065      2110409 :         if (++next_to_clean >= NBuffers)
    4066              :         {
    4067         3313 :             next_to_clean = 0;
    4068         3313 :             next_passes++;
    4069              :         }
    4070      2110409 :         num_to_scan--;
    4071              : 
    4072      2110409 :         if (sync_state & BUF_WRITTEN)
    4073              :         {
    4074        27494 :             reusable_buffers++;
    4075        27494 :             if (++num_written >= bgwriter_lru_maxpages)
    4076              :             {
    4077            1 :                 PendingBgWriterStats.maxwritten_clean++;
    4078            1 :                 break;
    4079              :             }
    4080              :         }
    4081      2082915 :         else if (sync_state & BUF_REUSABLE)
    4082      1640266 :             reusable_buffers++;
    4083              :     }
    4084              : 
    4085        14976 :     PendingBgWriterStats.buf_written_clean += num_written;
    4086              : 
    4087              : #ifdef BGW_DEBUG
    4088              :     elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
    4089              :          recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
    4090              :          smoothed_density, reusable_buffers_est, upcoming_alloc_est,
    4091              :          bufs_to_lap - num_to_scan,
    4092              :          num_written,
    4093              :          reusable_buffers - reusable_buffers_est);
    4094              : #endif
    4095              : 
    4096              :     /*
    4097              :      * Consider the above scan as being like a new allocation scan.
    4098              :      * Characterize its density and update the smoothed one based on it. This
    4099              :      * effectively halves the moving average period in cases where both the
    4100              :      * strategy and the background writer are doing some useful scanning,
    4101              :      * which is helpful because a long memory isn't as desirable on the
    4102              :      * density estimates.
    4103              :      */
    4104        14976 :     new_strategy_delta = bufs_to_lap - num_to_scan;
    4105        14976 :     new_recent_alloc = reusable_buffers - reusable_buffers_est;
    4106        14976 :     if (new_strategy_delta > 0 && new_recent_alloc > 0)
    4107              :     {
    4108        12838 :         scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
    4109        12838 :         smoothed_density += (scans_per_alloc - smoothed_density) /
    4110              :             smoothing_samples;
    4111              : 
    4112              : #ifdef BGW_DEBUG
    4113              :         elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
    4114              :              new_recent_alloc, new_strategy_delta,
    4115              :              scans_per_alloc, smoothed_density);
    4116              : #endif
    4117              :     }
    4118              : 
    4119              :     /* Return true if OK to hibernate */
    4120        14976 :     return (bufs_to_lap == 0 && recent_alloc == 0);
    4121              : }
    4122              : 
    4123              : /*
    4124              :  * SyncOneBuffer -- process a single buffer during syncing.
    4125              :  *
    4126              :  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
    4127              :  * buffers marked recently used, as these are not replacement candidates.
    4128              :  *
    4129              :  * Returns a bitmask containing the following flag bits:
    4130              :  *  BUF_WRITTEN: we wrote the buffer.
    4131              :  *  BUF_REUSABLE: buffer is available for replacement, ie, it has
    4132              :  *      pin count 0 and usage count 0.
    4133              :  *
    4134              :  * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
    4135              :  * after locking it, but we don't care all that much.)
    4136              :  */
    4137              : static int
    4138      2428932 : SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
    4139              : {
    4140      2428932 :     BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
    4141      2428932 :     int         result = 0;
    4142              :     uint64      buf_state;
    4143              :     BufferTag   tag;
    4144              : 
    4145              :     /* Make sure we can handle the pin */
    4146      2428932 :     ReservePrivateRefCountEntry();
    4147      2428932 :     ResourceOwnerEnlarge(CurrentResourceOwner);
    4148              : 
    4149              :     /*
    4150              :      * Check whether buffer needs writing.
    4151              :      *
    4152              :      * We can make this check without taking the buffer content lock so long
    4153              :      * as we mark pages dirty in access methods *before* logging changes with
    4154              :      * XLogInsert(): if someone marks the buffer dirty just after our check we
    4155              :      * don't worry because our checkpoint.redo points before the log record for
    4156              :      * upcoming changes, so we are not required to write such a dirty buffer.
    4157              :      */
    4158      2428932 :     buf_state = LockBufHdr(bufHdr);
    4159              : 
    4160      2428932 :     if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
    4161      2425527 :         BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
    4162              :     {
    4163      1669941 :         result |= BUF_REUSABLE;
    4164              :     }
    4165       758991 :     else if (skip_recently_used)
    4166              :     {
    4167              :         /* Caller told us not to write recently-used buffers */
    4168       442649 :         UnlockBufHdr(bufHdr);
    4169       442649 :         return result;
    4170              :     }
    4171              : 
    4172      1986283 :     if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
    4173              :     {
    4174              :         /* It's clean, so nothing to do */
    4175      1640267 :         UnlockBufHdr(bufHdr);
    4176      1640267 :         return result;
    4177              :     }
    4178              : 
    4179              :     /*
    4180              :      * Pin it, share-exclusive-lock it, write it.  (FlushBuffer will do
    4181              :      * nothing if the buffer is clean by the time we've locked it.)
    4182              :      */
    4183       346016 :     PinBuffer_Locked(bufHdr);
    4184              : 
    4185       346016 :     FlushUnlockedBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    4186              : 
    4187       346016 :     tag = bufHdr->tag;
    4188              : 
    4189       346016 :     UnpinBuffer(bufHdr);
    4190              : 
    4191              :     /*
    4192              :      * SyncOneBuffer() is only called by checkpointer and bgwriter, so
    4193              :      * IOContext will always be IOCONTEXT_NORMAL.
    4194              :      */
    4195       346016 :     ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag);
    4196              : 
    4197       346016 :     return result | BUF_WRITTEN;
    4198              : }
    4199              : 
    4200              : /*
    4201              :  *      AtEOXact_Buffers - clean up at end of transaction.
    4202              :  *
    4203              :  *      As of PostgreSQL 8.0, buffer pins should get released by the
    4204              :  *      ResourceOwner mechanism.  This routine is just a debugging
    4205              :  *      cross-check that no pins remain.
    4206              :  */
    4207              : void
    4208       657673 : AtEOXact_Buffers(bool isCommit)
    4209              : {
    4210       657673 :     CheckForBufferLeaks();
    4211              : 
    4212       657673 :     AtEOXact_LocalBuffers(isCommit);
    4213              : 
    4214              :     Assert(PrivateRefCountOverflowed == 0);
    4215       657673 : }
    4216              : 
    4217              : /*
    4218              :  * Initialize access to shared buffer pool
    4219              :  *
    4220              :  * This is called during backend startup (whether standalone or under the
    4221              :  * postmaster).  It sets up for this backend's access to the already-existing
    4222              :  * buffer pool.
    4223              :  */
    4224              : void
    4225        24575 : InitBufferManagerAccess(void)
    4226              : {
    4227              :     /*
    4228              :      * An advisory limit on the number of pins each backend should hold, based
    4229              :      * on shared_buffers and the maximum number of connections possible.
    4230              :      * That's very pessimistic, but outside toy-sized shared_buffers it should
    4231              :      * allow plenty of pins.  LimitAdditionalPins() and
    4232              :      * GetAdditionalPinLimit() can be used to check the remaining balance.
    4233              :      */
    4234        24575 :     MaxProportionalPins = NBuffers / (MaxBackends + NUM_AUXILIARY_PROCS);
    4235              : 
    4236        24575 :     memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
    4237        24575 :     memset(&PrivateRefCountArrayKeys, 0, sizeof(PrivateRefCountArrayKeys));
    4238              : 
    4239        24575 :     PrivateRefCountHash = refcount_create(CurrentMemoryContext, 100, NULL);
    4240              : 
    4241              :     /*
    4242              :      * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
    4243              :      * the corresponding phase of backend shutdown.
    4244              :      */
    4245              :     Assert(MyProc != NULL);
    4246        24575 :     on_shmem_exit(AtProcExit_Buffers, 0);
    4247        24575 : }
    4248              : 
    4249              : /*
    4250              :  * During backend exit, ensure that we released all shared-buffer locks and
    4251              :  * assert that we have no remaining pins.
    4252              :  */
    4253              : static void
    4254        24575 : AtProcExit_Buffers(int code, Datum arg)
    4255              : {
    4256        24575 :     UnlockBuffers();
    4257              : 
    4258        24575 :     CheckForBufferLeaks();
    4259              : 
    4260              :     /* localbuf.c needs a chance too */
    4261        24575 :     AtProcExit_LocalBuffers();
    4262        24575 : }
    4263              : 
    4264              : /*
    4265              :  *      CheckForBufferLeaks - ensure this backend holds no buffer pins
    4266              :  *
    4267              :  *      As of PostgreSQL 8.0, buffer pins should get released by the
    4268              :  *      ResourceOwner mechanism.  This routine is just a debugging
    4269              :  *      cross-check that no pins remain.
    4270              :  */
    4271              : static void
    4272       682248 : CheckForBufferLeaks(void)
    4273              : {
    4274              : #ifdef USE_ASSERT_CHECKING      /* the whole check compiles away otherwise */
    4275              :     int         RefCountErrors = 0; /* leaked pins reported so far */
    4276              :     PrivateRefCountEntry *res;
    4277              :     int         i;
    4278              :     char       *s;              /* description string, freed after each warning */
    4279              : 
    4280              :     /* check the array: every slot whose key is not InvalidBuffer is reported */
    4281              :     for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
    4282              :     {
    4283              :         if (PrivateRefCountArrayKeys[i] != InvalidBuffer)
    4284              :         {
    4285              :             res = &PrivateRefCountArray[i];
    4286              :             /* warn about the leak, then release the message string */
    4287              :             s = DebugPrintBufferRefcount(res->buffer);
    4288              :             elog(WARNING, "buffer refcount leak: %s", s);
    4289              :             pfree(s);
    4290              : 
    4291              :             RefCountErrors++;
    4292              :         }
    4293              :     }
    4294              : 
    4295              :     /* the hash is searched only when PrivateRefCountOverflowed is set */
    4296              :     if (PrivateRefCountOverflowed)
    4297              :     {
    4298              :         refcount_iterator iter;
    4299              : 
    4300              :         refcount_start_iterate(PrivateRefCountHash, &iter);
    4301              :         while ((res = refcount_iterate(PrivateRefCountHash, &iter)) != NULL)
    4302              :         {
    4303              :             s = DebugPrintBufferRefcount(res->buffer);
    4304              :             elog(WARNING, "buffer refcount leak: %s", s);
    4305              :             pfree(s);
    4306              :             RefCountErrors++;
    4307              :         }
    4308              :     }
    4309              :     /* fail if any leak was reported by either loop above */
    4310              :     Assert(RefCountErrors == 0);
    4311              : #endif
    4312       682248 : }
    4313              : 
    4314              : #ifdef USE_ASSERT_CHECKING
    4315              : /*
    4316              :  * Check for exclusive-locked catalog buffers.  This is the core of
    4317              :  * AssertCouldGetRelation().
    4318              :  *
    4319              :  * A backend would self-deadlock on the content lock if the catalog scan read
    4320              :  * the exclusive-locked buffer.  The main threat is exclusive-locked buffers
    4321              :  * of catalogs used in relcache, because a catcache search on any catalog may
    4322              :  * build that catalog's relcache entry.  We don't have an inventory of
    4323              :  * catalogs relcache uses, so just check buffers of most catalogs.
    4324              :  *
    4325              :  * It's better to minimize waits while holding an exclusive buffer lock, so it
    4326              :  * would be nice to broaden this check not to be catalog-specific.  However,
    4327              :  * bttextcmp() accesses pg_collation, and non-core opclasses might similarly
    4328              :  * read tables.  That is deadlock-free as long as there's no loop in the
    4329              :  * dependency graph: modifying table A may cause an opclass to read table B,
    4330              :  * but it must not cause a read of table A.
    4331              :  */
    4332              : void
    4333              : AssertBufferLocksPermitCatalogRead(void)
    4334              : {
    4335              :     PrivateRefCountEntry *res;
    4336              : 
    4337              :     /* check the array, visiting only slots whose key is not InvalidBuffer */
    4338              :     for (int i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
    4339              :     {
    4340              :         if (PrivateRefCountArrayKeys[i] != InvalidBuffer)
    4341              :         {
    4342              :             res = &PrivateRefCountArray[i];
    4343              :             /* skip a keyed slot whose entry records no buffer */
    4344              :             if (res->buffer == InvalidBuffer)
    4345              :                 continue;
    4346              : 
    4347              :             AssertNotCatalogBufferLock(res->buffer, res->data.lockmode);
    4348              :         }
    4349              :     }
    4350              : 
    4351              :     /* the hash is searched only when PrivateRefCountOverflowed is set */
    4352              :     if (PrivateRefCountOverflowed)
    4353              :     {
    4354              :         refcount_iterator iter;
    4355              : 
    4356              :         refcount_start_iterate(PrivateRefCountHash, &iter);
    4357              :         while ((res = refcount_iterate(PrivateRefCountHash, &iter)) != NULL)
    4358              :         {
    4359              :             AssertNotCatalogBufferLock(res->buffer, res->data.lockmode);
    4360              :         }
    4361              :     }
    4362              : }
    4363              : /* Assert that an exclusively locked buffer does not belong to a catalog. */
    4364              : static void
    4365              : AssertNotCatalogBufferLock(Buffer buffer, BufferLockMode mode)
    4366              : {
    4367              :     BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
    4368              :     BufferTag   tag;
    4369              :     Oid         relid;
    4370              :     /* nothing to check unless the lock mode is exclusive */
    4371              :     if (mode != BUFFER_LOCK_EXCLUSIVE)
    4372              :         return;
    4373              : 
    4374              :     tag = bufHdr->tag;
    4375              : 
    4376              :     /*
    4377              :      * This relNumber==relid assumption holds until a catalog experiences
    4378              :      * VACUUM FULL or similar.  After a command like that, relNumber will be
    4379              :      * in the normal (non-catalog) range, and we lose the ability to detect
    4380              :      * hazardous access to that catalog.  Calling RelidByRelfilenumber() would
    4381              :      * close that gap, but RelidByRelfilenumber() might then deadlock with a
    4382              :      * held lock.
    4383              :      */
    4384              :     relid = tag.relNumber;
    4385              : 
    4386              :     if (IsCatalogTextUniqueIndexOid(relid)) /* see comments at the callee */
    4387              :         return;
    4388              :     /* any other catalog relation OID is an assertion failure */
    4389              :     Assert(!IsCatalogRelationOid(relid));
    4390              : }
    4391              : #endif
    4392              : 
    4393              : 
    4394              : /*
    4395              :  * Describe a buffer (path, block, flags, refcounts) for pin-leak warnings
    4396              :  */
    4397              : char *
    4398           46 : DebugPrintBufferRefcount(Buffer buffer)
    4399              : {
    4400              :     BufferDesc *buf;
    4401              :     int32       loccount;       /* LocalRefCount or private refcount value */
    4402              :     char       *result;
    4403              :     ProcNumber  backend;        /* MyProcNumber if local, else INVALID_PROC_NUMBER */
    4404              :     uint64      buf_state;
    4405              : 
    4406              :     Assert(BufferIsValid(buffer));
    4407           46 :     if (BufferIsLocal(buffer))
    4408              :     {
    4409           16 :         buf = GetLocalBufferDescriptor(-buffer - 1);
    4410           16 :         loccount = LocalRefCount[-buffer - 1];
    4411           16 :         backend = MyProcNumber;
    4412              :     }
    4413              :     else
    4414              :     {
    4415           30 :         buf = GetBufferDescriptor(buffer - 1);
    4416           30 :         loccount = GetPrivateRefCount(buffer);
    4417           30 :         backend = INVALID_PROC_NUMBER;
    4418              :     }
    4419              : 
    4420              :     /* unlocked read of the state; theoretically we should lock the bufHdr here */
    4421           46 :     buf_state = pg_atomic_read_u64(&buf->state);
    4422              :     /* "refcount=" prints the count taken from buf_state, then loccount */
    4423           46 :     result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%" PRIx64 ", refcount=%u %d)",
    4424              :                       buffer,
    4425           46 :                       relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
    4426              :                                      BufTagGetForkNum(&buf->tag)).str,
    4427              :                       buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
    4428              :                       BUF_STATE_GET_REFCOUNT(buf_state), loccount);
    4429           46 :     return result;
    4430              : }
    4431              : 
    4432              : /*
    4433              :  * CheckPointBuffers
    4434              :  *
    4435              :  * Flush all dirty blocks in buffer pool to disk at checkpoint time.
    4436              :  *
    4437              :  * Note: temporary relations do not participate in checkpoints, so they don't
    4438              :  * need to be flushed.
    4439              :  */
    4440              : void
    4441         1948 : CheckPointBuffers(int flags)
    4442              : {
    4443         1948 :     BufferSync(flags);          /* flags are passed through unchanged */
    4444         1948 : }
    4445              : 
    4446              : /*
    4447              :  * BufferGetBlockNumber
    4448              :  *      Returns the block number associated with a buffer.
    4449              :  *
    4450              :  * Note:
    4451              :  *      Assumes that the buffer is valid and pinned (the pin is asserted),
    4452              :  *      else the value may be obsolete immediately...
    4453              :  */
    4454              : BlockNumber
    4455     80291147 : BufferGetBlockNumber(Buffer buffer)
    4456              : {
    4457              :     BufferDesc *bufHdr;
    4458              : 
    4459              :     Assert(BufferIsPinned(buffer));
    4460              :     /* local buffers are looked up by -buffer - 1, shared ones by buffer - 1 */
    4461     80291147 :     if (BufferIsLocal(buffer))
    4462      2561919 :         bufHdr = GetLocalBufferDescriptor(-buffer - 1);
    4463              :     else
    4464     77729228 :         bufHdr = GetBufferDescriptor(buffer - 1);
    4465              : 
    4466              :     /* pinned, so OK to read tag without spinlock */
    4467     80291147 :     return bufHdr->tag.blockNum;
    4468              : }
    4469              : 
    4470              : /*
    4471              :  * BufferGetTag
    4472              :  *      Returns, through the three output pointers, the relfilelocator, fork
    4473              :  *      number and block number associated with a buffer.
    4474              :  */
    4475              : void
    4476     27482901 : BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum,
    4477              :              BlockNumber *blknum)
    4478              : {
    4479              :     BufferDesc *bufHdr;
    4480              : 
    4481              :     /* Do the same checks as BufferGetBlockNumber. */
    4482              :     Assert(BufferIsPinned(buffer));
    4483              :     /* local buffers are looked up by -buffer - 1, shared ones by buffer - 1 */
    4484     27482901 :     if (BufferIsLocal(buffer))
    4485            0 :         bufHdr = GetLocalBufferDescriptor(-buffer - 1);
    4486              :     else
    4487     27482901 :         bufHdr = GetBufferDescriptor(buffer - 1);
    4488              : 
    4489              :     /* pinned, so OK to read tag without spinlock */
    4490     27482901 :     *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
    4491     27482901 :     *forknum = BufTagGetForkNum(&bufHdr->tag);
    4492     27482901 :     *blknum = bufHdr->tag.blockNum;
    4493     27482901 : }
    4494              : 
    4495              : /*
    4496              :  * FlushBuffer
    4497              :  *      Physically write out a shared buffer.
    4498              :  *
    4499              :  * NOTE: this actually just passes the buffer contents to the kernel; the
    4500              :  * real write to disk won't happen until the kernel feels like it.  This
    4501              :  * is okay from our point of view since we can redo the changes from WAL.
    4502              :  * However, we will need to force the changes to disk via fsync before
    4503              :  * we can checkpoint WAL.
    4504              :  *
    4505              :  * The caller must hold a pin on the buffer and have
    4506              :  * (share-)exclusively-locked the buffer contents.
    4507              :  *
    4508              :  * If the caller has an smgr reference for the buffer's relation, pass it
    4509              :  * as the second parameter.  If not, pass NULL.
    4510              :  */
    4511              : static void
    4512       702104 : FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
    4513              :             IOContext io_context)
    4514              : {
    4515              :     XLogRecPtr  recptr;
    4516              :     ErrorContextCallback errcallback;
    4517              :     instr_time  io_start;
    4518              :     Block       bufBlock;
    4519              :     /* io_object and io_context only label the I/O statistics recorded below */
    4520              :     Assert(BufferLockHeldByMeInMode(buf, BUFFER_LOCK_EXCLUSIVE) ||
    4521              :            BufferLockHeldByMeInMode(buf, BUFFER_LOCK_SHARE_EXCLUSIVE));
    4522              : 
    4523              :     /*
    4524              :      * Try to start an I/O operation.  If StartSharedBufferIO reports
    4525              :      * BUFFER_IO_ALREADY_DONE, someone else flushed the buffer before we
    4526              :      * could, so we need not do anything.
    4527              :      */
    4528       702104 :     if (StartSharedBufferIO(buf, false, true, NULL) == BUFFER_IO_ALREADY_DONE)
    4529           15 :         return;
    4530              : 
    4531              :     /* Setup error traceback support for ereport() */
    4532       702089 :     errcallback.callback = shared_buffer_write_error_callback;
    4533       702089 :     errcallback.arg = buf;
    4534       702089 :     errcallback.previous = error_context_stack;
    4535       702089 :     error_context_stack = &errcallback;
    4536              : 
    4537              :     /* Find smgr relation for buffer, unless the caller supplied one */
    4538       702089 :     if (reln == NULL)
    4539       699643 :         reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
    4540              : 
    4541              :     TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
    4542              :                                         buf->tag.blockNum,
    4543              :                                         reln->smgr_rlocator.locator.spcOid,
    4544              :                                         reln->smgr_rlocator.locator.dbOid,
    4545              :                                         reln->smgr_rlocator.locator.relNumber);
    4546              : 
    4547              :     /*
    4548              :      * As we hold at least a share-exclusive lock on the buffer, the LSN
    4549              :      * cannot change during the flush (and thus can't be torn).
    4550              :      */
    4551       702089 :     recptr = BufferGetLSN(buf);
    4552              : 
    4553              :     /*
    4554              :      * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
    4555              :      * rule that log updates must hit disk before any of the data-file changes
    4556              :      * they describe do.
    4557              :      *
    4558              :      * However, this rule does not apply to unlogged relations, which will be
    4559              :      * lost after a crash anyway.  Most unlogged relation pages do not bear
    4560              :      * LSNs since we never emit WAL records for them, and therefore flushing
    4561              :      * up through the buffer LSN would be useless, but harmless.  However,
    4562              :      * some index AMs use LSNs internally to detect concurrent page
    4563              :      * modifications, and therefore unlogged index pages bear "fake" LSNs
    4564              :      * generated by XLogGetFakeLSN.  It is unlikely but possible that the fake
    4565              :      * LSN counter could advance past the WAL insertion point; and if it did
    4566              :      * happen, attempting to flush WAL through that location would fail, with
    4567              :      * disastrous system-wide consequences.  To make sure that can't happen,
    4568              :      * skip the flush if the buffer isn't permanent.
    4569              :      */
    4570       702089 :     if (pg_atomic_read_u64(&buf->state) & BM_PERMANENT)
    4571       700204 :         XLogFlush(recptr);
    4572              : 
    4573              :     /*
    4574              :      * Now it's safe to write the buffer to disk. Note that no one else should
    4575              :      * have been able to write it, while we were busy with log flushing,
    4576              :      * because we got the exclusive right to perform I/O by setting the
    4577              :      * BM_IO_IN_PROGRESS bit.
    4578              :      */
    4579       702089 :     bufBlock = BufHdrGetBlock(buf);
    4580              : 
    4581              :     /* Update page checksum if desired. */
    4582       702089 :     PageSetChecksum((Page) bufBlock, buf->tag.blockNum);
    4583              : 
    4584       702089 :     io_start = pgstat_prepare_io_time(track_io_timing);
    4585              : 
    4586       702089 :     smgrwrite(reln,
    4587       702089 :               BufTagGetForkNum(&buf->tag),
    4588              :               buf->tag.blockNum,
    4589              :               bufBlock,
    4590              :               false);
    4591              : 
    4592              :     /*
    4593              :      * When a strategy is in use, only flushes of dirty buffers already in the
    4594              :      * strategy ring are counted as strategy writes (IOCONTEXT
    4595              :      * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
    4596              :      * statistics tracking.
    4597              :      *
    4598              :      * If a shared buffer initially added to the ring must be flushed before
    4599              :      * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
    4600              :      *
    4601              :      * If a shared buffer which was added to the ring later because the
    4602              :      * current strategy buffer is pinned or in use or because all strategy
    4603              :      * buffers were dirty and rejected (for BAS_BULKREAD operations only)
    4604              :      * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
    4605              :      * (from_ring will be false).
    4606              :      *
    4607              :      * When a strategy is not in use, the write can only be a "regular" write
    4608              :      * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
    4609              :      */
    4610       702089 :     pgstat_count_io_op_time(io_object, io_context,
    4611              :                             IOOP_WRITE, io_start, 1, BLCKSZ);
    4612              : 
    4613       702089 :     pgBufferUsage.shared_blks_written++;
    4614              : 
    4615              :     /*
    4616              :      * Mark the buffer as clean and end the BM_IO_IN_PROGRESS state.
    4617              :      */
    4618       702089 :     TerminateBufferIO(buf, true, 0, true, false);
    4619              : 
    4620              :     TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
    4621              :                                        buf->tag.blockNum,
    4622              :                                        reln->smgr_rlocator.locator.spcOid,
    4623              :                                        reln->smgr_rlocator.locator.dbOid,
    4624              :                                        reln->smgr_rlocator.locator.relNumber);
    4625              : 
    4626              :     /* Pop the error context stack */
    4627       702089 :     error_context_stack = errcallback.previous;
    4628              : }
    4629              : 
    4630              : /*
    4631              :  * Convenience wrapper around FlushBuffer() that acquires the buffer lock in
    4632              :  * share-exclusive mode before the call and releases it afterwards.
    4633              :  */
    4634              : static void
    4635       349592 : FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
    4636              :                     IOObject io_object, IOContext io_context)
    4637              : {
    4638       349592 :     Buffer      buffer = BufferDescriptorGetBuffer(buf);
    4639              : 
    4640       349592 :     BufferLockAcquire(buffer, buf, BUFFER_LOCK_SHARE_EXCLUSIVE);
    4641       349592 :     FlushBuffer(buf, reln, io_object, io_context);
    4642       349592 :     BufferLockUnlock(buffer, buf);
    4643       349592 : }
    4644              : 
    4645              : /*
    4646              :  * RelationGetNumberOfBlocksInFork
    4647              :  *      Determines the current number of pages in the specified relation fork.
    4648              :  *
    4649              :  * Note that the accuracy of the result will depend on the details of the
    4650              :  * relation's storage. For builtin AMs it'll be accurate, but for external AMs
    4651              :  * it might not be.
    4652              :  */
    4653              : BlockNumber
    4654      2627292 : RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
    4655              : {
    4656      2627292 :     if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
    4657              :     {
    4658              :         /*
    4659              :          * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
    4660              :          * tableam returns the size in bytes - but for the purpose of this
    4661              :          * routine, we want the number of blocks. Therefore divide, rounding
    4662              :          * up.
    4663              :          */
    4664              :         uint64      szbytes;
    4665              : 
    4666      1938238 :         szbytes = table_relation_size(relation, forkNum);
    4667              : 
    4668      1938219 :         return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
    4669              :     }
    4670       689054 :     else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
    4671              :     {
    4672       689054 :         return smgrnblocks(RelationGetSmgr(relation), forkNum);
    4673              :     }
    4674              :     else
    4675              :         Assert(false);
    4676              :     /* only reached for a relkind with neither a table AM nor storage */
    4677            0 :     return 0;                   /* keep compiler quiet */
    4678              : }
    4679              : 
    4680              : /*
    4681              :  * BufferIsPermanent
    4682              :  *      Determines whether a buffer will potentially still be around after
    4683              :  *      a crash.  Caller must hold a buffer pin.
    4684              :  */
    4685              : bool
    4686     17213273 : BufferIsPermanent(Buffer buffer)
    4687              : {
    4688              :     BufferDesc *bufHdr;
    4689              : 
    4690              :     /* Local buffers are used only for temp relations, so answer "no" early. */
    4691     17213273 :     if (BufferIsLocal(buffer))
    4692       828267 :         return false;
    4693              : 
    4694              :     /* Make sure we've got a real buffer, and that we hold a pin on it. */
    4695              :     Assert(BufferIsValid(buffer));
    4696              :     Assert(BufferIsPinned(buffer));
    4697              : 
    4698              :     /*
    4699              :      * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
    4700              :      * need not bother with the buffer header spinlock.  Even if someone else
    4701              :      * changes the buffer header state while we're doing this, the state is
    4702              :      * changed atomically, so we'll read the old value or the new value, but
    4703              :      * not random garbage.
    4704              :      */
    4705     16385006 :     bufHdr = GetBufferDescriptor(buffer - 1);
    4706     16385006 :     return (pg_atomic_read_u64(&bufHdr->state) & BM_PERMANENT) != 0;
    4707              : }
    4708              : 
    4709              : /*
    4710              :  * BufferGetLSNAtomic
    4711              :  *      Retrieves the LSN of the buffer atomically.
    4712              :  *
    4713              :  * This is necessary for some callers who may only hold a share lock on
    4714              :  * the buffer. A share lock allows a concurrent backend to set hint bits
    4715              :  * on the page, which in turn may require a WAL record to be emitted.
    4716              :  *
    4717              :  * On platforms with 8 byte atomic reads/writes, we don't need to do any
    4718              :  * additional locking. On platforms not supporting such 8 byte atomic
    4719              :  * reads/writes, we need to actually take the header lock.
    4720              :  */
    4721              : XLogRecPtr
    4722      8888023 : BufferGetLSNAtomic(Buffer buffer)
    4723              : {
    4724              :     /* Make sure we've got a real buffer, and that we hold a pin on it. */
    4725              :     Assert(BufferIsValid(buffer));
    4726              :     Assert(BufferIsPinned(buffer));
    4727              : 
    4728              : #ifdef PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
    4729      8888023 :     return PageGetLSN(BufferGetPage(buffer));
    4730              : #else                           /* no 8-byte single-copy atomicity */
    4731              :     {
    4732              :         char       *page = BufferGetPage(buffer);
    4733              :         BufferDesc *bufHdr;
    4734              :         XLogRecPtr  lsn;
    4735              : 
    4736              :         /*
    4737              :          * If we don't need locking for correctness, fastpath out.
    4738              :          */
    4739              :         if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
    4740              :             return PageGetLSN(page);
    4741              :         /* otherwise read the LSN while holding the buffer header lock */
    4742              :         bufHdr = GetBufferDescriptor(buffer - 1);
    4743              :         LockBufHdr(bufHdr);
    4744              :         lsn = PageGetLSN(page);
    4745              :         UnlockBufHdr(bufHdr);
    4746              : 
    4747              :         return lsn;
    4748              :     }
    4749              : #endif                          /* PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY */
    4750              : }
    4751              : 
    4752              : /* ---------------------------------------------------------------------
    4753              :  *      DropRelationBuffers
    4754              :  *
    4755              :  *      This function removes from the buffer pool all the pages of the
    4756              :  *      specified relation forks that have block numbers >= firstDelBlock.
    4757              :  *      (In particular, with firstDelBlock = 0, all pages are removed.)
    4758              :  *      Dirty pages are simply dropped, without bothering to write them
    4759              :  *      out first.  Therefore, this is NOT rollback-able, and so should be
    4760              :  *      used only with extreme caution!
    4761              :  *
    4762              :  *      Currently, this is called only from smgr.c when the underlying file
    4763              :  *      is about to be deleted or truncated (firstDelBlock is needed for
    4764              :  *      the truncation case).  The data in the affected pages would therefore
    4765              :  *      be deleted momentarily anyway, and there is no point in writing it.
    4766              :  *      It is the responsibility of higher-level code to ensure that the
    4767              :  *      deletion or truncation does not lose any data that could be needed
    4768              :  *      later.  It is also the responsibility of higher-level code to ensure
    4769              :  *      that no other process could be trying to load more pages of the
    4770              :  *      relation into buffers.
    4771              :  * --------------------------------------------------------------------
    4772              :  */
    4773              : void
    4774          794 : DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
    4775              :                     int nforks, BlockNumber *firstDelBlock)
    4776              : {
    4777              :     int         i;
    4778              :     int         j;
    4779              :     RelFileLocatorBackend rlocator;
    4780              :     BlockNumber nForkBlock[MAX_FORKNUM];
    4781          794 :     uint64      nBlocksToInvalidate = 0;
    4782              : 
    4783          794 :     rlocator = smgr_reln->smgr_rlocator;
    4784              : 
    4785              :     /* If it's a local relation, it's localbuf.c's problem. */
    4786          794 :     if (RelFileLocatorBackendIsTemp(rlocator))
    4787              :     {
    4788          498 :         if (rlocator.backend == MyProcNumber)
    4789          498 :             DropRelationLocalBuffers(rlocator.locator, forkNum, nforks,
    4790              :                                      firstDelBlock);
    4791              : 
    4792          536 :         return;
    4793              :     }
    4794              : 
    4795              :     /*
    4796              :      * To remove all the pages of the specified relation forks from the buffer
    4797              :      * pool, we need to scan the entire buffer pool but we can optimize it by
    4798              :      * finding the buffers from BufMapping table provided we know the exact
    4799              :      * size of each fork of the relation. The exact size is required to ensure
    4800              :      * that we don't leave any buffer for the relation being dropped as
    4801              :      * otherwise the background writer or checkpointer can lead to a PANIC
    4802              :      * error while flushing buffers corresponding to files that don't exist.
    4803              :      *
    4804              :      * To know the exact size, we rely on the size cached for each fork by us
    4805              :      * during recovery which limits the optimization to recovery and on
    4806              :      * standbys but we can easily extend it once we have shared cache for
    4807              :      * relation size.
    4808              :      *
    4809              :      * In recovery, we cache the value returned by the first lseek(SEEK_END)
    4810              :      * and future writes keep the cached value up-to-date. See
    4811              :      * smgrextend. It is possible that the value of the first lseek is smaller
    4812              :      * than the actual number of existing blocks in the file due to buggy
    4813              :      * Linux kernels that might not have accounted for the recent write. But
    4814              :      * that should be fine because there must not be any buffers after that
    4815              :      * file size.
    4816              :      */
    4817          396 :     for (i = 0; i < nforks; i++)
    4818              :     {
    4819              :         /* Get the number of blocks for a relation's fork */
    4820          344 :         nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
    4821              : 
    4822          344 :         if (nForkBlock[i] == InvalidBlockNumber)
    4823              :         {
    4824          244 :             nBlocksToInvalidate = InvalidBlockNumber;
    4825          244 :             break;
    4826              :         }
    4827              : 
    4828              :         /* calculate the number of blocks to be invalidated */
    4829          100 :         nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
    4830              :     }
    4831              : 
    4832              :     /*
    4833              :      * We apply the optimization iff the total number of blocks to invalidate
    4834              :      * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
    4835              :      */
    4836          296 :     if (BlockNumberIsValid(nBlocksToInvalidate) &&
    4837           52 :         nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
    4838              :     {
    4839          104 :         for (j = 0; j < nforks; j++)
    4840           66 :             FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
    4841           66 :                                        nForkBlock[j], firstDelBlock[j]);
    4842           38 :         return;
    4843              :     }
    4844              : 
    4845      3447042 :     for (i = 0; i < NBuffers; i++)
    4846              :     {
    4847      3446784 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
    4848              : 
    4849              :         /*
    4850              :          * We can make this a tad faster by prechecking the buffer tag before
    4851              :          * we attempt to lock the buffer; this saves a lot of lock
    4852              :          * acquisitions in typical cases.  It should be safe because the
    4853              :          * caller must have AccessExclusiveLock on the relation, or some other
    4854              :          * reason to be certain that no one is loading new pages of the rel
    4855              :          * into the buffer pool.  (Otherwise we might well miss such pages
    4856              :          * entirely.)  Therefore, while the tag might be changing while we
    4857              :          * look at it, it can't be changing *to* a value we care about, only
    4858              :          * *away* from such a value.  So false negatives are impossible, and
    4859              :          * false positives are safe because we'll recheck after getting the
    4860              :          * buffer lock.
    4861              :          *
    4862              :          * We could check forkNum and blockNum as well as the rlocator, but
    4863              :          * the incremental win from doing so seems small.
    4864              :          */
    4865      3446784 :         if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
    4866      3438158 :             continue;
    4867              : 
    4868         8626 :         LockBufHdr(bufHdr);
    4869              : 
    4870        21821 :         for (j = 0; j < nforks; j++)
    4871              :         {
    4872        15396 :             if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
    4873        15396 :                 BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
    4874         8524 :                 bufHdr->tag.blockNum >= firstDelBlock[j])
    4875              :             {
    4876         2201 :                 InvalidateBuffer(bufHdr);   /* releases spinlock */
    4877         2201 :                 break;
    4878              :             }
    4879              :         }
    4880         8626 :         if (j >= nforks)
    4881         6425 :             UnlockBufHdr(bufHdr);
    4882              :     }
    4883              : }
    4884              : 
    4885              : /* ---------------------------------------------------------------------
    4886              :  *      DropRelationsAllBuffers
    4887              :  *
    4888              :  *      This function removes from the buffer pool all the pages of all
    4889              :  *      forks of the relations in smgr_reln[].  It's equivalent to calling
    4890              :  *      DropRelationBuffers once per fork per relation with firstDelBlock = 0.
    4891              :  *      --------------------------------------------------------------------
    4892              :  */
    4893              : void
    4894        18020 : DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
    4895              : {
    4896              :     int         i;
    4897        18020 :     int         n = 0;          /* number of entries stored in rels[] */
    4898              :     SMgrRelation *rels;
    4899              :     BlockNumber (*block)[MAX_FORKNUM + 1];  /* cached size of each fork */
    4900        18020 :     uint64      nBlocksToInvalidate = 0;
    4901              :     RelFileLocator *locators;
    4902        18020 :     bool        cached = true;  /* false once an existing fork's size is unknown */
    4903              :     bool        use_bsearch;
    4904              : 
    4905        18020 :     if (nlocators == 0)
    4906            0 :         return;
    4907              : 
    4908        18020 :     rels = palloc_array(SMgrRelation, nlocators);   /* non-local relations */
    4909              : 
    4910              :     /* If it's a local relation, it's localbuf.c's problem. */
    4911        78991 :     for (i = 0; i < nlocators; i++)
    4912              :     {
    4913        60971 :         if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
    4914              :         {
    4915         4422 :             if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)   /* not ours: skip */
    4916         4420 :                 DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
    4917              :         }
    4918              :         else
    4919        56549 :             rels[n++] = smgr_reln[i];
    4920              :     }
    4921              : 
    4922              :     /*
    4923              :      * If there are no non-local relations, then we're done. Release the
    4924              :      * memory and return.
    4925              :      */
    4926        18020 :     if (n == 0)
    4927              :     {
    4928         1182 :         pfree(rels);
    4929         1182 :         return;
    4930              :     }
    4931              : 
    4932              :     /*
    4933              :      * This is used to remember the number of blocks of every fork of each
    4934              :      * relation.
    4935              :      */
    4936              :     block = (BlockNumber (*)[MAX_FORKNUM + 1])
    4937        16838 :         palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
    4938              : 
    4939              :     /*
    4940              :      * We can avoid scanning the entire buffer pool if we know the exact size
    4941              :      * of each of the given relation forks. See DropRelationBuffers.
    4942              :      */
    4943        35065 :     for (i = 0; i < n && cached; i++)
    4944              :     {
    4945        27261 :         for (int j = 0; j <= MAX_FORKNUM; j++)
    4946              :         {
    4947              :             /* Get the number of blocks for a relation's fork. */
    4948        25020 :             block[i][j] = smgrnblocks_cached(rels[i], j);
    4949              : 
    4950              :             /* We only need to consider the relation forks that exist. */
    4951        25020 :             if (block[i][j] == InvalidBlockNumber)
    4952              :             {
    4953        22594 :                 if (!smgrexists(rels[i], j))
    4954         6608 :                     continue;
    4955        15986 :                 cached = false;
    4956        15986 :                 break;
    4957              :             }
    4958              : 
    4959              :             /* calculate the total number of blocks to be invalidated */
    4960         2426 :             nBlocksToInvalidate += block[i][j];
    4961              :         }
    4962              :     }
    4963              : 
    4964              :     /*
    4965              :      * We apply the optimization iff the total number of blocks to invalidate
    4966              :      * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
    4967              :      */
    4968        16838 :     if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
    4969              :     {
    4970         1406 :         for (i = 0; i < n; i++)
    4971              :         {
    4972         3875 :             for (int j = 0; j <= MAX_FORKNUM; j++)
    4973              :             {
    4974              :                 /* ignore relation forks that don't exist */
    4975         3100 :                 if (!BlockNumberIsValid(block[i][j]))
    4976         2316 :                     continue;
    4977              : 
    4978              :                 /* drop all the buffers for a particular relation fork */
    4979          784 :                 FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
    4980          784 :                                            j, block[i][j], 0);
    4981              :             }
    4982              :         }
    4983              : 
    4984          631 :         pfree(block);
    4985          631 :         pfree(rels);
    4986          631 :         return;
    4987              :     }
    4988              : 
    4989        16207 :     pfree(block);
    4990        16207 :     locators = palloc_array(RelFileLocator, n); /* non-local relations */
    4991        71981 :     for (i = 0; i < n; i++)
    4992        55774 :         locators[i] = rels[i]->smgr_rlocator.locator;
    4993              : 
    4994              :     /*
    4995              :      * For a low number of relations to drop, just use a simple walk through
    4996              :      * to save the bsearch overhead. The threshold to use is a guess rather
    4997              :      * than an exactly determined value, as it depends on many factors (CPU
    4998              :      * and RAM speeds, amount of shared buffers etc.).
    4999              :      */
    5000        16207 :     use_bsearch = n > RELS_BSEARCH_THRESHOLD;
    5001              : 
    5002              :     /* sort the list of rlocators if necessary */
    5003        16207 :     if (use_bsearch)
    5004          218 :         qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
    5005              : 
    5006    189636175 :     for (i = 0; i < NBuffers; i++)
    5007              :     {
    5008    189619968 :         RelFileLocator *rlocator = NULL;
    5009    189619968 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
    5010              : 
    5011              :         /*
    5012              :          * As in DropRelationBuffers, an unlocked precheck should be safe and
    5013              :          * saves some cycles.
    5014              :          */
    5015              : 
    5016    189619968 :         if (!use_bsearch)
    5017              :         {
    5018              :             int         j;
    5019              : 
    5020    751854964 :             for (j = 0; j < n; j++)
    5021              :             {
    5022    564840160 :                 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
    5023              :                 {
    5024       106348 :                     rlocator = &locators[j];
    5025       106348 :                     break;
    5026              :                 }
    5027              :             }
    5028              :         }
    5029              :         else
    5030              :         {
    5031              :             RelFileLocator locator;
    5032              : 
    5033      2498816 :             locator = BufTagGetRelFileLocator(&bufHdr->tag);
    5034      2498816 :             rlocator = bsearch(&locator,
    5035              :                                locators, n, sizeof(RelFileLocator),
    5036              :                                rlocator_comparator);
    5037              :         }
    5038              : 
    5039              :         /* buffer doesn't belong to any of the given relfilelocators; skip it */
    5040    189619968 :         if (rlocator == NULL)
    5041    189511833 :             continue;
    5042              : 
    5043       108135 :         LockBufHdr(bufHdr);
    5044       108135 :         if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))   /* recheck under the header lock */
    5045       108135 :             InvalidateBuffer(bufHdr);   /* releases spinlock */
    5046              :         else
    5047            0 :             UnlockBufHdr(bufHdr);
    5048              :     }
    5049              : 
    5050        16207 :     pfree(locators);
    5051        16207 :     pfree(rels);
    5052              : }
    5053              : 
    5054              : /* ---------------------------------------------------------------------
    5055              :  *      FindAndDropRelationBuffers
    5056              :  *
    5057              :  *      This function looks up each block in [firstDelBlock, nForkBlock) of
    5058              :  *      the specified relation fork in the BufMapping table and removes the
    5059              :  *      pages it finds from the buffer pool. (In particular, with
    5060              :  *      firstDelBlock = 0, all pages below nForkBlock are removed.)
    5061              :  * --------------------------------------------------------------------
    5062              :  */
    5063              : static void
    5064          850 : FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum,
    5065              :                            BlockNumber nForkBlock,
    5066              :                            BlockNumber firstDelBlock)
    5067              : {
    5068              :     BlockNumber curBlock;
    5069              : 
    5070         2044 :     for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
    5071              :     {
    5072              :         uint32      bufHash;    /* hash value for tag */
    5073              :         BufferTag   bufTag;     /* identity of requested block */
    5074              :         LWLock     *bufPartitionLock;   /* buffer partition lock for it */
    5075              :         int         buf_id;
    5076              :         BufferDesc *bufHdr;
    5077              : 
    5078              :         /* create a tag so we can look up the buffer */
    5079         1194 :         InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
    5080              : 
    5081              :         /* determine its hash code and partition lock ID */
    5082         1194 :         bufHash = BufTableHashCode(&bufTag);
    5083         1194 :         bufPartitionLock = BufMappingPartitionLock(bufHash);
    5084              : 
    5085              :         /* Check that it is in the buffer pool. If not, do nothing. */
    5086         1194 :         LWLockAcquire(bufPartitionLock, LW_SHARED);
    5087         1194 :         buf_id = BufTableLookup(&bufTag, bufHash);
    5088         1194 :         LWLockRelease(bufPartitionLock);
    5089              : 
    5090         1194 :         if (buf_id < 0)
    5091          116 :             continue;
    5092              : 
    5093         1078 :         bufHdr = GetBufferDescriptor(buf_id);
    5094              : 
    5095              :         /*
    5096              :          * We need to lock the buffer header and recheck if the buffer is
    5097              :          * still associated with the same block because the buffer could be
    5098              :          * evicted by some other backend loading blocks for a different
    5099              :          * relation after we release the lock on the BufMapping table.
    5100              :          */
    5101         1078 :         LockBufHdr(bufHdr);
    5102              : 
    5103         2156 :         if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
    5104         1078 :             BufTagGetForkNum(&bufHdr->tag) == forkNum &&
    5105         1078 :             bufHdr->tag.blockNum >= firstDelBlock)
    5106         1078 :             InvalidateBuffer(bufHdr);   /* releases spinlock */
    5107              :         else
    5108            0 :             UnlockBufHdr(bufHdr);
    5109              :     }
    5110          850 : }
    5111              : 
    5112              : /* ---------------------------------------------------------------------
    5113              :  *      DropDatabaseBuffers
    5114              :  *
    5115              :  *      This function removes all the buffers in the buffer cache whose tag
    5116              :  *      carries database OID dbid.  Dirty pages are simply dropped, without
    5117              :  *      bothering to write them out first.  This is used when we destroy a
    5118              :  *      database, to avoid trying to flush data to disk when the directory
    5119              :  *      tree no longer exists.  Implementation is pretty similar to
    5120              :  *      DropRelationBuffers() which is for destroying just one relation.
    5121              :  * --------------------------------------------------------------------
    5122              :  */
    5123              : void
    5124           82 : DropDatabaseBuffers(Oid dbid)
    5125              : {
    5126              :     int         i;
    5127              : 
    5128              :     /*
    5129              :      * We needn't consider local buffers, since by assumption the target
    5130              :      * database isn't our own.
    5131              :      */
    5132              : 
    5133       644562 :     for (i = 0; i < NBuffers; i++)
    5134              :     {
    5135       644480 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
    5136              : 
    5137              :         /*
    5138              :          * As in DropRelationBuffers, an unlocked precheck should be safe and
    5139              :          * saves some cycles.
    5140              :          */
    5141       644480 :         if (bufHdr->tag.dbOid != dbid)
    5142       628891 :             continue;
    5143              : 
    5144        15589 :         LockBufHdr(bufHdr);
    5145        15589 :         if (bufHdr->tag.dbOid == dbid)  /* recheck under the header lock */
    5146        15589 :             InvalidateBuffer(bufHdr);   /* releases spinlock */
    5147              :         else
    5148            0 :             UnlockBufHdr(bufHdr);
    5149              :     }
    5150           82 : }
    5151              : 
    5152              : /* ---------------------------------------------------------------------
    5153              :  *      FlushRelationBuffers
    5154              :  *
    5155              :  *      This function writes all dirty pages of a relation out to disk
    5156              :  *      (or more accurately, out to kernel disk buffers), ensuring that the
    5157              :  *      kernel has an up-to-date view of the relation.
    5158              :  *
    5159              :  *      Generally, the caller should be holding AccessExclusiveLock on the
    5160              :  *      target relation to ensure that no other backend is busy dirtying
    5161              :  *      more blocks of the relation; the effects can't be expected to last
    5162              :  *      after the lock is released.
    5163              :  *
    5164              :  *      XXX currently it sequentially searches the buffer pool; this should
    5165              :  *      be changed to a more clever way of searching.  This routine is not
    5166              :  *      used in any performance-critical code paths, so it's not worth
    5167              :  *      adding additional overhead to normal paths to make it go faster.
    5168              :  * --------------------------------------------------------------------
    5169              :  */
    5170              : void
    5171          169 : FlushRelationBuffers(Relation rel)
    5172              : {
    5173              :     int         i;
    5174              :     BufferDesc *bufHdr;
    5175          169 :     SMgrRelation srel = RelationGetSmgr(rel);   /* passed to the flush calls below */
    5176              : 
    5177          169 :     if (RelationUsesLocalBuffers(rel))
    5178              :     {
    5179         1212 :         for (i = 0; i < NLocBuffer; i++)
    5180              :         {
    5181              :             uint64      buf_state;
    5182              : 
    5183         1200 :             bufHdr = GetLocalBufferDescriptor(i);
    5184         1200 :             if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
    5185          400 :                 ((buf_state = pg_atomic_read_u64(&bufHdr->state)) &
    5186              :                  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
    5187              :             {
    5188              :                 ErrorContextCallback errcallback;
    5189              : 
    5190              :                 /* Set up error traceback support for ereport() */
    5191          392 :                 errcallback.callback = local_buffer_write_error_callback;
    5192          392 :                 errcallback.arg = bufHdr;
    5193          392 :                 errcallback.previous = error_context_stack;
    5194          392 :                 error_context_stack = &errcallback;
    5195              : 
    5196              :                 /* Make sure we can handle the pin */
    5197          392 :                 ReservePrivateRefCountEntry();
    5198          392 :                 ResourceOwnerEnlarge(CurrentResourceOwner);
    5199              : 
    5200              :                 /*
    5201              :                  * Pin/unpin mostly to make valgrind work, but it also seems
    5202              :                  * like the right thing to do.
    5203              :                  */
    5204          392 :                 PinLocalBuffer(bufHdr, false);
    5205              : 
    5206              : 
    5207          392 :                 FlushLocalBuffer(bufHdr, srel);
    5208              : 
    5209          392 :                 UnpinLocalBuffer(BufferDescriptorGetBuffer(bufHdr));
    5210              : 
    5211              :                 /* Pop the error context stack */
    5212          392 :                 error_context_stack = errcallback.previous;
    5213              :             }
    5214              :         }
    5215              : 
    5216           12 :         return;                 /* the shared-buffer scan below is skipped */
    5217              :     }
    5218              : 
    5219      1970973 :     for (i = 0; i < NBuffers; i++)
    5220              :     {
    5221              :         uint64      buf_state;
    5222              : 
    5223      1970816 :         bufHdr = GetBufferDescriptor(i);
    5224              : 
    5225              :         /*
    5226              :          * As in DropRelationBuffers, an unlocked precheck should be safe and
    5227              :          * saves some cycles.
    5228              :          */
    5229      1970816 :         if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
    5230      1970566 :             continue;
    5231              : 
    5232              :         /* Make sure we can handle the pin */
    5233          250 :         ReservePrivateRefCountEntry();
    5234          250 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    5235              : 
    5236          250 :         buf_state = LockBufHdr(bufHdr);
    5237          250 :         if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
    5238          250 :             (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
    5239              :         {
    5240          204 :             PinBuffer_Locked(bufHdr);   /* NOTE(review): presumably drops the header lock -- confirm */
    5241          204 :             FlushUnlockedBuffer(bufHdr, srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    5242          204 :             UnpinBuffer(bufHdr);
    5243              :         }
    5244              :         else
    5245           46 :             UnlockBufHdr(bufHdr);
    5246              :     }
    5247              : }
    5248              : 
    5249              : /* ---------------------------------------------------------------------
    5250              :  *      FlushRelationsAllBuffers
    5251              :  *
    5252              :  *      This function flushes out of the buffer pool all the dirty pages of
    5253              :  *      all forks of the nrels smgr relations in smgrs[].  It's equivalent to
    5254              :  *      calling FlushRelationBuffers once per relation.  The relations are
    5255              :  *      assumed not to use local buffers.
    5256              :  * --------------------------------------------------------------------
    5257              :  */
    5258              : void
    5259            8 : FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
    5260              : {
    5261              :     int         i;
    5262              :     SMgrSortArray *srels;       /* one (rlocator, srel) entry per input relation */
    5263              :     bool        use_bsearch;
    5264              : 
    5265            8 :     if (nrels == 0)
    5266            0 :         return;
    5267              : 
    5268              :     /* fill in the array for qsort */
    5269            8 :     srels = palloc_array(SMgrSortArray, nrels);
    5270              : 
    5271           24 :     for (i = 0; i < nrels; i++)
    5272              :     {
    5273              :         Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
    5274              : 
    5275           16 :         srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
    5276           16 :         srels[i].srel = smgrs[i];
    5277              :     }
    5278              : 
    5279              :     /*
    5280              :      * Save the bsearch overhead for a low number of relations to sync. See
    5281              :      * DropRelationsAllBuffers for details.
    5282              :      */
    5283            8 :     use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
    5284              : 
    5285              :     /* sort the list of SMgrRelations if necessary */
    5286            8 :     if (use_bsearch)
    5287            0 :         qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
    5288              : 
    5289       131080 :     for (i = 0; i < NBuffers; i++)
    5290              :     {
    5291       131072 :         SMgrSortArray *srelent = NULL;
    5292       131072 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
    5293              :         uint64      buf_state;
    5294              : 
    5295              :         /*
    5296              :          * As in DropRelationBuffers, an unlocked precheck should be safe and
    5297              :          * saves some cycles.
    5298              :          */
    5299              : 
    5300       131072 :         if (!use_bsearch)
    5301              :         {
    5302              :             int         j;
    5303              : 
    5304       390922 :             for (j = 0; j < nrels; j++)
    5305              :             {
    5306       262129 :                 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
    5307              :                 {
    5308         2279 :                     srelent = &srels[j];
    5309         2279 :                     break;
    5310              :                 }
    5311              :             }
    5312              :         }
    5313              :         else
    5314              :         {
    5315              :             RelFileLocator rlocator;
    5316              : 
    5317            0 :             rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
    5318            0 :             srelent = bsearch(&rlocator,
    5319              :                               srels, nrels, sizeof(SMgrSortArray),
    5320              :                               rlocator_comparator);
    5321              :         }
    5322              : 
    5323              :         /* buffer doesn't belong to any of the given relfilelocators; skip it */
    5324       131072 :         if (srelent == NULL)
    5325       128793 :             continue;
    5326              : 
    5327              :         /* Make sure we can handle the pin */
    5328         2279 :         ReservePrivateRefCountEntry();
    5329         2279 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    5330              : 
    5331         2279 :         buf_state = LockBufHdr(bufHdr);
    5332         2279 :         if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
    5333         2279 :             (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
    5334              :         {
    5335         2242 :             PinBuffer_Locked(bufHdr);
    5336         2242 :             FlushUnlockedBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    5337         2242 :             UnpinBuffer(bufHdr);
    5338              :         }
    5339              :         else
    5340           37 :             UnlockBufHdr(bufHdr);
    5341              :     }
    5342              : 
    5343            8 :     pfree(srels);
    5344              : }
    5345              : 
    5346              : /* ---------------------------------------------------------------------
    5347              :  *      RelationCopyStorageUsingBuffer
    5348              :  *
    5349              :  *      Copy fork's data using bufmgr.  Same as RelationCopyStorage but instead
    5350              :  *      of using smgrread and smgrextend this will copy using bufmgr APIs.
    5351              :  *
    5352              :  *      Refer comments atop CreateAndCopyRelationData() for details about
    5353              :  *      'permanent' parameter.
    5354              :  * --------------------------------------------------------------------
    5355              :  */
    5356              : static void
    5357        86210 : RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
    5358              :                                RelFileLocator dstlocator,
    5359              :                                ForkNumber forkNum, bool permanent)
    5360              : {
    5361              :     Buffer      srcBuf;
    5362              :     Buffer      dstBuf;
    5363              :     Page        srcPage;
    5364              :     Page        dstPage;
    5365              :     bool        use_wal;
    5366              :     BlockNumber nblocks;
    5367              :     BlockNumber blkno;
    5368              :     PGIOAlignedBlock buf;
    5369              :     BufferAccessStrategy bstrategy_src;
    5370              :     BufferAccessStrategy bstrategy_dst;
    5371              :     BlockRangeReadStreamPrivate p;
    5372              :     ReadStream *src_stream;
    5373              :     SMgrRelation src_smgr;
    5374              : 
    5375              :     /*
    5376              :      * In general, we want to write WAL whenever wal_level > 'minimal', but we
    5377              :      * can skip it when copying any fork of an unlogged relation other than
    5378              :      * the init fork.
    5379              :      */
    5380        86210 :     use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
    5381              : 
    5382              :     /* Get number of blocks in the source relation. */
    5383        86210 :     nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER),
    5384              :                           forkNum);
    5385              : 
    5386              :     /* Nothing to copy; just return. */
    5387        86210 :     if (nblocks == 0)
    5388        15992 :         return;
    5389              : 
    5390              :     /*
    5391              :      * Bulk extend the destination relation of the same size as the source
    5392              :      * relation before starting to copy block by block.
    5393              :      */
    5394        70218 :     memset(buf.data, 0, BLCKSZ);
    5395        70218 :     smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
    5396              :                buf.data, true);
    5397              : 
    5398              :     /* This is a bulk operation, so use buffer access strategies. */
    5399        70218 :     bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
    5400        70218 :     bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
    5401              : 
    5402              :     /* Initialize streaming read */
    5403        70218 :     p.current_blocknum = 0;
    5404        70218 :     p.last_exclusive = nblocks;
    5405        70218 :     src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER);
    5406              : 
    5407              :     /*
    5408              :      * It is safe to use batchmode as block_range_read_stream_cb takes no
    5409              :      * locks.
    5410              :      */
    5411        70218 :     src_stream = read_stream_begin_smgr_relation(READ_STREAM_FULL |
    5412              :                                                  READ_STREAM_USE_BATCHING,
    5413              :                                                  bstrategy_src,
    5414              :                                                  src_smgr,
    5415              :                                                  permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
    5416              :                                                  forkNum,
    5417              :                                                  block_range_read_stream_cb,
    5418              :                                                  &p,
    5419              :                                                  0);
    5420              : 
    5421              :     /* Iterate over each block of the source relation file. */
    5422       331860 :     for (blkno = 0; blkno < nblocks; blkno++)
    5423              :     {
    5424       261644 :         CHECK_FOR_INTERRUPTS();
    5425              : 
    5426              :         /* Read block from source relation. */
    5427       261644 :         srcBuf = read_stream_next_buffer(src_stream, NULL);
    5428       261642 :         LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
    5429       261642 :         srcPage = BufferGetPage(srcBuf);
    5430              : 
    5431       261642 :         dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum,
    5432              :                                            BufferGetBlockNumber(srcBuf),
    5433              :                                            RBM_ZERO_AND_LOCK, bstrategy_dst,
    5434              :                                            permanent);
    5435       261642 :         dstPage = BufferGetPage(dstBuf);
    5436              : 
    5437       261642 :         START_CRIT_SECTION();
    5438              : 
    5439              :         /* Copy page data from the source to the destination. */
    5440       261642 :         memcpy(dstPage, srcPage, BLCKSZ);
    5441       261642 :         MarkBufferDirty(dstBuf);
    5442              : 
    5443              :         /* WAL-log the copied page. */
    5444       261642 :         if (use_wal)
    5445       146774 :             log_newpage_buffer(dstBuf, true);
    5446              : 
    5447       261642 :         END_CRIT_SECTION();
    5448              : 
    5449       261642 :         UnlockReleaseBuffer(dstBuf);
    5450       261642 :         UnlockReleaseBuffer(srcBuf);
    5451              :     }
    5452              :     Assert(read_stream_next_buffer(src_stream, NULL) == InvalidBuffer);
    5453        70216 :     read_stream_end(src_stream);
    5454              : 
    5455        70216 :     FreeAccessStrategy(bstrategy_src);
    5456        70216 :     FreeAccessStrategy(bstrategy_dst);
    5457              : }
    5458              : 
    5459              : /* ---------------------------------------------------------------------
    5460              :  *      CreateAndCopyRelationData
    5461              :  *
    5462              :  *      Create destination relation storage and copy all forks from the
    5463              :  *      source relation to the destination.
                      :  *
                      :  *      src_rlocator and dst_rlocator identify the source and destination
                      :  *      relation files.  The destination storage is created here with
                      :  *      RelationCreateStorage(); the main fork is then copied
                      :  *      unconditionally, while each remaining fork (up to MAX_FORKNUM) is
                      :  *      created and copied only if smgrexists() reports it for the source.
    5464              :  *
    5465              :  *      Pass permanent as true for permanent relations and false for
    5466              :  *      unlogged relations.  Currently this API is not supported for
    5467              :  *      temporary relations.
                      :  *
                      :  *      Creation of an extra fork is WAL-logged with log_smgrcreate() when
                      :  *      the relation is permanent or the fork is the init fork.
    5468              :  * --------------------------------------------------------------------
    5469              :  */
    5470              : void
    5471        66152 : CreateAndCopyRelationData(RelFileLocator src_rlocator,
    5472              :                           RelFileLocator dst_rlocator, bool permanent)
    5473              : {
    5474              :     char        relpersistence;
    5475              :     SMgrRelation src_rel;
    5476              :     SMgrRelation dst_rel;
    5477              : 
    5478              :     /* Map the permanent flag to the relpersistence code used for storage creation. */
    5479        66152 :     relpersistence = permanent ?
    5480              :         RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
    5481              : 
    5482        66152 :     src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
    5483        66152 :     dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
    5484              : 
    5485              :     /*
    5486              :      * Create and copy all forks of the relation.  During create database we
    5487              :      * have a separate cleanup mechanism which deletes complete database
    5488              :      * directory.  Therefore, each individual relation doesn't need to be
    5489              :      * registered for cleanup.
    5490              :      */
    5491        66152 :     RelationCreateStorage(dst_rlocator, relpersistence, false);
    5492              : 
    5493              :     /* copy main fork. */
    5494        66152 :     RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
    5495              :                                    permanent);
    5496              : 
    5497              :     /* copy those extra forks that exist (every fork number after MAIN_FORKNUM) */
    5498        66150 :     for (ForkNumber forkNum = MAIN_FORKNUM + 1;
    5499       264600 :          forkNum <= MAX_FORKNUM; forkNum++)
    5500              :     {
    5501       198450 :         if (smgrexists(src_rel, forkNum))
    5502              :         {
    5503        20058 :             smgrcreate(dst_rel, forkNum, false);
    5504              : 
    5505              :             /*
    5506              :              * WAL log creation if the relation is persistent, or this is the
    5507              :              * init fork of an unlogged relation.
    5508              :              */
    5509        20058 :             if (permanent || forkNum == INIT_FORKNUM)
    5510        20058 :                 log_smgrcreate(&dst_rlocator, forkNum);
    5511              : 
    5512              :             /* Copy a fork's data, block by block. */
    5513        20058 :             RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
    5514              :                                            permanent);
    5515              :         }
    5516              :     }
    5517        66150 : }
    5518              : 
    5519              : /* ---------------------------------------------------------------------
    5520              :  *      FlushDatabaseBuffers
    5521              :  *
    5522              :  *      This function writes all dirty pages of a database out to disk
    5523              :  *      (or more accurately, out to kernel disk buffers), ensuring that the
    5524              :  *      kernel has an up-to-date view of the database.
                      :  *
                      :  *      dbid is the OID of the database whose buffers are flushed.  Every
                      :  *      shared buffer descriptor (0 .. NBuffers - 1) is examined; a buffer
                      :  *      is flushed only if, under the buffer header lock, its tag still
                      :  *      names dbid and it is both BM_VALID and BM_DIRTY.  The buffer is
                      :  *      pinned around the flush and unpinned right afterwards.
    5525              :  *
    5526              :  *      Generally, the caller should be holding an appropriate lock to ensure
    5527              :  *      no other backend is active in the target database; otherwise more
    5528              :  *      pages could get dirtied.
    5529              :  *
    5530              :  *      Note we don't worry about flushing any pages of temporary relations.
    5531              :  *      It's assumed these wouldn't be interesting.
    5532              :  * --------------------------------------------------------------------
    5533              :  */
    5534              : void
    5535            5 : FlushDatabaseBuffers(Oid dbid)
    5536              : {
    5537              :     int         i;
    5538              :     BufferDesc *bufHdr;
    5539              : 
    5540          645 :     for (i = 0; i < NBuffers; i++)
    5541              :     {
    5542              :         uint64      buf_state;
    5543              : 
    5544          640 :         bufHdr = GetBufferDescriptor(i);
    5545              : 
    5546              :         /*
    5547              :          * As in DropRelationBuffers, an unlocked precheck should be safe and
    5548              :          * saves some cycles.
                      :          *
                      :          * The same dbOid test is repeated below once the header lock is
                      :          * held, so a stale answer here only costs or saves a lock cycle.
    5549              :          */
    5550          640 :         if (bufHdr->tag.dbOid != dbid)
    5551          478 :             continue;
    5552              : 
    5553              :         /* Make sure we can handle the pin (done before taking the header lock) */
    5554          162 :         ReservePrivateRefCountEntry();
    5555          162 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    5556              : 
    5557          162 :         buf_state = LockBufHdr(bufHdr);
    5558          162 :         if (bufHdr->tag.dbOid == dbid &&
    5559          162 :             (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
    5560              :         {
    5561           55 :             PinBuffer_Locked(bufHdr);
    5562           55 :             FlushUnlockedBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    5563           55 :             UnpinBuffer(bufHdr);
    5564              :         }
    5565              :         else
    5566          107 :             UnlockBufHdr(bufHdr);
    5567              :     }
    5568            5 : }
    5569              : 
    5570              : /*
    5571              :  * Flush a previously, share-exclusively or exclusively, locked and pinned
    5572              :  * buffer to the OS.
                      :  *
                      :  * Only shared buffers are supported (see the first assertion).  The pin
                      :  * and the content lock must already be held by the caller; this
                      :  * function only verifies them with assertions and does not acquire
                      :  * them itself.
    5573              :  */
    5574              : void
    5575           88 : FlushOneBuffer(Buffer buffer)
    5576              : {
    5577              :     BufferDesc *bufHdr;
    5578              : 
    5579              :     /* currently not needed, but no fundamental reason not to support */
    5580              :     Assert(!BufferIsLocal(buffer));
    5581              : 
    5582              :     Assert(BufferIsPinned(buffer));
    5583              : 
    5584           88 :     bufHdr = GetBufferDescriptor(buffer - 1);
    5585              : 
    5586              :     Assert(BufferIsLockedByMe(buffer));
    5587              : 
    5588           88 :     FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    5589           88 : }
    5590              : 
    5591              : /*
    5592              :  * ReleaseBuffer -- release the pin on a buffer
                      :  *
                      :  * Raises an ERROR ("bad buffer ID") if the buffer is not valid.  Local
                      :  * buffers are unpinned with UnpinLocalBuffer(), shared buffers with
                      :  * UnpinBuffer() on their descriptor.  No content lock is released here.
    5593              :  */
    5594              : void
    5595     50506001 : ReleaseBuffer(Buffer buffer)
    5596              : {
    5597     50506001 :     if (!BufferIsValid(buffer))
    5598            0 :         elog(ERROR, "bad buffer ID: %d", buffer);
    5599              : 
    5600     50506001 :     if (BufferIsLocal(buffer))
    5601       752050 :         UnpinLocalBuffer(buffer);
    5602              :     else
    5603     49753951 :         UnpinBuffer(GetBufferDescriptor(buffer - 1));
    5604     50506001 : }
    5605              : 
    5606              : /*
    5607              :  * UnlockReleaseBuffer -- release the content lock and pin on a buffer
    5608              :  *
    5609              :  * This is just a more efficient shorthand for a common combination.
                      :  *
                      :  * For a local buffer this only calls UnpinLocalBuffer().  For a shared
                      :  * buffer, the lock release and -- when this was the backend's last
                      :  * private reference -- the shared pin-count decrement are folded into
                      :  * one atomic subtraction on the buffer state.  Afterwards lock waiters
                      :  * are processed and a pin-count waiter, if flagged, is woken.
    5610              :  */
    5611              : void
    5612     50725762 : UnlockReleaseBuffer(Buffer buffer)
    5613              : {
    5614              :     int         mode;
    5615              :     BufferDesc *buf;
    5616              :     PrivateRefCountEntry *ref;
    5617              :     uint64      sub;
    5618              :     uint64      lockstate;
    5619              : 
    5620              :     Assert(BufferIsPinned(buffer));
    5621              : 
    5622     50725762 :     if (BufferIsLocal(buffer))
    5623              :     {
    5624      1381563 :         UnpinLocalBuffer(buffer);
    5625      1381563 :         return;
    5626              :     }
    5627              : 
    5628     49344199 :     ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
    5629              : 
    5630     49344199 :     buf = GetBufferDescriptor(buffer - 1);
    5631              : 
    5632     49344199 :     mode = BufferLockDisownInternal(buffer, buf);
    5633              : 
    5634              :     /* compute state modification for lock release */
    5635     49344199 :     sub = BufferLockReleaseSub(mode);
    5636              : 
    5637              :     /* compute state modification for pin release */
    5638     49344199 :     ref = GetPrivateRefCountEntry(buffer, false);
    5639              :     Assert(ref != NULL);
    5640              :     Assert(ref->data.refcount > 0);
    5641     49344199 :     ref->data.refcount--;
    5642              : 
    5643              :     /* no more backend local pins, reduce shared pin count */
    5644     49344199 :     if (likely(ref->data.refcount == 0))
    5645              :     {
    5646              :         /* See comment in UnpinBufferNoOwner() */
    5647              :         VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
    5648              : 
    5649     46967262 :         sub |= BUF_REFCOUNT_ONE;
    5650     46967262 :         ForgetPrivateRefCountEntry(ref);
    5651              :     }
    5652              : 
    5653              :     /* perform the lock and pin release in one atomic op */
    5654     49344199 :     lockstate = pg_atomic_sub_fetch_u64(&buf->state, sub);
    5655              : 
    5656              :     /* wake up waiters for the lock */
    5657     49344199 :     BufferLockProcessRelease(buf, mode, lockstate);
    5658              : 
    5659              :     /* wake up waiter for the pin release (flag is read from the post-release state) */
    5660     49344199 :     if (lockstate & BM_PIN_COUNT_WAITER)
    5661            1 :         WakePinCountWaiter(buf);
    5662              : 
    5663              :     /*
    5664              :      * Now okay to allow cancel/die interrupts again, which were held when the
    5665              :      * lock was acquired.
    5666              :      */
    5667     49344199 :     RESUME_INTERRUPTS();
    5668              : }
    5669              : 
    5670              : /*
    5671              :  * IncrBufferRefCount
    5672              :  *      Increment the pin count on a buffer that we have *already* pinned
    5673              :  *      at least once.
    5674              :  *
    5675              :  *      This function cannot be used on a buffer we do not have pinned,
    5676              :  *      because it doesn't change the shared buffer state.
                      :  *
                      :  *      For a local buffer the LocalRefCount[] slot is incremented; for a
                      :  *      shared buffer the backend-private refcount entry is incremented.
                      :  *      In both cases the additional pin is also remembered in
                      :  *      CurrentResourceOwner, which is enlarged beforehand.
    5677              :  */
    5678              : void
    5679     15312618 : IncrBufferRefCount(Buffer buffer)
    5680              : {
    5681              :     Assert(BufferIsPinned(buffer));
    5682     15312618 :     ResourceOwnerEnlarge(CurrentResourceOwner);
    5683     15312618 :     if (BufferIsLocal(buffer))
    5684       469738 :         LocalRefCount[-buffer - 1]++;
    5685              :     else
    5686              :     {
    5687              :         PrivateRefCountEntry *ref;
    5688              : 
    5689     14842880 :         ref = GetPrivateRefCountEntry(buffer, true);
    5690              :         Assert(ref != NULL);
    5691     14842880 :         ref->data.refcount++;
    5692              :     }
    5693     15312618 :     ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
    5694     15312618 : }
    5695              : 
    5696              : /*
    5697              :  * Shared-buffer only helper for MarkBufferDirtyHint() and
    5698              :  * BufferSetHintBits16().
    5699              :  *
    5700              :  * This is separated out because it turns out that the repeated checks for
    5701              :  * local buffers, repeated GetBufferDescriptor() and repeated reading of the
    5702              :  * buffer's state sufficiently hurts the performance of BufferSetHintBits16().
                      :  *
                      :  * bufHdr is the descriptor belonging to buffer, and lockstate is the
                      :  * buffer state as already read by the caller; it is used for the
                      :  * BM_DIRTY and BM_PERMANENT tests without being re-read.  buffer_std is
                      :  * passed through to XLogSaveBufferForHint().
                      :  *
                      :  * Nothing happens if lockstate already shows BM_DIRTY.  The function
                      :  * also returns without dirtying the buffer when a hint-bit WAL record
                      :  * would be required but must not be written (in recovery, or the
                      :  * relfilelocator is skipping WAL).
    5703              :  */
    5704              : static inline void
    5705     15162768 : MarkSharedBufferDirtyHint(Buffer buffer, BufferDesc *bufHdr, uint64 lockstate,
    5706              :                           bool buffer_std)
    5707              : {
    5708     15162768 :     Page        page = BufferGetPage(buffer);
    5709              : 
    5710              :     Assert(GetPrivateRefCount(buffer) > 0);
    5711              : 
    5712              :     /* here, either share-exclusive or exclusive lock is OK */
    5713              :     Assert(BufferLockHeldByMeInMode(bufHdr, BUFFER_LOCK_EXCLUSIVE) ||
    5714              :            BufferLockHeldByMeInMode(bufHdr, BUFFER_LOCK_SHARE_EXCLUSIVE));
    5715              : 
    5716              :     /*
    5717              :      * This routine might get called many times on the same page, if we are
    5718              :      * making the first scan after commit of an xact that added/deleted many
    5719              :      * tuples. So, be as quick as we can if the buffer is already dirty.
    5720              :      *
    5721              :      * As we are holding (at least) a share-exclusive lock, nobody could have
    5722              :      * cleaned or dirtied the page concurrently, so we can just rely on the
    5723              :      * previously fetched value here without any danger of races.
    5724              :      */
    5725     15162768 :     if (unlikely(!(lockstate & BM_DIRTY)))
    5726              :     {
    5727       425455 :         XLogRecPtr  lsn = InvalidXLogRecPtr;
    5728       425455 :         bool        wal_log = false;
    5729              :         uint64      buf_state;
    5730              : 
    5731              :         /*
    5732              :          * If we need to protect hint bit updates from torn writes, WAL-log a
    5733              :          * full page image of the page. This full page image is only necessary
    5734              :          * if the hint bit update is the first change to the page since the
    5735              :          * last checkpoint.
    5736              :          *
    5737              :          * We don't check full_page_writes here because that logic is included
    5738              :          * when we call XLogInsert() since the value changes dynamically.
    5739              :          */
    5740       425455 :         if (XLogHintBitIsNeeded() && (lockstate & BM_PERMANENT))
    5741              :         {
    5742              :             /*
    5743              :              * If we must not write WAL, due to a relfilelocator-specific
    5744              :              * condition or being in recovery, don't dirty the page.  We can
    5745              :              * set the hint, just not dirty the page as a result so the hint
    5746              :              * is lost when we evict the page or shutdown.
    5747              :              *
    5748              :              * See src/backend/storage/page/README for longer discussion.
    5749              :              */
    5750       509605 :             if (RecoveryInProgress() ||
    5751        86140 :                 RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
    5752       338574 :                 return;
    5753              : 
    5754        84891 :             wal_log = true;
    5755              :         }
    5756              : 
    5757              :         /*
    5758              :          * We must mark the page dirty before we emit the WAL record, as per
    5759              :          * the usual rules, to ensure that BufferSync()/SyncOneBuffer() try to
    5760              :          * flush the buffer, even if we haven't inserted the WAL record yet.
    5761              :          * As we hold at least a share-exclusive lock, checkpoints will wait
    5762              :          * for this backend to be done with the buffer before continuing. If
    5763              :          * we did it the other way round, a checkpoint could start between
    5764              :          * writing the WAL record and marking the buffer dirty.
    5765              :          */
    5766        86881 :         buf_state = LockBufHdr(bufHdr);
    5767              : 
    5768              :         /*
    5769              :          * It should not be possible for the buffer to already be dirty, see
    5770              :          * comment above.
    5771              :          */
    5772              :         Assert(!(buf_state & BM_DIRTY));
    5773              :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    5774        86881 :         UnlockBufHdrExt(bufHdr, buf_state,
    5775              :                         BM_DIRTY,
    5776              :                         0, 0);
    5777              : 
    5778              :         /*
    5779              :          * If the block is already dirty because we either made a change or
    5780              :          * set a hint already, then we don't need to write a full page image.
    5781              :          * Note that aggressive cleaning of blocks dirtied by hint bit setting
    5782              :          * would increase the call rate. Bulk setting of hint bits would
    5783              :          * reduce the call rate...
                      :          *
                      :          * lsn stays InvalidXLogRecPtr unless wal_log was set above and
                      :          * XLogSaveBufferForHint() returns a valid position.
    5784              :          */
    5785        86881 :         if (wal_log)
    5786        84891 :             lsn = XLogSaveBufferForHint(buffer, buffer_std);
    5787              : 
    5788        86881 :         if (XLogRecPtrIsValid(lsn))
    5789              :         {
    5790              :             /*
    5791              :              * Set the page LSN if we wrote a backup block. To allow backends
    5792              :              * that only hold a share lock on the buffer to read the LSN in a
    5793              :              * tear-free manner, we set the page LSN while holding the buffer
    5794              :              * header lock. This allows any reader of an LSN who holds only a
    5795              :              * share lock to also obtain a buffer header lock before using
    5796              :              * PageGetLSN() to read the LSN in a tear free way. This is done
    5797              :              * in BufferGetLSNAtomic().
    5798              :              *
    5799              :              * If checksums are enabled, you might think we should reset the
    5800              :              * checksum here. That will happen when the page is written
    5801              :              * sometime later in this checkpoint cycle.
    5802              :              */
    5803        57850 :             buf_state = LockBufHdr(bufHdr);
    5804        57850 :             PageSetLSN(page, lsn);
    5805        57850 :             UnlockBufHdr(bufHdr);
    5806              :         }
    5807              : 
    5808        86881 :         pgBufferUsage.shared_blks_dirtied++;
    5809        86881 :         if (VacuumCostActive)
    5810         1677 :             VacuumCostBalance += VacuumCostPageDirty;
    5811              :     }
    5812              : }
    5813              : 
    5814              : /*
    5815              :  * MarkBufferDirtyHint
    5816              :  *
    5817              :  *  Mark a buffer dirty for non-critical changes.
    5818              :  *
    5819              :  * This is essentially the same as MarkBufferDirty, except:
    5820              :  *
    5821              :  * 1. The caller does not write WAL; so if checksums are enabled, we may need
    5822              :  *    to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
    5823              :  * 2. The caller might have only a share-exclusive-lock instead of an
    5824              :  *    exclusive-lock on the buffer's content lock.
    5825              :  * 3. This function does not guarantee that the buffer is always marked dirty
    5826              :  *    (it e.g. can't always on a hot standby), so it cannot be used for
    5827              :  *    important changes.
                      :  *
                      :  * An invalid buffer raises an ERROR; a local buffer is handed to
                      :  * MarkLocalBufferDirty(); a shared buffer is handed to
                      :  * MarkSharedBufferDirtyHint() together with a fresh atomic read of its
                      :  * state.
                      :  *
                      :  * NOTE(review): the descriptor pointer is computed from buffer - 1 before
                      :  * the BufferIsValid()/BufferIsLocal() checks.  The pointer is only used on
                      :  * the shared-buffer path, but for an invalid or local (negative) buffer
                      :  * the index is not a valid shared-buffer index.  Confirm that
                      :  * GetBufferDescriptor() merely computes an address, or move the lookup
                      :  * below the checks.
    5828              :  */
    5829              : inline void
    5830       436852 : MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
    5831              : {
    5832              :     BufferDesc *bufHdr;
    5833              : 
    5834       436852 :     bufHdr = GetBufferDescriptor(buffer - 1);
    5835              : 
    5836       436852 :     if (!BufferIsValid(buffer))
    5837            0 :         elog(ERROR, "bad buffer ID: %d", buffer);
    5838              : 
    5839       436852 :     if (BufferIsLocal(buffer))
    5840              :     {
    5841        22034 :         MarkLocalBufferDirty(buffer);
    5842        22034 :         return;
    5843              :     }
    5844              : 
    5845       414818 :     MarkSharedBufferDirtyHint(buffer, bufHdr,
    5846       414818 :                               pg_atomic_read_u64(&bufHdr->state),
    5847              :                               buffer_std);
    5848              : }
    5849              : 
    5850              : /*
    5851              :  * Release buffer content locks for shared buffers.
    5852              :  *
    5853              :  * Used to clean up after errors.
    5854              :  *
    5855              :  * Currently, we can expect that resource owner cleanup, via
    5856              :  * ResOwnerReleaseBuffer(), took care of releasing buffer content locks per
    5857              :  * se; the only thing we need to deal with here is clearing any PIN_COUNT
    5858              :  * request that was in progress.
                      :  *
                      :  * Concretely: if PinCountWaitBuf is set, BM_PIN_COUNT_WAITER is cleared
                      :  * on that buffer, but only when the flag is set and this backend is the
                      :  * recorded waiter; PinCountWaitBuf is then reset to NULL.  With no
                      :  * PinCountWaitBuf this is a no-op.
    5859              :  */
    5860              : void
    5861        65334 : UnlockBuffers(void)
    5862              : {
    5863        65334 :     BufferDesc *buf = PinCountWaitBuf;
    5864              : 
    5865        65334 :     if (buf)
    5866              :     {
    5867              :         uint64      buf_state;
    5868            0 :         uint64      unset_bits = 0;
    5869              : 
    5870            0 :         buf_state = LockBufHdr(buf);
    5871              : 
    5872              :         /*
    5873              :          * Don't complain if flag bit not set; it could have been reset but we
    5874              :          * got a cancel/die interrupt before getting the signal.
    5875              :          */
    5876            0 :         if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
    5877            0 :             buf->wait_backend_pgprocno == MyProcNumber)
    5878            0 :             unset_bits = BM_PIN_COUNT_WAITER;
    5879              : 
    5880            0 :         UnlockBufHdrExt(buf, buf_state,
    5881              :                         0, unset_bits,
    5882              :                         0);
    5883              : 
    5884            0 :         PinCountWaitBuf = NULL;
    5885              :     }
    5886        65334 : }
    5887              : 
    5888              : /*
    5889              :  * Acquire the buffer content lock in the specified mode
    5890              :  *
    5891              :  * If the lock is not available, sleep until it is.
    5892              :  *
    5893              :  * Side effect: cancel/die interrupts are held off until lock release.
    5894              :  *
    5895              :  * This uses almost the same locking approach as lwlock.c's
    5896              :  * LWLockAcquire(). See documentation at the top of lwlock.c for a more
    5897              :  * detailed discussion.
    5898              :  *
    5899              :  * The reason that this, and most of the other BufferLock* functions, get both
    5900              :  * the Buffer and BufferDesc* as parameters, is that looking up one from the
    5901              :  * other repeatedly shows up noticeably in profiles.
    5902              :  *
    5903              :  * Callers should provide a constant for mode, for more efficient code
    5904              :  * generation.
    5905              :  */
    5906              : static inline void
    5907    111929527 : BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
    5908              : {
    5909              :     PrivateRefCountEntry *entry;
    5910    111929527 :     int         extraWaits = 0;
    5911              : 
    5912              :     /*
    5913              :      * Look up the refcount entry before acquiring the lock; it seems better
    5914              :      * to do this work while not yet holding the lock.
    5915              :      */
    5916    111929527 :     entry = GetPrivateRefCountEntry(buffer, true);
    5917              : 
    5918              :     /*
    5919              :      * We better not already hold a lock on the buffer.
    5920              :      */
    5921              :     Assert(entry->data.lockmode == BUFFER_LOCK_UNLOCK);
    5922              : 
    5923              :     /*
    5924              :      * Lock out cancel/die interrupts until we exit the code section protected
    5925              :      * by the content lock.  This ensures that interrupts will not interfere
    5926              :      * with manipulations of data structures in shared memory.
    5927              :      */
    5928    111929527 :     HOLD_INTERRUPTS();
    5929              : 
    5930              :     for (;;)
    5931        20393 :     {
    5932    111949920 :         uint32      wait_event = 0; /* initialized to avoid compiler warning */
    5933              :         bool        mustwait;
    5934              : 
    5935              :         /*
    5936              :          * Try to grab the lock the first time, we're not in the waitqueue
    5937              :          * yet/anymore.
    5938              :          */
    5939    111949920 :         mustwait = BufferLockAttempt(buf_hdr, mode);
    5940              : 
    5941    111949920 :         if (likely(!mustwait))
    5942              :         {
    5943    111928039 :             break;
    5944              :         }
    5945              : 
    5946              :         /*
    5947              :          * Ok, at this point we couldn't grab the lock on the first try. We
    5948              :          * cannot simply queue ourselves to the end of the list and wait to be
    5949              :          * woken up because by now the lock could long have been released.
    5950              :          * Instead add us to the queue and try to grab the lock again. If we
    5951              :          * succeed we need to revert the queuing and be happy, otherwise we
    5952              :          * recheck the lock. If we still couldn't grab it, we know that the
    5953              :          * other locker will see our queue entries when releasing since they
    5954              :          * existed before we checked for the lock.
    5955              :          */
    5956              : 
    5957              :         /* add to the queue */
    5958        21881 :         BufferLockQueueSelf(buf_hdr, mode);
    5959              : 
    5960              :         /* we're now guaranteed to be woken up if necessary */
    5961        21881 :         mustwait = BufferLockAttempt(buf_hdr, mode);
    5962              : 
    5963              :         /* ok, grabbed the lock the second time round, need to undo queueing */
    5964        21881 :         if (!mustwait)
    5965              :         {
    5966         1488 :             BufferLockDequeueSelf(buf_hdr);
    5967         1488 :             break;
    5968              :         }
    5969              : 
    5970        20393 :         switch (mode)
    5971              :         {
    5972        11156 :             case BUFFER_LOCK_EXCLUSIVE:
    5973        11156 :                 wait_event = WAIT_EVENT_BUFFER_EXCLUSIVE;
    5974        11156 :                 break;
    5975           89 :             case BUFFER_LOCK_SHARE_EXCLUSIVE:
    5976           89 :                 wait_event = WAIT_EVENT_BUFFER_SHARE_EXCLUSIVE;
    5977           89 :                 break;
    5978         9148 :             case BUFFER_LOCK_SHARE:
    5979         9148 :                 wait_event = WAIT_EVENT_BUFFER_SHARED;
    5980         9148 :                 break;
    5981              :             case BUFFER_LOCK_UNLOCK:
    5982              :                 pg_unreachable();
    5983              : 
    5984              :         }
    5985        20393 :         pgstat_report_wait_start(wait_event);
    5986              : 
    5987              :         /*
    5988              :          * Wait until awakened.
    5989              :          *
    5990              :          * It is possible that we get awakened for a reason other than being
    5991              :          * signaled by BufferLockWakeup().  If so, loop back and wait again.
    5992              :          * Once we've gotten the lock, re-increment the sema by the number of
    5993              :          * additional signals received.
    5994              :          */
    5995              :         for (;;)
    5996              :         {
    5997        20393 :             PGSemaphoreLock(MyProc->sem);
    5998        20393 :             if (MyProc->lwWaiting == LW_WS_NOT_WAITING)
    5999        20393 :                 break;
    6000            0 :             extraWaits++;
    6001              :         }
    6002              : 
    6003        20393 :         pgstat_report_wait_end();
    6004              : 
    6005              :         /* Retrying, allow BufferLockReleaseSub to release waiters again. */
    6006        20393 :         pg_atomic_fetch_and_u64(&buf_hdr->state, ~BM_LOCK_WAKE_IN_PROGRESS);
    6007              :     }
    6008              : 
    6009              :     /* Remember that we now hold this lock */
    6010    111929527 :     entry->data.lockmode = mode;
    6011              : 
    6012              :     /*
    6013              :      * Fix the process wait semaphore's count for any absorbed wakeups.
    6014              :      */
    6015    111929527 :     while (unlikely(extraWaits-- > 0))
    6016            0 :         PGSemaphoreUnlock(MyProc->sem);
    6017    111929527 : }
    6018              : 
    6019              : /*
    6020              :  * Release a previously acquired buffer content lock.
                      :  *
                      :  * The backend-local ownership record is cleared first, then the lock value
                      :  * for the held mode is subtracted from the buffer state, and finally any
                      :  * waiters that can now make progress are woken up.  The function ends by
                      :  * calling RESUME_INTERRUPTS().
    6021              :  */
    6022              : static void
    6023     64718140 : BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr)
    6024              : {
    6025              :     BufferLockMode mode;
    6026              :     uint64      oldstate;
    6027              :     uint64      sub;
    6028              : 
    6029     64718140 :     mode = BufferLockDisownInternal(buffer, buf_hdr);
    6030              : 
    6031              :     /*
    6032              :      * Release my hold on the lock; after that it can immediately be acquired
    6033              :      * by others, even if we still have to wake up other waiters.
                      :      *
                      :      * NOTE(review): despite its name, "oldstate" receives the result of
                      :      * pg_atomic_sub_fetch_u64(), which should be the state *after* the
                      :      * subtraction - confirm against the atomics API.
    6034              :      */
    6035     64718140 :     sub = BufferLockReleaseSub(mode);
    6036              : 
    6037     64718140 :     oldstate = pg_atomic_sub_fetch_u64(&buf_hdr->state, sub);
    6038              : 
    6039     64718140 :     BufferLockProcessRelease(buf_hdr, mode, oldstate);
    6040              : 
    6041              :     /*
    6042              :      * Now okay to allow cancel/die interrupts.
    6043              :      */
    6044     64718140 :     RESUME_INTERRUPTS();
    6045     64718140 : }
    6046              : 
    6047              : 
    6048              : /*
    6049              :  * Acquire the content lock for the buffer, but only if we don't have to wait.
    6050              :  *
    6051              :  * It is allowed to try to conditionally acquire a lock on a buffer that this
    6052              :  * backend has already locked, but the lock acquisition will always fail, even
    6053              :  * if the new lock acquisition does not conflict with an already held lock
    6054              :  * (e.g. two share locks). This is because we currently do not have space to
    6055              :  * track multiple lock ownerships of the same buffer within one backend.  That
    6056              :  * is ok for the current uses of BufferLockConditional().
                      :  *
                      :  * Returns true if the lock was acquired; in that case interrupts remain
                      :  * held off and the mode is recorded in the backend's private refcount
                      :  * entry.  Returns false otherwise, with the interrupt holdoff count left
                      :  * as it was on entry.
    6057              :  */
    6058              : static bool
    6059      2133545 : BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
    6060              : {
    6061      2133545 :     PrivateRefCountEntry *entry = GetPrivateRefCountEntry(buffer, true);
    6062              :     bool        mustwait;
    6063              : 
    6064              :     /*
    6065              :      * As described above, if we're trying to lock a buffer this backend
    6066              :      * already has locked, return false, independent of the existing and
    6067              :      * desired lock level.
    6068              :      */
    6069      2133545 :     if (entry->data.lockmode != BUFFER_LOCK_UNLOCK)
    6070            0 :         return false;
    6071              : 
    6072              :     /*
    6073              :      * Lock out cancel/die interrupts until we exit the code section protected
    6074              :      * by the content lock.  This ensures that interrupts will not interfere
    6075              :      * with manipulations of data structures in shared memory.
    6076              :      */
    6077      2133545 :     HOLD_INTERRUPTS();
    6078              : 
    6079              :     /* Make a single, non-blocking attempt to take the lock */
    6080      2133545 :     mustwait = BufferLockAttempt(buf_hdr, mode);
    6081              : 
    6082      2133545 :     if (mustwait)
    6083              :     {
    6084              :         /* Failed to get lock, so release interrupt holdoff */
    6085          733 :         RESUME_INTERRUPTS();
    6086              :     }
    6087              :     else
    6088              :     {
                      :         /* Remember that we now hold this lock */
    6089      2132812 :         entry->data.lockmode = mode;
    6090              :     }
    6091              : 
    6092      2133545 :     return !mustwait;
    6093              : }
    6094              : 
    6095              : /*
    6096              :  * Internal function that tries to atomically acquire the content lock in the
    6097              :  * passed in mode.
    6098              :  *
    6099              :  * This function will not block waiting for a lock to become free - that's the
    6100              :  * caller's job.
    6101              :  *
                      :  * Returns false if the lock was acquired and true if the caller must wait
                      :  * because a conflicting lock is held ("mustwait" semantics).
                      :  *
    6102              :  * Similar to LWLockAttemptLock().
    6103              :  */
    6104              : static inline bool
    6105    114105346 : BufferLockAttempt(BufferDesc *buf_hdr, BufferLockMode mode)
    6106              : {
    6107              :     uint64      old_state;
    6108              : 
    6109              :     /*
    6110              :      * Read once outside the loop, later iterations will get the newer value
    6111              :      * via compare & exchange.
    6112              :      */
    6113    114105346 :     old_state = pg_atomic_read_u64(&buf_hdr->state);
    6114              : 
    6115              :     /* loop until we've determined whether we could acquire the lock or not */
    6116              :     while (true)
    6117        14028 :     {
    6118              :         uint64      desired_state;
    6119              :         bool        lock_free;
    6120              : 
    6121    114119374 :         desired_state = old_state;
    6122              : 
                      :         /*
                      :          * Decide, per requested mode, whether the observed state permits
                      :          * acquisition: exclusive requires that no lock bits at all are set,
                      :          * share-exclusive requires that neither an exclusive nor a
                      :          * share-exclusive lock is held, and share only requires that no
                      :          * exclusive lock is held.
                      :          */
    6123    114119374 :         if (mode == BUFFER_LOCK_EXCLUSIVE)
    6124              :         {
    6125     38546108 :             lock_free = (old_state & BM_LOCK_MASK) == 0;
    6126     38546108 :             if (lock_free)
    6127     38521813 :                 desired_state += BM_LOCK_VAL_EXCLUSIVE;
    6128              :         }
    6129     75573266 :         else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
    6130              :         {
    6131       724293 :             lock_free = (old_state & (BM_LOCK_VAL_EXCLUSIVE | BM_LOCK_VAL_SHARE_EXCLUSIVE)) == 0;
    6132       724293 :             if (lock_free)
    6133       724115 :                 desired_state += BM_LOCK_VAL_SHARE_EXCLUSIVE;
    6134              :         }
    6135              :         else
    6136              :         {
    6137     74848973 :             lock_free = (old_state & BM_LOCK_VAL_EXCLUSIVE) == 0;
    6138     74848973 :             if (lock_free)
    6139     74829975 :                 desired_state += BM_LOCK_VAL_SHARED;
    6140              :         }
    6141              : 
    6142              :         /*
    6143              :          * Attempt to swap in the state we are expecting. If we didn't see
    6144              :          * the lock to be free, that's just the old value. If we saw it as
    6145              :          * free, we'll attempt to mark it acquired. The reason that we always
    6146              :          * swap in the value is that this doubles as a memory barrier. We
    6147              :          * could try to be smarter and only swap in values if we saw the lock
    6148              :          * as free, but benchmarks haven't shown it as beneficial so far.
    6149              :          *
    6150              :          * Retry if the value changed since we last looked at it.
    6151              :          */
    6152    114119374 :         if (likely(pg_atomic_compare_exchange_u64(&buf_hdr->state,
    6153              :                                                   &old_state, desired_state)))
    6154              :         {
    6155    114105346 :             if (lock_free)
    6156              :             {
    6157              :                 /* Great! Got the lock. */
    6158    114062339 :                 return false;
    6159              :             }
    6160              :             else
    6161        43007 :                 return true;    /* somebody else has the lock */
    6162              :         }
    6163              :     }
    6164              : 
    6165              :     pg_unreachable();
    6166              : }
    6167              : 
    6168              : /*
    6169              :  * Add ourselves to the end of the content lock's wait queue.
                      :  *
                      :  * While holding the buffer header lock this sets BM_LOCK_HAS_WAITERS, marks
                      :  * MyProc as waiting for `mode` and appends it to buf_hdr->lock_waiters.
                      :  * PANICs if there is no PGPROC or if MyProc is already marked as waiting.
    6170              :  */
    6171              : static void
    6172        21881 : BufferLockQueueSelf(BufferDesc *buf_hdr, BufferLockMode mode)
    6173              : {
    6174              :     /*
    6175              :      * If we don't have a PGPROC structure, there's no way to wait. This
    6176              :      * should never occur, since MyProc should only be null during shared
    6177              :      * memory initialization.
    6178              :      */
    6179        21881 :     if (MyProc == NULL)
    6180            0 :         elog(PANIC, "cannot wait without a PGPROC structure");
    6181              : 
    6182        21881 :     if (MyProc->lwWaiting != LW_WS_NOT_WAITING)
    6183            0 :         elog(PANIC, "queueing for lock while waiting on another one");
    6184              : 
    6185        21881 :     LockBufHdr(buf_hdr);
    6186              : 
    6187              :     /* setting the flag is protected by the buffer header lock taken above */
    6188        21881 :     pg_atomic_fetch_or_u64(&buf_hdr->state, BM_LOCK_HAS_WAITERS);
    6189              : 
    6190              :     /*
    6191              :      * These fields are currently used both for lwlocks and buffer content
    6192              :      * locks, which is acceptable, although not pretty, because a backend
    6193              :      * can't wait for both types of locks at the same time.
    6194              :      */
    6195        21881 :     MyProc->lwWaiting = LW_WS_WAITING;
    6196        21881 :     MyProc->lwWaitMode = mode;
    6197              : 
    6198        21881 :     proclist_push_tail(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
    6199              : 
    6200              :     /* Can release the mutex now */
    6201        21881 :     UnlockBufHdr(buf_hdr);
    6202        21881 : }
    6203              : 
    6204              : /*
    6205              :  * Remove ourselves from the waitlist.
    6206              :  *
    6207              :  * This is used if we queued ourselves because we thought we needed to sleep
    6208              :  * but, after further checking, we discovered that we don't actually need to
    6209              :  * do so.
                      :  *
                      :  * Two cases are handled: if we are still on the wait list we simply unlink
                      :  * ourselves; if some other backend has already taken us off the list we
                      :  * clear BM_LOCK_WAKE_IN_PROGRESS and absorb the wakeup it has sent or is
                      :  * about to send.
    6210              :  */
    6211              : static void
    6212         1488 : BufferLockDequeueSelf(BufferDesc *buf_hdr)
    6213              : {
    6214              :     bool        on_waitlist;
    6215              : 
    6216         1488 :     LockBufHdr(buf_hdr);
    6217              : 
                      :     /* lwWaiting is still LW_WS_WAITING only if nobody dequeued us yet */
    6218         1488 :     on_waitlist = MyProc->lwWaiting == LW_WS_WAITING;
    6219         1488 :     if (on_waitlist)
    6220         1127 :         proclist_delete(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
    6221              : 
                      :     /* Drop the has-waiters flag if the wait list is now empty */
    6222         1488 :     if (proclist_is_empty(&buf_hdr->lock_waiters) &&
    6223         1430 :         (pg_atomic_read_u64(&buf_hdr->state) & BM_LOCK_HAS_WAITERS) != 0)
    6224              :     {
    6225         1070 :         pg_atomic_fetch_and_u64(&buf_hdr->state, ~BM_LOCK_HAS_WAITERS);
    6226              :     }
    6227              : 
    6228              :     /* XXX: combine with fetch_and above? */
    6229         1488 :     UnlockBufHdr(buf_hdr);
    6230              : 
    6231              :     /* clear waiting state again, nice for debugging */
    6232         1488 :     if (on_waitlist)
    6233         1127 :         MyProc->lwWaiting = LW_WS_NOT_WAITING;
    6234              :     else
    6235              :     {
    6236          361 :         int         extraWaits = 0;
    6237              : 
    6238              : 
    6239              :         /*
    6240              :          * Somebody else dequeued us and has woken us up or will do so. Deal
    6241              :          * with the superfluous absorption of a wakeup.
    6242              :          */
    6243              : 
    6244              :         /*
    6245              :          * Clear BM_LOCK_WAKE_IN_PROGRESS if somebody woke us before we
    6246              :          * removed ourselves - they'll have set it.
    6247              :          */
    6248          361 :         pg_atomic_fetch_and_u64(&buf_hdr->state, ~BM_LOCK_WAKE_IN_PROGRESS);
    6249              : 
    6250              :         /*
    6251              :          * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would
    6252              :          * get reset at some inconvenient point later. Most of the time this
    6253              :          * will immediately return.  Semaphore wakeups received while
                      :          * lwWaiting is still set are counted in extraWaits.
    6254              :          */
    6255              :         for (;;)
    6256              :         {
    6257          361 :             PGSemaphoreLock(MyProc->sem);
    6258          361 :             if (MyProc->lwWaiting == LW_WS_NOT_WAITING)
    6259          361 :                 break;
    6260            0 :             extraWaits++;
    6261              :         }
    6262              : 
    6263              :         /*
    6264              :          * Fix the process wait semaphore's count for any absorbed wakeups.
    6265              :          */
    6266          361 :         while (extraWaits-- > 0)
    6267            0 :             PGSemaphoreUnlock(MyProc->sem);
    6268              :     }
    6269         1488 : }
    6270              : 
    6271              : /*
    6272              :  * Stop treating lock as held by current backend.
    6273              :  *
    6274              :  * After calling this function it's the caller's responsibility to ensure that
    6275              :  * the lock gets released, even in case of an error. This only is desirable if
    6276              :  * the lock is going to be released in a different process than the process
    6277              :  * that acquired it.
                      :  *
                      :  * The shared lock state is not touched here; only the backend-local
                      :  * ownership record is cleared and interrupts are resumed.
    6278              :  */
    6279              : static inline void
    6280            0 : BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr)
    6281              : {
    6282            0 :     BufferLockDisownInternal(buffer, buf_hdr);
    6283            0 :     RESUME_INTERRUPTS();
    6284            0 : }
    6285              : 
    6286              : /*
    6287              :  * Stop treating lock as held by current backend.
    6288              :  *
    6289              :  * This is the code that can be shared between actually releasing a lock
    6290              :  * (BufferLockUnlock()) and just not tracking ownership of the lock anymore
    6291              :  * without releasing the lock (BufferLockDisown()).
                      :  *
                      :  * Resets the lock mode stored in the backend's private refcount entry to
                      :  * BUFFER_LOCK_UNLOCK and returns the mode that was recorded before.  Raises
                      :  * an ERROR if the backend has no private refcount entry for the buffer.
                      :  *
                      :  * NOTE(review): the declared return type is int although the returned
                      :  * value is a BufferLockMode.
    6292              :  */
    6293              : static inline int
    6294    114062339 : BufferLockDisownInternal(Buffer buffer, BufferDesc *buf_hdr)
    6295              : {
    6296              :     BufferLockMode mode;
    6297              :     PrivateRefCountEntry *ref;
    6298              : 
    6299    114062339 :     ref = GetPrivateRefCountEntry(buffer, false);
    6300    114062339 :     if (ref == NULL)
    6301            0 :         elog(ERROR, "lock %d is not held", buffer);
    6302    114062339 :     mode = ref->data.lockmode;
    6303    114062339 :     ref->data.lockmode = BUFFER_LOCK_UNLOCK;
    6304              : 
    6305    114062339 :     return mode;
    6306              : }
    6307              : 
    6308              : /*
    6309              :  * Wake up all the lockers that currently have a chance to acquire the lock.
    6310              :  *
    6311              :  * wake_exclusive indicates whether exclusive lock waiters should be woken up.
                      :  *
                      :  * Works in three phases: (1) with the buffer header lock held, move every
                      :  * waiter that may proceed from buf_hdr->lock_waiters to a private list;
                      :  * (2) update the wake-in-progress / has-waiters flags and drop the header
                      :  * lock with a single compare-and-exchange loop; (3) without the header
                      :  * lock, mark each collected waiter as not waiting and signal its semaphore.
    6312              :  */
    6313              : static void
    6314        20116 : BufferLockWakeup(BufferDesc *buf_hdr, bool wake_exclusive)
    6315              : {
    6316        20116 :     bool        new_wake_in_progress = false;
    6317        20116 :     bool        wake_share_exclusive = true;
    6318              :     proclist_head wakeup;
    6319              :     proclist_mutable_iter iter;
    6320              : 
    6321        20116 :     proclist_init(&wakeup);
    6322              : 
    6323              :     /* lock wait list while collecting backends to wake up */
    6324        20116 :     LockBufHdr(buf_hdr);
    6325              : 
    6326        30086 :     proclist_foreach_modify(iter, &buf_hdr->lock_waiters, lwWaitLink)
    6327              :     {
    6328        21357 :         PGPROC     *waiter = GetPGProcByNumber(iter.cur);
    6329              : 
    6330              :         /*
    6331              :          * Already woke up a conflicting locker, so skip over this wait list
    6332              :          * entry.
    6333              :          */
    6334        21357 :         if (!wake_exclusive && waiter->lwWaitMode == BUFFER_LOCK_EXCLUSIVE)
    6335          605 :             continue;
    6336        20752 :         if (!wake_share_exclusive && waiter->lwWaitMode == BUFFER_LOCK_SHARE_EXCLUSIVE)
    6337            0 :             continue;
    6338              : 
                      :         /* Move the waiter from the lock's wait list to our private list */
    6339        20752 :         proclist_delete(&buf_hdr->lock_waiters, iter.cur, lwWaitLink);
    6340        20752 :         proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
    6341              : 
    6342              :         /*
    6343              :          * Prevent additional wakeups until retryer gets to run. Backends that
    6344              :          * are just waiting for the lock to become free don't retry
    6345              :          * automatically.
    6346              :          */
    6347        20752 :         new_wake_in_progress = true;
    6348              : 
    6349              :         /*
    6350              :          * Signal that the process isn't on the wait list anymore. This allows
    6351              :          * BufferLockDequeueSelf() to remove itself from the waitlist with a
    6352              :          * proclist_delete(), rather than having to check if it has been
    6353              :          * removed from the list.
    6354              :          */
    6355              :         Assert(waiter->lwWaiting == LW_WS_WAITING);
    6356        20752 :         waiter->lwWaiting = LW_WS_PENDING_WAKEUP;
    6357              : 
    6358              :         /*
    6359              :          * Don't wake up further waiters after waking a conflicting waiter.
    6360              :          */
    6361        20752 :         if (waiter->lwWaitMode == BUFFER_LOCK_SHARE)
    6362              :         {
    6363              :             /*
    6364              :              * Share locks conflict with exclusive locks.
    6365              :              */
    6366         9271 :             wake_exclusive = false;
    6367              :         }
    6368        11481 :         else if (waiter->lwWaitMode == BUFFER_LOCK_SHARE_EXCLUSIVE)
    6369              :         {
    6370              :             /*
    6371              :              * Share-exclusive locks conflict with share-exclusive and
    6372              :              * exclusive locks.
    6373              :              */
    6374           94 :             wake_exclusive = false;
    6375           94 :             wake_share_exclusive = false;
    6376              :         }
    6377        11387 :         else if (waiter->lwWaitMode == BUFFER_LOCK_EXCLUSIVE)
    6378              :         {
    6379              :             /*
    6380              :              * Exclusive locks conflict with all other locks, there's no point
    6381              :              * in waking up anybody else.
    6382              :              */
    6383        11387 :             break;
    6384              :         }
    6385              :     }
    6386              : 
    6387              :     Assert(proclist_is_empty(&wakeup) || pg_atomic_read_u64(&buf_hdr->state) & BM_LOCK_HAS_WAITERS);
    6388              : 
    6389              :     /*
                      :      * Unset required flags, and release the header lock, in one fell swoop.
                      :      * There is no UnlockBufHdr() call in this function; clearing BM_LOCKED
                      :      * in the compare-and-exchange below takes its place.
                      :      */
    6390              :     {
    6391              :         uint64      old_state;
    6392              :         uint64      desired_state;
    6393              : 
    6394        20116 :         old_state = pg_atomic_read_u64(&buf_hdr->state);
    6395              :         while (true)
    6396              :         {
    6397        20143 :             desired_state = old_state;
    6398              : 
    6399              :             /* compute desired flags */
    6400              : 
    6401        20143 :             if (new_wake_in_progress)
    6402        19821 :                 desired_state |= BM_LOCK_WAKE_IN_PROGRESS;
    6403              :             else
    6404          322 :                 desired_state &= ~BM_LOCK_WAKE_IN_PROGRESS;
    6405              : 
    6406        20143 :             if (proclist_is_empty(&buf_hdr->lock_waiters))
    6407        18259 :                 desired_state &= ~BM_LOCK_HAS_WAITERS;
    6408              : 
    6409        20143 :             desired_state &= ~BM_LOCKED;    /* release lock */
    6410              : 
    6411        20143 :             if (pg_atomic_compare_exchange_u64(&buf_hdr->state, &old_state,
    6412              :                                                desired_state))
    6413        20116 :                 break;
    6414              :         }
    6415              :     }
    6416              : 
    6417              :     /* Awaken any waiters I removed from the queue. */
    6418        40868 :     proclist_foreach_modify(iter, &wakeup, lwWaitLink)
    6419              :     {
    6420        20752 :         PGPROC     *waiter = GetPGProcByNumber(iter.cur);
    6421              : 
    6422        20752 :         proclist_delete(&wakeup, iter.cur, lwWaitLink);
    6423              : 
    6424              :         /*
    6425              :          * Guarantee that lwWaiting being unset only becomes visible once the
    6426              :          * unlink from the list has completed. Otherwise the target backend
    6427              :          * could be woken up for another reason and enqueue for a new lock -
    6428              :          * if that happens before the list unlink happens, the list would end
    6429              :          * up being corrupted.
    6430              :          *
    6431              :          * The barrier pairs with the LockBufHdr() when enqueuing for another
    6432              :          * lock.
    6433              :          */
    6434        20752 :         pg_write_barrier();
    6435        20752 :         waiter->lwWaiting = LW_WS_NOT_WAITING;
    6436        20752 :         PGSemaphoreUnlock(waiter->sem);
    6437              :     }
    6438        20116 : }
    6439              : 
    6440              : /*
    6441              :  * Compute subtraction from buffer state for a release of a held lock in
    6442              :  * `mode`.
    6443              :  *
    6444              :  * This is separated from BufferLockUnlock() as we want to combine the lock
    6445              :  * release with other atomic operations when possible, leading to the lock
    6446              :  * release being done in multiple places, each needing to compute what to
    6447              :  * subtract from the lock state.
                      :  *
                      :  * Returns the BM_LOCK_VAL_* constant matching `mode`.  `mode` must be one
                      :  * of the three real lock modes; anything other than exclusive or
                      :  * share-exclusive is asserted to be BUFFER_LOCK_SHARE.
    6448              :  */
    6449              : static inline uint64
    6450    114062339 : BufferLockReleaseSub(BufferLockMode mode)
    6451              : {
    6452              :     /*
    6453              :      * Turns out that a switch() leads gcc to generate sufficiently worse code
    6454              :      * for this to show up in profiles, hence the if/else chain.
    6455              :      */
    6456    114062339 :     if (mode == BUFFER_LOCK_EXCLUSIVE)
    6457     38521347 :         return BM_LOCK_VAL_EXCLUSIVE;
    6458     75540992 :     else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
    6459      4109253 :         return BM_LOCK_VAL_SHARE_EXCLUSIVE;
    6460              :     else
    6461              :     {
    6462              :         Assert(mode == BUFFER_LOCK_SHARE);
    6463     71431739 :         return BM_LOCK_VAL_SHARED;
    6464              :     }
    6465              : 
    6466              :     return 0;                   /* not reached; keep compiler quiet */
    6467              : }
    6468              : 
    6469              : /*
    6470              :  * Handle work that needs to be done after releasing a lock that was held in
    6471              :  * `mode`, where `lockstate` is the result of the atomic operation modifying
    6472              :  * the state variable.
    6473              :  *
    6474              :  * This is separated from BufferLockUnlock() as we want to combine the lock
    6475              :  * release with other atomic operations when possible, leading to the lock
    6476              :  * release being done in multiple places.
                      :  *
                      :  * The only work done here is deciding, from the flag and lock bits in
                      :  * `lockstate`, whether BufferLockWakeup() has to be called and whether it
                      :  * may wake exclusive waiters.
    6477              :  */
    6478              : static void
    6479    114062339 : BufferLockProcessRelease(BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate)
    6480              : {
    6481    114062339 :     bool        check_waiters = false;
    6482    114062339 :     bool        wake_exclusive = false;
    6483              : 
    6484              :     /* nobody else can have that kind of lock */
    6485              :     Assert(!(lockstate & BM_LOCK_VAL_EXCLUSIVE));
    6486              : 
    6487              :     /*
    6488              :      * If we're still waiting for backends to get scheduled, don't wake them
    6489              :      * up again. Otherwise check if we need to look through the waitqueue to
    6490              :      * wake other backends.
    6491              :      */
    6492    114062339 :     if ((lockstate & BM_LOCK_HAS_WAITERS) &&
    6493        96155 :         !(lockstate & BM_LOCK_WAKE_IN_PROGRESS))
    6494              :     {
    6495        49655 :         if ((lockstate & BM_LOCK_MASK) == 0)
    6496              :         {
    6497              :             /*
    6498              :              * We released a lock and the lock was, in that moment, free. We
    6499              :              * therefore can wake waiters for any kind of lock.
    6500              :              */
    6501        20113 :             check_waiters = true;
    6502        20113 :             wake_exclusive = true;
    6503              :         }
    6504        29542 :         else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
    6505              :         {
    6506              :             /*
    6507              :              * We released the lock, but another backend still holds a lock.
    6508              :              * We can't have released an exclusive lock, as there couldn't
    6509              :              * have been other lock holders. If we released a share lock, no
    6510              :              * waiters need to be woken up, as there must be other share
    6511              :              * lockers. However, if we held a share-exclusive lock, another
    6512              :              * backend now could acquire a share-exclusive lock.
    6513              :              */
    6514            3 :             check_waiters = true;
    6515            3 :             wake_exclusive = false;
    6516              :         }
    6517              :     }
    6518              : 
    6519              :     /*
    6520              :      * As waking up waiters requires the buffer header spinlock to be
    6521              :      * acquired, only do so if necessary.
    6522              :      */
    6523    114062339 :     if (check_waiters)
    6524        20116 :         BufferLockWakeup(buf_hdr, wake_exclusive);
    6525    114062339 : }
    6526              : 
    6527              : /*
    6528              :  * BufferLockHeldByMeInMode - test whether my process holds the content lock
    6529              :  * in the specified mode
    6530              :  *
                      :  * The answer comes solely from the lock mode recorded in this backend's
                      :  * private refcount entry; false is returned if there is no such entry.
                      :  *
    6531              :  * This is meant as debug support only.
    6532              :  */
    6533              : static bool
    6534            0 : BufferLockHeldByMeInMode(BufferDesc *buf_hdr, BufferLockMode mode)
    6535              : {
    6536              :     PrivateRefCountEntry *entry =
    6537            0 :         GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf_hdr), false);
    6538              : 
    6539            0 :     if (!entry)
    6540            0 :         return false;
    6541              :     else
    6542            0 :         return entry->data.lockmode == mode;
    6543              : }
    6544              : 
    6545              : /*
    6546              :  * BufferLockHeldByMe - test whether my process holds the content lock in any
    6547              :  * mode
    6548              :  *
                      :  * Returns true if this backend's private refcount entry for the buffer
                      :  * records a lock mode other than BUFFER_LOCK_UNLOCK, and false if it does
                      :  * not or if there is no such entry.
                      :  *
    6549              :  * This is meant as debug support only.
    6550              :  */
    6551              : static bool
    6552            0 : BufferLockHeldByMe(BufferDesc *buf_hdr)
    6553              : {
    6554              :     PrivateRefCountEntry *entry =
    6555            0 :         GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf_hdr), false);
    6556              : 
    6557            0 :     if (!entry)
    6558            0 :         return false;
    6559              :     else
    6560            0 :         return entry->data.lockmode != BUFFER_LOCK_UNLOCK;
    6561              : }
    6562              : 
    6563              : /*
    6564              :  * Release the content lock for the buffer.
    6565              :  */
    6566              : void
    6567     69568970 : UnlockBuffer(Buffer buffer)
    6568              : {
    6569              :     BufferDesc *buf_hdr;
    6570              : 
    6571              :     Assert(BufferIsPinned(buffer));
    6572     69568970 :     if (BufferIsLocal(buffer))
    6573      5200552 :         return;                 /* local buffers need no lock */
    6574              : 
    6575     64368418 :     buf_hdr = GetBufferDescriptor(buffer - 1);
    6576     64368418 :     BufferLockUnlock(buffer, buf_hdr);
    6577              : }
    6578              : 
    6579              : /*
    6580              :  * Acquire the content_lock for the buffer.
    6581              :  */
    6582              : void
    6583    118050394 : LockBufferInternal(Buffer buffer, BufferLockMode mode)
    6584              : {
    6585              :     BufferDesc *buf_hdr;
    6586              : 
    6587              :     /*
    6588              :      * We can't wait if we haven't got a PGPROC.  This should only occur
    6589              :      * during bootstrap or shared memory initialization.  Put an Assert here
    6590              :      * to catch unsafe coding practices.
    6591              :      */
    6592              :     Assert(!(MyProc == NULL && IsUnderPostmaster));
    6593              : 
    6594              :     /* handled in LockBuffer() wrapper */
    6595              :     Assert(mode != BUFFER_LOCK_UNLOCK);
    6596              : 
    6597              :     Assert(BufferIsPinned(buffer));
    6598    118050394 :     if (BufferIsLocal(buffer))
    6599      6470475 :         return;                 /* local buffers need no lock */
    6600              : 
    6601    111579919 :     buf_hdr = GetBufferDescriptor(buffer - 1);
    6602              : 
    6603              :     /*
    6604              :      * Test the most frequent lock modes first. While a switch (mode) would be
    6605              :      * nice, at least gcc generates considerably worse code for it.
    6606              :      *
    6607              :      * Call BufferLockAcquire() with a constant argument for mode, to generate
    6608              :      * more efficient code for the different lock modes.
    6609              :      */
    6610    111579919 :     if (mode == BUFFER_LOCK_SHARE)
    6611     74816880 :         BufferLockAcquire(buffer, buf_hdr, BUFFER_LOCK_SHARE);
    6612     36763039 :     else if (mode == BUFFER_LOCK_EXCLUSIVE)
    6613     36763039 :         BufferLockAcquire(buffer, buf_hdr, BUFFER_LOCK_EXCLUSIVE);
    6614            0 :     else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
    6615            0 :         BufferLockAcquire(buffer, buf_hdr, BUFFER_LOCK_SHARE_EXCLUSIVE);
    6616              :     else
    6617            0 :         elog(ERROR, "unrecognized buffer lock mode: %d", mode);
    6618              : }
    6619              : 
    6620              : /*
    6621              :  * Acquire the content_lock for the buffer, but only if we don't have to wait.
    6622              :  *
    6623              :  * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
    6624              :  */
    6625              : bool
    6626      1845202 : ConditionalLockBuffer(Buffer buffer)
    6627              : {
    6628              :     BufferDesc *buf;
    6629              : 
    6630              :     Assert(BufferIsPinned(buffer));
    6631      1845202 :     if (BufferIsLocal(buffer))
    6632        86177 :         return true;            /* act as though we got it */
    6633              : 
    6634      1759025 :     buf = GetBufferDescriptor(buffer - 1);
    6635              : 
    6636      1759025 :     return BufferLockConditional(buffer, buf, BUFFER_LOCK_EXCLUSIVE);
    6637              : }
    6638              : 
    6639              : /*
    6640              :  * Verify that this backend is pinning the buffer exactly once.
    6641              :  *
    6642              :  * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
    6643              :  * holds a pin on the buffer.  We do not care whether some other backend does.
    6644              :  */
    6645              : void
    6646      2755696 : CheckBufferIsPinnedOnce(Buffer buffer)
    6647              : {
    6648      2755696 :     if (BufferIsLocal(buffer))
    6649              :     {
    6650         1049 :         if (LocalRefCount[-buffer - 1] != 1)
    6651            0 :             elog(ERROR, "incorrect local pin count: %d",
    6652              :                  LocalRefCount[-buffer - 1]);
    6653              :     }
    6654              :     else
    6655              :     {
    6656      2754647 :         if (GetPrivateRefCount(buffer) != 1)
    6657            0 :             elog(ERROR, "incorrect local pin count: %d",
    6658              :                  GetPrivateRefCount(buffer));
    6659              :     }
    6660      2755696 : }
    6661              : 
    6662              : /*
    6663              :  * LockBufferForCleanup - lock a buffer in preparation for deleting items
    6664              :  *
    6665              :  * Items may be deleted from a disk page only when the caller (a) holds an
    6666              :  * exclusive lock on the buffer and (b) has observed that no other backend
    6667              :  * holds a pin on the buffer.  If there is a pin, then the other backend
    6668              :  * might have a pointer into the buffer (for example, a heapscan reference
    6669              :  * to an item --- see README for more details).  It's OK if a pin is added
    6670              :  * after the cleanup starts, however; the newly-arrived backend will be
    6671              :  * unable to look at the page until we release the exclusive lock.
    6672              :  *
    6673              :  * To implement this protocol, a would-be deleter must pin the buffer and
    6674              :  * then call LockBufferForCleanup().  LockBufferForCleanup() is similar to
    6675              :  * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
    6676              :  * it has successfully observed pin count = 1.
    6677              :  */
    6678              : void
    6679        27940 : LockBufferForCleanup(Buffer buffer)
    6680              : {
    6681              :     BufferDesc *bufHdr;
    6682        27940 :     TimestampTz waitStart = 0;
    6683        27940 :     bool        waiting = false;
    6684        27940 :     bool        logged_recovery_conflict = false;
    6685              : 
    6686              :     Assert(BufferIsPinned(buffer));
    6687              :     Assert(PinCountWaitBuf == NULL);
    6688              : 
    6689        27940 :     CheckBufferIsPinnedOnce(buffer);
    6690              : 
    6691              :     /*
    6692              :      * We do not yet need to be worried about in-progress AIOs holding a pin,
    6693              :      * as we, so far, only support doing reads via AIO and this function can
    6694              :      * only be called once the buffer is valid (i.e. no read can be in
    6695              :      * flight).
    6696              :      */
    6697              : 
    6698              :     /* Nobody else to wait for */
    6699        27940 :     if (BufferIsLocal(buffer))
    6700           18 :         return;
    6701              : 
    6702        27922 :     bufHdr = GetBufferDescriptor(buffer - 1);
    6703              : 
    6704              :     for (;;)
    6705           70 :     {
    6706              :         uint64      buf_state;
    6707        27992 :         uint64      unset_bits = 0;
    6708              : 
    6709              :         /* Try to acquire lock */
    6710        27992 :         LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    6711        27992 :         buf_state = LockBufHdr(bufHdr);
    6712              : 
    6713              :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    6714        27992 :         if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
    6715              :         {
    6716              :             /* Successfully acquired exclusive lock with pincount 1 */
    6717        27922 :             UnlockBufHdr(bufHdr);
    6718              : 
    6719              :             /*
    6720              :              * Emit the log message if recovery conflict on buffer pin was
    6721              :              * resolved but the startup process waited longer than
    6722              :              * deadlock_timeout for it.
    6723              :              */
    6724        27922 :             if (logged_recovery_conflict)
    6725            2 :                 LogRecoveryConflict(RECOVERY_CONFLICT_BUFFERPIN,
    6726              :                                     waitStart, GetCurrentTimestamp(),
    6727              :                                     NULL, false);
    6728              : 
    6729        27922 :             if (waiting)
    6730              :             {
    6731              :                 /* reset ps display to remove the suffix if we added one */
    6732            2 :                 set_ps_display_remove_suffix();
    6733            2 :                 waiting = false;
    6734              :             }
    6735        27922 :             return;
    6736              :         }
    6737              :         /* Failed, so mark myself as waiting for pincount 1 */
    6738           70 :         if (buf_state & BM_PIN_COUNT_WAITER)
    6739              :         {
    6740            0 :             UnlockBufHdr(bufHdr);
    6741            0 :             LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    6742            0 :             elog(ERROR, "multiple backends attempting to wait for pincount 1");
    6743              :         }
    6744           70 :         bufHdr->wait_backend_pgprocno = MyProcNumber;
    6745           70 :         PinCountWaitBuf = bufHdr;
    6746           70 :         UnlockBufHdrExt(bufHdr, buf_state,
    6747              :                         BM_PIN_COUNT_WAITER, 0,
    6748              :                         0);
    6749           70 :         LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    6750              : 
    6751              :         /* Wait to be signaled by UnpinBuffer() */
    6752           70 :         if (InHotStandby)
    6753              :         {
    6754            9 :             if (!waiting)
    6755              :             {
    6756              :                 /* adjust the process title to indicate that it's waiting */
    6757            2 :                 set_ps_display_suffix("waiting");
    6758            2 :                 waiting = true;
    6759              :             }
    6760              : 
    6761              :             /*
    6762              :              * Emit the log message if the startup process is waiting longer
    6763              :              * than deadlock_timeout for recovery conflict on buffer pin.
    6764              :              *
    6765              :              * Skip this if first time through because the startup process has
    6766              :              * not started waiting yet in this case. So, the wait start
    6767              :              * timestamp is set after this logic.
    6768              :              */
    6769            9 :             if (waitStart != 0 && !logged_recovery_conflict)
    6770              :             {
    6771            3 :                 TimestampTz now = GetCurrentTimestamp();
    6772              : 
    6773            3 :                 if (TimestampDifferenceExceeds(waitStart, now,
    6774              :                                                DeadlockTimeout))
    6775              :                 {
    6776            2 :                     LogRecoveryConflict(RECOVERY_CONFLICT_BUFFERPIN,
    6777              :                                         waitStart, now, NULL, true);
    6778            2 :                     logged_recovery_conflict = true;
    6779              :                 }
    6780              :             }
    6781              : 
    6782              :             /*
    6783              :              * Set the wait start timestamp if logging is enabled and first
    6784              :              * time through.
    6785              :              */
    6786            9 :             if (log_recovery_conflict_waits && waitStart == 0)
    6787            2 :                 waitStart = GetCurrentTimestamp();
    6788              : 
    6789              :             /* Publish the bufid that Startup process waits on */
    6790            9 :             SetStartupBufferPinWaitBufId(buffer - 1);
    6791              :             /* Set alarm and then wait to be signaled by UnpinBuffer() */
    6792            9 :             ResolveRecoveryConflictWithBufferPin();
    6793              :             /* Reset the published bufid */
    6794            9 :             SetStartupBufferPinWaitBufId(-1);
    6795              :         }
    6796              :         else
    6797           61 :             ProcWaitForSignal(WAIT_EVENT_BUFFER_CLEANUP);
    6798              : 
    6799              :         /*
    6800              :          * Remove flag marking us as waiter. Normally this will not be set
    6801              :          * anymore, but ProcWaitForSignal() can return for other signals as
    6802              :          * well.  We take care to only reset the flag if we're the waiter, as
    6803              :          * theoretically another backend could have started waiting. That's
    6804              :          * impossible with the current usages due to table level locking, but
    6805              :          * better be safe.
    6806              :          */
    6807           70 :         buf_state = LockBufHdr(bufHdr);
    6808           70 :         if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
    6809            7 :             bufHdr->wait_backend_pgprocno == MyProcNumber)
    6810            7 :             unset_bits |= BM_PIN_COUNT_WAITER;
    6811              : 
    6812           70 :         UnlockBufHdrExt(bufHdr, buf_state,
    6813              :                         0, unset_bits,
    6814              :                         0);
    6815              : 
    6816           70 :         PinCountWaitBuf = NULL;
    6817              :         /* Loop back and try again */
    6818              :     }
    6819              : }
    6820              : 
    6821              : /*
    6822              :  * Check called from ProcessRecoveryConflictInterrupts() when Startup process
    6823              :  * requests cancellation of all pin holders that are blocking it.
    6824              :  */
    6825              : bool
    6826            3 : HoldingBufferPinThatDelaysRecovery(void)
    6827              : {
    6828            3 :     int         bufid = GetStartupBufferPinWaitBufId();
    6829              : 
    6830              :     /*
    6831              :      * If we get woken slowly then it's possible that the Startup process was
    6832              :      * already woken by other backends before we got here. Also possible that
    6833              :      * we get here by multiple interrupts or interrupts at inappropriate
    6834              :      * times, so make sure we do nothing if the bufid is not set.
    6835              :      */
    6836            3 :     if (bufid < 0)
    6837            1 :         return false;
    6838              : 
    6839            2 :     if (GetPrivateRefCount(bufid + 1) > 0)
    6840            2 :         return true;
    6841              : 
    6842            0 :     return false;
    6843              : }
    6844              : 
    6845              : /*
    6846              :  * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
    6847              :  *
    6848              :  * We won't loop, but just check once to see if the pin count is OK.  If
    6849              :  * not, return false with no lock held.
    6850              :  */
    6851              : bool
    6852       603540 : ConditionalLockBufferForCleanup(Buffer buffer)
    6853              : {
    6854              :     BufferDesc *bufHdr;
    6855              :     uint64      buf_state,
    6856              :                 refcount;
    6857              : 
    6858              :     Assert(BufferIsValid(buffer));
    6859              : 
    6860              :     /* see AIO related comment in LockBufferForCleanup() */
    6861              : 
    6862       603540 :     if (BufferIsLocal(buffer))
    6863              :     {
    6864        12372 :         refcount = LocalRefCount[-buffer - 1];
    6865              :         /* There should be exactly one pin */
    6866              :         Assert(refcount > 0);
    6867        12372 :         if (refcount != 1)
    6868         1540 :             return false;
    6869              :         /* Nobody else to wait for */
    6870        10832 :         return true;
    6871              :     }
    6872              : 
    6873              :     /* There should be exactly one local pin */
    6874       591168 :     refcount = GetPrivateRefCount(buffer);
    6875              :     Assert(refcount);
    6876       591168 :     if (refcount != 1)
    6877          312 :         return false;
    6878              : 
    6879              :     /* Try to acquire lock */
    6880       590856 :     if (!ConditionalLockBuffer(buffer))
    6881           68 :         return false;
    6882              : 
    6883       590788 :     bufHdr = GetBufferDescriptor(buffer - 1);
    6884       590788 :     buf_state = LockBufHdr(bufHdr);
    6885       590788 :     refcount = BUF_STATE_GET_REFCOUNT(buf_state);
    6886              : 
    6887              :     Assert(refcount > 0);
    6888       590788 :     if (refcount == 1)
    6889              :     {
    6890              :         /* Successfully acquired exclusive lock with pincount 1 */
    6891       590558 :         UnlockBufHdr(bufHdr);
    6892       590558 :         return true;
    6893              :     }
    6894              : 
    6895              :     /* Failed, so release the lock */
    6896          230 :     UnlockBufHdr(bufHdr);
    6897          230 :     LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    6898          230 :     return false;
    6899              : }
    6900              : 
    6901              : /*
    6902              :  * IsBufferCleanupOK - as above, but we already have the lock
    6903              :  *
    6904              :  * Check whether it's OK to perform cleanup on a buffer we've already
    6905              :  * locked.  If we observe that the pin count is 1, our exclusive lock
    6906              :  * happens to be a cleanup lock, and we can proceed with anything that
    6907              :  * would have been allowable had we sought a cleanup lock originally.
    6908              :  */
    6909              : bool
    6910         2306 : IsBufferCleanupOK(Buffer buffer)
    6911              : {
    6912              :     BufferDesc *bufHdr;
    6913              :     uint64      buf_state;
    6914              : 
    6915              :     Assert(BufferIsValid(buffer));
    6916              : 
    6917              :     /* see AIO related comment in LockBufferForCleanup() */
    6918              : 
    6919         2306 :     if (BufferIsLocal(buffer))
    6920              :     {
    6921              :         /* There should be exactly one pin */
    6922            0 :         if (LocalRefCount[-buffer - 1] != 1)
    6923            0 :             return false;
    6924              :         /* Nobody else to wait for */
    6925            0 :         return true;
    6926              :     }
    6927              : 
    6928              :     /* There should be exactly one local pin */
    6929         2306 :     if (GetPrivateRefCount(buffer) != 1)
    6930            0 :         return false;
    6931              : 
    6932         2306 :     bufHdr = GetBufferDescriptor(buffer - 1);
    6933              : 
    6934              :     /* caller must hold exclusive lock on buffer */
    6935              :     Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
    6936              : 
    6937         2306 :     buf_state = LockBufHdr(bufHdr);
    6938              : 
    6939              :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    6940         2306 :     if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
    6941              :     {
    6942              :         /* pincount is OK. */
    6943         2306 :         UnlockBufHdr(bufHdr);
    6944         2306 :         return true;
    6945              :     }
    6946              : 
    6947            0 :     UnlockBufHdr(bufHdr);
    6948            0 :     return false;
    6949              : }
    6950              : 
    6951              : /*
    6952              :  * Helper for BufferBeginSetHintBits() and BufferSetHintBits16().
    6953              :  *
    6954              :  * This checks if the current lock mode already suffices to allow hint bits
    6955              :  * being set and, if not, whether the current lock can be upgraded.
    6956              :  *
    6957              :  * Updates *lockstate when returning true.
    6958              :  */
    6959              : static inline bool
    6960     15172213 : SharedBufferBeginSetHintBits(Buffer buffer, BufferDesc *buf_hdr, uint64 *lockstate)
    6961              : {
    6962              :     uint64      old_state;
    6963              :     PrivateRefCountEntry *ref;
    6964              :     BufferLockMode mode;
    6965              : 
    6966     15172213 :     ref = GetPrivateRefCountEntry(buffer, true);
    6967              : 
    6968     15172213 :     if (ref == NULL)
    6969            0 :         elog(ERROR, "buffer is not pinned");
    6970              : 
    6971     15172213 :     mode = ref->data.lockmode;
    6972     15172213 :     if (mode == BUFFER_LOCK_UNLOCK)
    6973            0 :         elog(ERROR, "buffer is not locked");
    6974              : 
    6975              :     /* we're done if we are already holding a sufficient lock level */
    6976     15172213 :     if (mode == BUFFER_LOCK_EXCLUSIVE || mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
    6977              :     {
    6978     11786984 :         *lockstate = pg_atomic_read_u64(&buf_hdr->state);
    6979     11786984 :         return true;
    6980              :     }
    6981              : 
    6982              :     /*
    6983              :      * We are only holding a share lock right now, try to upgrade it to
    6984              :      * SHARE_EXCLUSIVE.
    6985              :      */
    6986              :     Assert(mode == BUFFER_LOCK_SHARE);
    6987              : 
    6988      3385229 :     old_state = pg_atomic_read_u64(&buf_hdr->state);
    6989              :     while (true)
    6990           19 :     {
    6991              :         uint64      desired_state;
    6992              : 
    6993      3385248 :         desired_state = old_state;
    6994              : 
    6995              :         /*
    6996              :          * Can't upgrade if somebody else holds the lock in exclusive or
    6997              :          * share-exclusive mode.
    6998              :          */
    6999      3385248 :         if (unlikely((old_state & (BM_LOCK_VAL_EXCLUSIVE | BM_LOCK_VAL_SHARE_EXCLUSIVE)) != 0))
    7000              :         {
    7001           88 :             return false;
    7002              :         }
    7003              : 
    7004              :         /* currently held lock state */
    7005      3385160 :         desired_state -= BM_LOCK_VAL_SHARED;
    7006              : 
    7007              :         /* new lock level */
    7008      3385160 :         desired_state += BM_LOCK_VAL_SHARE_EXCLUSIVE;
    7009              : 
    7010      3385160 :         if (likely(pg_atomic_compare_exchange_u64(&buf_hdr->state,
    7011              :                                                   &old_state, desired_state)))
    7012              :         {
    7013      3385141 :             ref->data.lockmode = BUFFER_LOCK_SHARE_EXCLUSIVE;
    7014      3385141 :             *lockstate = desired_state;
    7015              : 
    7016      3385141 :             return true;
    7017              :         }
    7018              :     }
    7019              : }
    7020              : 
    7021              : /*
    7022              :  * Try to acquire the right to set hint bits on the buffer.
    7023              :  *
    7024              :  * To be allowed to set hint bits, this backend needs to hold either a
    7025              :  * share-exclusive or an exclusive lock. In case this backend only holds a
    7026              :  * share lock, this function will try to upgrade the lock to
    7027              :  * share-exclusive. The caller is only allowed to set hint bits if true is
    7028              :  * returned.
    7029              :  *
    7030              :  * Once BufferBeginSetHintBits() has returned true, hint bits may be set
    7031              :  * without further calls to BufferBeginSetHintBits(), until the buffer is
    7032              :  * unlocked.
    7033              :  *
    7034              :  *
    7035              :  * Requiring a share-exclusive lock to set hint bits prevents setting hint
    7036              :  * bits on buffers that are currently being written out, which could corrupt
    7037              :  * the checksum on the page. Flushing buffers also requires a share-exclusive
    7038              :  * lock.
    7039              :  *
    7040              :  * Due to a lock >= share-exclusive being required to set hint bits, only one
    7041              :  * backend can set hint bits at a time. Allowing multiple backends to set hint
    7042              :  * bits would require more complicated locking: For setting hint bits we'd
    7043              :  * need to store the count of backends currently setting hint bits, for I/O we
    7044              :  * would need another lock-level conflicting with the hint-setting
    7045              :  * lock-level. Given that the share-exclusive lock for setting hint bits is
    7046              :  * only held for a short time, that backends often would just set the same
    7047              :  * hint bits and that the cost of occasionally not setting hint bits in hotly
    7048              :  * accessed pages is fairly low, this seems like an acceptable tradeoff.
    7049              :  */
    7050              : bool
    7051       426578 : BufferBeginSetHintBits(Buffer buffer)
    7052              : {
    7053              :     BufferDesc *buf_hdr;
    7054              :     uint64      lockstate;
    7055              : 
    7056       426578 :     if (BufferIsLocal(buffer))
    7057              :     {
    7058              :         /*
    7059              :          * NB: Will need to check if there is a write in progress, once it is
    7060              :          * possible for writes to be done asynchronously.
    7061              :          */
    7062         2388 :         return true;
    7063              :     }
    7064              : 
    7065       424190 :     buf_hdr = GetBufferDescriptor(buffer - 1);
    7066              : 
    7067       424190 :     return SharedBufferBeginSetHintBits(buffer, buf_hdr, &lockstate);
    7068              : }
    7069              : 
    7070              : /*
    7071              :  * End a phase of setting hint bits on this buffer, started with
    7072              :  * BufferBeginSetHintBits().
    7073              :  *
    7074              :  * This would strictly speaking not be required (i.e. the caller could do
    7075              :  * MarkBufferDirtyHint() if so desired), but allows us to perform some sanity
    7076              :  * checks.
    7077              :  */
    7078              : void
    7079       426563 : BufferFinishSetHintBits(Buffer buffer, bool mark_dirty, bool buffer_std)
    7080              : {
    7081              :     if (!BufferIsLocal(buffer))
    7082              :         Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_SHARE_EXCLUSIVE) ||
    7083              :                BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
    7084              : 
    7085       426563 :     if (mark_dirty)
    7086       234237 :         MarkBufferDirtyHint(buffer, buffer_std);
    7087       426563 : }
    7088              : 
    7089              : /*
    7090              :  * Try to set hint bits on a single 16bit value in a buffer.
    7091              :  *
    7092              :  * If hint bits are allowed to be set, set *ptr = val, try to mark the buffer
    7093              :  * dirty and return true. Otherwise false is returned.
    7094              :  *
    7095              :  * *ptr needs to be a pointer to memory within the buffer.
    7096              :  *
    7097              :  * This is a bit faster than BufferBeginSetHintBits() /
    7098              :  * BufferFinishSetHintBits() when setting hints once in a buffer, but slower
    7099              :  * than the former when setting hint bits multiple times in the same buffer.
    7100              :  */
    7101              : bool
    7102     15551556 : BufferSetHintBits16(uint16 *ptr, uint16 val, Buffer buffer)
    7103              : {
    7104              :     BufferDesc *buf_hdr;
    7105              :     uint64      lockstate;
    7106              : #ifdef USE_ASSERT_CHECKING
    7107              :     char       *page;
    7108              : 
    7109              :     /* verify that the address is on the page */
    7110              :     page = BufferGetPage(buffer);
    7111              :     Assert((char *) ptr >= page && (char *) ptr < (page + BLCKSZ));
    7112              : #endif
    7113              : 
    7114     15551556 :     if (BufferIsLocal(buffer))
    7115              :     {
    7116       803533 :         *ptr = val;
    7117              : 
    7118       803533 :         MarkLocalBufferDirty(buffer);
    7119              : 
    7120       803533 :         return true;
    7121              :     }
    7122              : 
    7123     14748023 :     buf_hdr = GetBufferDescriptor(buffer - 1);
    7124              : 
    7125     14748023 :     if (SharedBufferBeginSetHintBits(buffer, buf_hdr, &lockstate))
    7126              :     {
    7127     14747950 :         *ptr = val;
    7128              : 
    7129     14747950 :         MarkSharedBufferDirtyHint(buffer, buf_hdr, lockstate, true);
    7130              : 
    7131     14747950 :         return true;
    7132              :     }
    7133              : 
    7134           73 :     return false;
    7135              : }
    7136              : 
    7137              : 
    7138              : /*
    7139              :  *  Functions for buffer I/O handling
    7140              :  *
    7141              :  *  Also note that these are used only for shared buffers, not local ones.
    7142              :  */
    7143              : 
    7144              : /*
    7145              :  * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
    7146              :  */
    7147              : static void
    7148          259 : WaitIO(BufferDesc *buf)
    7149              : {
    7150          259 :     ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
    7151              : 
    7152              :     /*
    7153              :      * Should never end up here with unsubmitted IO, as no AIO unaware code
    7154              :      * may be used while in batch mode and AIO aware code needs to have
    7155              :      * submitted all staged IO to avoid deadlocks & slowness.
    7156              :      */
    7157              :     Assert(!pgaio_have_staged());
    7158              : 
    7159          259 :     ConditionVariablePrepareToSleep(cv);
    7160              :     for (;;)
    7161          259 :     {
    7162              :         uint64      buf_state;
    7163              :         PgAioWaitRef iow;
    7164              : 
    7165              :         /*
    7166              :          * It may not be necessary to acquire the spinlock to check the flag
    7167              :          * here, but since this test is essential for correctness, we'd better
    7168              :          * play it safe.
    7169              :          */
    7170          518 :         buf_state = LockBufHdr(buf);
    7171              : 
    7172              :         /*
    7173              :          * Copy the wait reference while holding the spinlock. This protects
    7174              :          * against a concurrent TerminateBufferIO() in another backend from
    7175              :          * clearing the wref while it's being read.
    7176              :          */
    7177          518 :         iow = buf->io_wref;
    7178          518 :         UnlockBufHdr(buf);
    7179              : 
    7180              :         /* no IO in progress, we don't need to wait */
    7181          518 :         if (!(buf_state & BM_IO_IN_PROGRESS))
    7182          259 :             break;
    7183              : 
    7184              :         /*
    7185              :          * The buffer has asynchronous IO in progress, wait for it to
    7186              :          * complete.
    7187              :          */
    7188          259 :         if (pgaio_wref_valid(&iow))
    7189              :         {
    7190           42 :             pgaio_wref_wait(&iow);
    7191              : 
    7192              :             /*
    7193              :              * The AIO subsystem internally uses condition variables and thus
    7194              :              * might remove this backend from the BufferDesc's CV. While that
    7195              :              * wouldn't cause a correctness issue (the first CV sleep just
    7196              :              * immediately returns if not already registered), it seems worth
    7197              :              * avoiding unnecessary loop iterations, given that we take care
    7198              :              * to do so at the start of the function.
    7199              :              */
    7200           42 :             ConditionVariablePrepareToSleep(cv);
    7201           42 :             continue;
    7202              :         }
    7203              : 
    7204              :         /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
    7205          217 :         ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
    7206              :     }
    7207          259 :     ConditionVariableCancelSleep();
    7208          259 : }
    7209              : 
    7210              : /*
    7211              :  * StartSharedBufferIO: begin I/O on this buffer
    7212              :  *  (Assumptions)
    7213              :  *  The buffer is Pinned
    7214              :  *
    7215              :  * In several scenarios the buffer may already be undergoing I/O in this or
    7216              :  * another backend. How to best handle that depends on the caller's
    7217              :  * situation. It might be appropriate to wait synchronously (e.g., because the
    7218              :  * buffer is about to be invalidated); wait asynchronously, using the buffer's
    7219              :  * IO wait reference (e.g., because the caller is doing readahead and doesn't
    7220              :  * need the buffer to be ready immediately); or to not wait at all (e.g.,
    7221              :  * because the caller is trying to combine IO for this buffer with another
    7222              :  * buffer).
    7223              :  *
    7224              :  * How and whether to wait is controlled by the wait and io_wref
    7225              :  * parameters. In detail:
    7226              :  *
    7227              :  * - If the caller passes a non-NULL io_wref and the buffer has an I/O wait
    7228              :  *   reference, *io_wref is set to the buffer's io_wref and
    7229              :  *   BUFFER_IO_IN_PROGRESS is returned. This is done regardless of the wait
    7230              :  *   parameter.
    7231              :  *
    7232              :  * - If the caller passes a NULL io_wref (i.e. the caller does not want to
    7233              :  *   asynchronously wait for the completion of the IO), wait = false and the
    7234              :  *   buffer is undergoing IO, BUFFER_IO_IN_PROGRESS is returned.
    7235              :  *
    7236              :  * - If wait = true and either the buffer does not have a wait reference,
    7237              :  *   or the caller passes io_wref = NULL, WaitIO() is used to wait for the IO
    7238              :  *   to complete.  To avoid the potential of deadlocks and unnecessary delays,
    7239              :  *   all staged I/O is submitted before waiting.
    7240              :  *
    7241              :  * Input operations are only attempted on buffers that are not BM_VALID, and
    7242              :  * output operations only on buffers that are BM_VALID and BM_DIRTY, so we can
    7243              :  * always tell if the work is already done.  If no I/O is necessary,
    7244              :  * BUFFER_IO_ALREADY_DONE is returned.
    7245              :  *
    7246              :  * If we successfully marked the buffer as BM_IO_IN_PROGRESS,
    7247              :  * BUFFER_IO_READY_FOR_IO is returned.
    7248              :  */
    7249              : StartBufferIOResult
    7250      2942159 : StartSharedBufferIO(BufferDesc *buf, bool forInput, bool wait, PgAioWaitRef *io_wref)
    7251              : {
    7252              :     uint64      buf_state;
    7253              : 
    7254      2942159 :     ResourceOwnerEnlarge(CurrentResourceOwner);
    7255              : 
    7256              :     for (;;)
    7257              :     {
    7258      2942417 :         buf_state = LockBufHdr(buf);
    7259              : 
    7260      2942417 :         if (!(buf_state & BM_IO_IN_PROGRESS))
    7261      2939445 :             break;
    7262              : 
    7263              :         /* Join the existing IO */
    7264         2972 :         if (io_wref != NULL && pgaio_wref_valid(&buf->io_wref))
    7265              :         {
    7266         2706 :             *io_wref = buf->io_wref;
    7267         2706 :             UnlockBufHdr(buf);
    7268              : 
    7269         2706 :             return BUFFER_IO_IN_PROGRESS;
    7270              :         }
    7271          266 :         else if (!wait)
    7272              :         {
    7273            8 :             UnlockBufHdr(buf);
    7274            8 :             return BUFFER_IO_IN_PROGRESS;
    7275              :         }
    7276              :         else
    7277              :         {
    7278              :             /*
    7279              :              * With wait = true, we always have to wait if the caller has
    7280              :              * passed io_wref = NULL.
    7281              :              *
    7282              :              * Even with io_wref != NULL, we have to wait if the buffer's wait
    7283              :              * ref is not valid but the IO is in progress, someone else
    7284              :              * started IO but hasn't set the wait ref yet. We have no choice
    7285              :              * but to wait until the IO completes.
    7286              :              */
    7287          258 :             UnlockBufHdr(buf);
    7288              : 
    7289              :             /*
    7290              :              * If this backend currently has staged IO, submit it before
    7291              :              * waiting for in-progress IO, to avoid potential deadlocks and
    7292              :              * unnecessary delays.
    7293              :              */
    7294          258 :             pgaio_submit_staged();
    7295              : 
    7296          258 :             WaitIO(buf);
    7297              :         }
    7298              :     }
    7299              : 
    7300              :     /* Once we get here, there is definitely no I/O active on this buffer */
    7301              : 
    7302              :     /* Check if someone else already did the I/O */
    7303      2939445 :     if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
    7304              :     {
    7305          472 :         UnlockBufHdr(buf);
    7306          472 :         return BUFFER_IO_ALREADY_DONE;
    7307              :     }
    7308              : 
    7309              :     /*
    7310              :      * No IO in progress and not already done; we will start IO. It's possible
    7311              :      * that the IO was in progress but we're not done, because the IO errored
    7312              :      * out. We'll do the IO ourselves.
    7313              :      */
    7314      2938973 :     UnlockBufHdrExt(buf, buf_state,
    7315              :                     BM_IO_IN_PROGRESS, 0,
    7316              :                     0);
    7317              : 
    7318      2938973 :     ResourceOwnerRememberBufferIO(CurrentResourceOwner,
    7319              :                                   BufferDescriptorGetBuffer(buf));
    7320              : 
    7321      2938973 :     return BUFFER_IO_READY_FOR_IO;
    7322              : }
    7323              : 
    7324              : /*
    7325              :  * Wrapper around StartSharedBufferIO / StartLocalBufferIO. Only to be used
    7326              :  * when the caller doesn't otherwise need to care about local vs shared. See
    7327              :  * StartSharedBufferIO() for details.
    7328              :  */
    7329              : StartBufferIOResult
    7330      1652949 : StartBufferIO(Buffer buffer, bool forInput, bool wait, PgAioWaitRef *io_wref)
    7331              : {
    7332              :     BufferDesc *buf_hdr;
    7333              : 
    7334      1652949 :     if (BufferIsLocal(buffer))
    7335              :     {
    7336        11013 :         buf_hdr = GetLocalBufferDescriptor(-buffer - 1);
    7337              : 
    7338        11013 :         return StartLocalBufferIO(buf_hdr, forInput, wait, io_wref);
    7339              :     }
    7340              :     else
    7341              :     {
    7342      1641936 :         buf_hdr = GetBufferDescriptor(buffer - 1);
    7343              : 
    7344      1641936 :         return StartSharedBufferIO(buf_hdr, forInput, wait, io_wref);
    7345              :     }
    7346              : }
    7347              : 
    7348              : /*
    7349              :  * TerminateBufferIO: release a buffer we were doing I/O on
    7350              :  *  (Assumptions)
    7351              :  *  My process is executing IO for the buffer
    7352              :  *  BM_IO_IN_PROGRESS bit is set for the buffer
    7353              :  *  The buffer is Pinned
    7354              :  *
    7355              :  * If clear_dirty is true, we clear the buffer's BM_DIRTY flag.  This is
    7356              :  * appropriate when terminating a successful write.
    7357              :  *
    7358              :  * set_flag_bits gets ORed into the buffer's flags.  It must include
    7359              :  * BM_IO_ERROR in a failure case.  For successful completion it could
    7360              :  * be 0, or BM_VALID if we just finished reading in the page.
    7361              :  *
    7362              :  * If forget_owner is true, we release the buffer I/O from the current
    7363              :  * resource owner. (forget_owner=false is used when the resource owner itself
    7364              :  * is being released)
    7365              :  */
    7366              : void
    7367      2768301 : TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint64 set_flag_bits,
    7368              :                   bool forget_owner, bool release_aio)
    7369              : {
    7370              :     uint64      buf_state;
    7371      2768301 :     uint64      unset_flag_bits = 0;
    7372      2768301 :     int         refcount_change = 0;
    7373              : 
    7374      2768301 :     buf_state = LockBufHdr(buf);
    7375              : 
    7376              :     Assert(buf_state & BM_IO_IN_PROGRESS);
    7377      2768301 :     unset_flag_bits |= BM_IO_IN_PROGRESS;
    7378              : 
    7379              :     /* Clear earlier errors; if this IO failed, it'll be marked again */
    7380      2768301 :     unset_flag_bits |= BM_IO_ERROR;
    7381              : 
    7382      2768301 :     if (clear_dirty)
    7383       702089 :         unset_flag_bits |= BM_DIRTY | BM_CHECKPOINT_NEEDED;
    7384              : 
    7385      2768301 :     if (release_aio)
    7386              :     {
    7387              :         /* release ownership by the AIO subsystem */
    7388              :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    7389      1468188 :         refcount_change = -1;
    7390      1468188 :         pgaio_wref_clear(&buf->io_wref);
    7391              :     }
    7392              : 
    7393      2768301 :     buf_state = UnlockBufHdrExt(buf, buf_state,
    7394              :                                 set_flag_bits, unset_flag_bits,
    7395              :                                 refcount_change);
    7396              : 
    7397      2768301 :     if (forget_owner)
    7398      1300090 :         ResourceOwnerForgetBufferIO(CurrentResourceOwner,
    7399              :                                     BufferDescriptorGetBuffer(buf));
    7400              : 
    7401      2768301 :     ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
    7402              : 
    7403              :     /*
    7404              :      * Support LockBufferForCleanup()
    7405              :      *
    7406              :      * We may have just released the last pin other than the waiter's. In most
    7407              :      * cases, this backend holds another pin on the buffer. But, if, for
    7408              :      * example, this backend is completing an IO issued by another backend, it
    7409              :      * may be time to wake the waiter.
    7410              :      */
    7411      2768301 :     if (release_aio && (buf_state & BM_PIN_COUNT_WAITER))
    7412            0 :         WakePinCountWaiter(buf);
    7413      2768301 : }
    7414              : 
    7415              : /*
    7416              :  * AbortBufferIO: Clean up active buffer I/O after an error.
    7417              :  *
    7418              :  *  All LWLocks & content locks we might have held have been released, but we
    7419              :  *  haven't yet released buffer pins, so the buffer is still pinned.
    7420              :  *
    7421              :  *  If I/O was in progress, we always set BM_IO_ERROR, even though it's
    7422              :  *  possible the error condition wasn't related to the I/O.
    7423              :  *
    7424              :  *  Note: this does not remove the buffer I/O from the resource owner.
    7425              :  *  That's correct when we're releasing the whole resource owner, but
    7426              :  *  beware if you use this in other contexts.
    7427              :  */
    7428              : static void
    7429           15 : AbortBufferIO(Buffer buffer)
    7430              : {
    7431           15 :     BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
    7432              :     uint64      buf_state;
    7433              : 
    7434           15 :     buf_state = LockBufHdr(buf_hdr);
    7435              :     Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
    7436              : 
    7437           15 :     if (!(buf_state & BM_VALID))
    7438              :     {
    7439              :         Assert(!(buf_state & BM_DIRTY));
    7440           15 :         UnlockBufHdr(buf_hdr);
    7441              :     }
    7442              :     else
    7443              :     {
    7444              :         Assert(buf_state & BM_DIRTY);
    7445            0 :         UnlockBufHdr(buf_hdr);
    7446              : 
    7447              :         /* Issue notice if this is not the first failure... */
    7448            0 :         if (buf_state & BM_IO_ERROR)
    7449              :         {
    7450              :             /* Buffer is pinned, so we can read tag without spinlock */
    7451            0 :             ereport(WARNING,
    7452              :                     (errcode(ERRCODE_IO_ERROR),
    7453              :                      errmsg("could not write block %u of %s",
    7454              :                             buf_hdr->tag.blockNum,
    7455              :                             relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
    7456              :                                         BufTagGetForkNum(&buf_hdr->tag)).str),
    7457              :                      errdetail("Multiple failures --- write error might be permanent.")));
    7458              :         }
    7459              :     }
    7460              : 
    7461           15 :     TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
    7462           15 : }
    7463              : 
    7464              : /*
    7465              :  * Error context callback for errors occurring during shared buffer writes.
    7466              :  */
    7467              : static void
    7468           40 : shared_buffer_write_error_callback(void *arg)
    7469              : {
    7470           40 :     BufferDesc *bufHdr = (BufferDesc *) arg;
    7471              : 
    7472              :     /* Buffer is pinned, so we can read the tag without locking the spinlock */
    7473           40 :     if (bufHdr != NULL)
    7474           80 :         errcontext("writing block %u of relation \"%s\"",
    7475              :                    bufHdr->tag.blockNum,
    7476           40 :                    relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
    7477              :                                BufTagGetForkNum(&bufHdr->tag)).str);
    7478           40 : }
    7479              : 
    7480              : /*
    7481              :  * Error context callback for errors occurring during local buffer writes.
    7482              :  */
    7483              : static void
    7484            0 : local_buffer_write_error_callback(void *arg)
    7485              : {
    7486            0 :     BufferDesc *bufHdr = (BufferDesc *) arg;
    7487              : 
    7488            0 :     if (bufHdr != NULL)
    7489            0 :         errcontext("writing block %u of relation \"%s\"",
    7490              :                    bufHdr->tag.blockNum,
    7491            0 :                    relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
    7492              :                                   MyProcNumber,
    7493              :                                   BufTagGetForkNum(&bufHdr->tag)).str);
    7494            0 : }
    7495              : 
    7496              : /*
    7497              :  * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
    7498              :  */
    7499              : static int
    7500     13255218 : rlocator_comparator(const void *p1, const void *p2)
    7501              : {
    7502     13255218 :     RelFileLocator n1 = *(const RelFileLocator *) p1;
    7503     13255218 :     RelFileLocator n2 = *(const RelFileLocator *) p2;
    7504              : 
    7505     13255218 :     if (n1.relNumber < n2.relNumber)
    7506     13205379 :         return -1;
    7507        49839 :     else if (n1.relNumber > n2.relNumber)
    7508        48052 :         return 1;
    7509              : 
    7510         1787 :     if (n1.dbOid < n2.dbOid)
    7511            0 :         return -1;
    7512         1787 :     else if (n1.dbOid > n2.dbOid)
    7513            0 :         return 1;
    7514              : 
    7515         1787 :     if (n1.spcOid < n2.spcOid)
    7516            0 :         return -1;
    7517         1787 :     else if (n1.spcOid > n2.spcOid)
    7518            0 :         return 1;
    7519              :     else
    7520         1787 :         return 0;
    7521              : }
    7522              : 
    7523              : /*
    7524              :  * Lock buffer header - set BM_LOCKED in buffer state.
    7525              :  */
    7526              : uint64
    7527     28698886 : LockBufHdr(BufferDesc *desc)
    7528              : {
    7529              :     uint64      old_buf_state;
    7530              : 
    7531              :     Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
    7532              : 
    7533              :     while (true)
    7534              :     {
    7535              :         /*
    7536              :          * Always try once to acquire the lock directly, without setting up
    7537              :          * the spin-delay infrastructure. The work necessary for that shows up
    7538              :          * in profiles and is rarely necessary.
    7539              :          */
    7540     28699650 :         old_buf_state = pg_atomic_fetch_or_u64(&desc->state, BM_LOCKED);
    7541     28699650 :         if (likely(!(old_buf_state & BM_LOCKED)))
    7542     28698886 :             break;              /* got lock */
    7543              : 
    7544              :         /* and then spin without atomic operations until lock is released */
    7545              :         {
    7546              :             SpinDelayStatus delayStatus;
    7547              : 
    7548          764 :             init_local_spin_delay(&delayStatus);
    7549              : 
    7550         3917 :             while (old_buf_state & BM_LOCKED)
    7551              :             {
    7552         3153 :                 perform_spin_delay(&delayStatus);
    7553         3153 :                 old_buf_state = pg_atomic_read_u64(&desc->state);
    7554              :             }
    7555          764 :             finish_spin_delay(&delayStatus);
    7556              :         }
    7557              : 
    7558              :         /*
    7559              :          * Retry. The lock might obviously already be re-acquired by the time
    7560              :          * we're attempting to get it again.
    7561              :          */
    7562              :     }
    7563              : 
    7564     28698886 :     return old_buf_state | BM_LOCKED;
    7565              : }
    7566              : 
    7567              : /*
    7568              :  * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
    7569              :  * state at that point.
    7570              :  *
    7571              :  * Obviously the buffer could be locked by the time the value is returned, so
    7572              :  * this is primarily useful in CAS style loops.
    7573              :  */
    7574              : pg_noinline uint64
    7575          633 : WaitBufHdrUnlocked(BufferDesc *buf)
    7576              : {
    7577              :     SpinDelayStatus delayStatus;
    7578              :     uint64      buf_state;
    7579              : 
    7580          633 :     init_local_spin_delay(&delayStatus);
    7581              : 
    7582          633 :     buf_state = pg_atomic_read_u64(&buf->state);
    7583              : 
    7584         2834 :     while (buf_state & BM_LOCKED)
    7585              :     {
    7586         2201 :         perform_spin_delay(&delayStatus);
    7587         2201 :         buf_state = pg_atomic_read_u64(&buf->state);
    7588              :     }
    7589              : 
    7590          633 :     finish_spin_delay(&delayStatus);
    7591              : 
    7592          633 :     return buf_state;
    7593              : }
    7594              : 
    7595              : /*
    7596              :  * BufferTag comparator.
    7597              :  */
    7598              : static inline int
    7599            0 : buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
    7600              : {
    7601              :     int         ret;
    7602              :     RelFileLocator rlocatora;
    7603              :     RelFileLocator rlocatorb;
    7604              : 
    7605            0 :     rlocatora = BufTagGetRelFileLocator(ba);
    7606            0 :     rlocatorb = BufTagGetRelFileLocator(bb);
    7607              : 
    7608            0 :     ret = rlocator_comparator(&rlocatora, &rlocatorb);
    7609              : 
    7610            0 :     if (ret != 0)
    7611            0 :         return ret;
    7612              : 
    7613            0 :     if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
    7614            0 :         return -1;
    7615            0 :     if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
    7616            0 :         return 1;
    7617              : 
    7618            0 :     if (ba->blockNum < bb->blockNum)
    7619            0 :         return -1;
    7620            0 :     if (ba->blockNum > bb->blockNum)
    7621            0 :         return 1;
    7622              : 
    7623            0 :     return 0;
    7624              : }
    7625              : 
    7626              : /*
    7627              :  * Comparator determining the writeout order in a checkpoint.
    7628              :  *
    7629              :  * It is important that tablespaces are compared first, the logic balancing
    7630              :  * writes between tablespaces relies on it.
    7631              :  */
    7632              : static inline int
    7633      3475280 : ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
    7634              : {
    7635              :     /* compare tablespace */
    7636      3475280 :     if (a->tsId < b->tsId)
    7637         7672 :         return -1;
    7638      3467608 :     else if (a->tsId > b->tsId)
    7639        28601 :         return 1;
    7640              :     /* compare relation */
    7641      3439007 :     if (a->relNumber < b->relNumber)
    7642       974759 :         return -1;
    7643      2464248 :     else if (a->relNumber > b->relNumber)
    7644       943158 :         return 1;
    7645              :     /* compare fork */
    7646      1521090 :     else if (a->forkNum < b->forkNum)
    7647        62371 :         return -1;
    7648      1458719 :     else if (a->forkNum > b->forkNum)
    7649        72514 :         return 1;
    7650              :     /* compare block number */
    7651      1386205 :     else if (a->blockNum < b->blockNum)
    7652       676318 :         return -1;
    7653       709887 :     else if (a->blockNum > b->blockNum)
    7654       656856 :         return 1;
    7655              :     /* equal page IDs are unlikely, but not impossible */
    7656        53031 :     return 0;
    7657              : }
    7658              : 
    7659              : /*
    7660              :  * Comparator for a Min-Heap over the per-tablespace checkpoint completion
    7661              :  * progress.
    7662              :  */
    7663              : static int
    7664       288536 : ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
    7665              : {
    7666       288536 :     CkptTsStatus *sa = (CkptTsStatus *) DatumGetPointer(a);
    7667       288536 :     CkptTsStatus *sb = (CkptTsStatus *) DatumGetPointer(b);
    7668              : 
    7669              :     /* we want a min-heap, so return 1 when a < b */
    7670       288536 :     if (sa->progress < sb->progress)
    7671       259945 :         return 1;
    7672        28591 :     else if (sa->progress == sb->progress)
    7673         1878 :         return 0;
    7674              :     else
    7675        26713 :         return -1;
    7676              : }
    7677              : 
    7678              : /*
    7679              :  * Initialize a writeback context, discarding potential previous state.
    7680              :  *
    7681              :  * *max_pending is a pointer instead of an immediate value, so the coalesce
    7682              :  * limits can easily be changed by the GUC mechanism, and so calling code does
    7683              :  * not have to check the current configuration. A value of 0 means that no
    7684              :  * writeback control will be performed.
    7685              :  */
    7686              : void
    7687         3100 : WritebackContextInit(WritebackContext *context, int *max_pending)
    7688              : {
    7689              :     Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
    7690              : 
    7691         3100 :     context->max_pending = max_pending;
    7692         3100 :     context->nr_pending = 0;
    7693         3100 : }
    7694              : 
    7695              : /*
    7696              :  * Add buffer to list of pending writeback requests.
    7697              :  */
    7698              : void
    7699       698440 : ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context,
    7700              :                               BufferTag *tag)
    7701              : {
    7702              :     PendingWriteback *pending;
    7703              : 
    7704              :     /*
    7705              :      * As pg_flush_data() doesn't do anything with fsync disabled, there's no
    7706              :      * point in tracking in that case.
    7707              :      */
    7708       698440 :     if (io_direct_flags & IO_DIRECT_DATA ||
    7709       697910 :         !enableFsync)
    7710       698435 :         return;
    7711              : 
    7712              :     /*
    7713              :      * Add buffer to the pending writeback array, unless writeback control is
    7714              :      * disabled.
    7715              :      */
    7716            5 :     if (*wb_context->max_pending > 0)
    7717              :     {
    7718              :         Assert(*wb_context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
    7719              : 
    7720            0 :         pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
    7721              : 
    7722            0 :         pending->tag = *tag;
    7723              :     }
    7724              : 
    7725              :     /*
    7726              :      * Perform pending flushes if the writeback limit is exceeded. This
    7727              :      * includes the case where previously an item has been added, but control
    7728              :      * is now disabled.
    7729              :      */
    7730            5 :     if (wb_context->nr_pending >= *wb_context->max_pending)
    7731            5 :         IssuePendingWritebacks(wb_context, io_context);
    7732              : }
    7733              : 
    7734              : #define ST_SORT sort_pending_writebacks
    7735              : #define ST_ELEMENT_TYPE PendingWriteback
    7736              : #define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
    7737              : #define ST_SCOPE static
    7738              : #define ST_DEFINE
    7739              : #include "lib/sort_template.h"
    7740              : 
    7741              : /*
    7742              :  * Issue all pending writeback requests, previously scheduled with
    7743              :  * ScheduleBufferTagForWriteback, to the OS.
    7744              :  *
    7745              :  * Because this is only used to improve the OS's IO scheduling we try to never
    7746              :  * error out - it's just a hint.
    7747              :  */
    7748              : void
    7749         1214 : IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
    7750              : {
    7751              :     instr_time  io_start;
    7752              :     int         i;
    7753              : 
    7754         1214 :     if (wb_context->nr_pending == 0)
    7755         1214 :         return;
    7756              : 
    7757              :     /*
    7758              :      * Executing the writes in-order can make them a lot faster, and allows
    7759              :      * merging writeback requests to consecutive blocks into larger writebacks.
    7760              :      */
    7761            0 :     sort_pending_writebacks(wb_context->pending_writebacks,
    7762            0 :                             wb_context->nr_pending);
    7763              : 
    7764            0 :     io_start = pgstat_prepare_io_time(track_io_timing);
    7765              : 
    7766              :     /*
    7767              :      * Coalesce neighbouring writes, but nothing else. For that we iterate
    7768              :      * through the, now sorted, array of pending flushes, and look forward to
    7769              :      * find all neighbouring (or identical) writes.
    7770              :      */
    7771            0 :     for (i = 0; i < wb_context->nr_pending; i++)
    7772              :     {
    7773              :         PendingWriteback *cur;
    7774              :         PendingWriteback *next;
    7775              :         SMgrRelation reln;
    7776              :         int         ahead;
    7777              :         BufferTag   tag;
    7778              :         RelFileLocator currlocator;
    7779            0 :         Size        nblocks = 1;
    7780              : 
    7781            0 :         cur = &wb_context->pending_writebacks[i];
    7782            0 :         tag = cur->tag;
    7783            0 :         currlocator = BufTagGetRelFileLocator(&tag);
    7784              : 
    7785              :         /*
    7786              :          * Peek ahead, into following writeback requests, to see if they can
    7787              :          * be combined with the current one.
    7788              :          */
    7789            0 :         for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
    7790              :         {
    7791              : 
    7792            0 :             next = &wb_context->pending_writebacks[i + ahead + 1];
    7793              : 
    7794              :             /* different file, stop */
    7795            0 :             if (!RelFileLocatorEquals(currlocator,
    7796            0 :                                       BufTagGetRelFileLocator(&next->tag)) ||
    7797            0 :                 BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
    7798              :                 break;
    7799              : 
    7800              :             /* ok, block queued twice, skip */
    7801            0 :             if (cur->tag.blockNum == next->tag.blockNum)
    7802            0 :                 continue;
    7803              : 
    7804              :             /* only merge consecutive writes */
    7805            0 :             if (cur->tag.blockNum + 1 != next->tag.blockNum)
    7806            0 :                 break;
    7807              : 
    7808            0 :             nblocks++;
    7809            0 :             cur = next;
    7810              :         }
    7811              : 
    7812            0 :         i += ahead;
    7813              : 
    7814              :         /* and finally tell the kernel to write the data to storage */
    7815            0 :         reln = smgropen(currlocator, INVALID_PROC_NUMBER);
    7816            0 :         smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
    7817              :     }
    7818              : 
    7819              :     /*
    7820              :      * Assume that writeback requests are only issued for buffers containing
    7821              :      * blocks of permanent relations.
    7822              :      */
    7823            0 :     pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
    7824            0 :                             IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
    7825              : 
    7826            0 :     wb_context->nr_pending = 0;
    7827              : }
    7828              : 
    7829              : /* ResourceOwner callbacks */
    7830              : 
    7831              : static void
    7832           15 : ResOwnerReleaseBufferIO(Datum res)
    7833              : {
    7834           15 :     Buffer      buffer = DatumGetInt32(res);
    7835              : 
    7836           15 :     AbortBufferIO(buffer);
    7837           15 : }
    7838              : 
    7839              : static char *
    7840            0 : ResOwnerPrintBufferIO(Datum res)
    7841              : {
    7842            0 :     Buffer      buffer = DatumGetInt32(res);
    7843              : 
    7844            0 :     return psprintf("lost track of buffer IO on buffer %d", buffer);
    7845              : }
    7846              : 
    7847              : /*
    7848              :  * Release buffer as part of resource owner cleanup. This will only be called
    7849              :  * if the buffer is pinned. If this backend held the content lock at the time
    7850              :  * of the error we also need to release that (note that it is not possible to
    7851              :  * hold a content lock without a pin).
    7852              :  */
    7853              : static void
    7854        10547 : ResOwnerReleaseBuffer(Datum res)
    7855              : {
    7856        10547 :     Buffer      buffer = DatumGetInt32(res);
    7857              : 
    7858              :     /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
    7859        10547 :     if (!BufferIsValid(buffer))
    7860            0 :         elog(ERROR, "bad buffer ID: %d", buffer);
    7861              : 
    7862        10547 :     if (BufferIsLocal(buffer))
    7863         3992 :         UnpinLocalBufferNoOwner(buffer);
    7864              :     else
    7865              :     {
    7866              :         PrivateRefCountEntry *ref;
    7867              : 
    7868         6555 :         ref = GetPrivateRefCountEntry(buffer, false);
    7869              : 
    7870              :         /* not having a private refcount would imply resowner corruption */
    7871              :         Assert(ref != NULL);
    7872              : 
    7873              :         /*
    7874              :          * If the buffer was locked at the time of the resowner release,
    7875              :          * release the lock now. This should only happen after errors.
    7876              :          */
    7877         6555 :         if (ref->data.lockmode != BUFFER_LOCK_UNLOCK)
    7878              :         {
    7879          114 :             BufferDesc *buf = GetBufferDescriptor(buffer - 1);
    7880              : 
    7881          114 :             HOLD_INTERRUPTS();  /* match the upcoming RESUME_INTERRUPTS */
    7882          114 :             BufferLockUnlock(buffer, buf);
    7883              :         }
    7884              : 
    7885         6555 :         UnpinBufferNoOwner(GetBufferDescriptor(buffer - 1));
    7886              :     }
    7887        10547 : }
    7888              : 
    7889              : static char *
    7890            0 : ResOwnerPrintBuffer(Datum res)
    7891              : {
    7892            0 :     return DebugPrintBufferRefcount(DatumGetInt32(res));
    7893              : }
    7894              : 
    7895              : /*
    7896              :  * Helper function to evict unpinned buffer whose buffer header lock is
    7897              :  * already acquired.
    7898              :  */
    7899              : static bool
    7900         2509 : EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
    7901              : {
    7902              :     uint64      buf_state;
    7903              :     bool        result;
    7904              : 
    7905         2509 :     *buffer_flushed = false;
    7906              : 
    7907         2509 :     buf_state = pg_atomic_read_u64(&(desc->state));
    7908              :     Assert(buf_state & BM_LOCKED);
    7909              : 
    7910         2509 :     if ((buf_state & BM_VALID) == 0)
    7911              :     {
    7912            0 :         UnlockBufHdr(desc);
    7913            0 :         return false;
    7914              :     }
    7915              : 
    7916              :     /* Check that it's not pinned already. */
    7917         2509 :     if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
    7918              :     {
    7919            0 :         UnlockBufHdr(desc);
    7920            0 :         return false;
    7921              :     }
    7922              : 
    7923         2509 :     PinBuffer_Locked(desc);     /* releases spinlock */
    7924              : 
    7925              :     /* If it was dirty, try to clean it once. */
    7926         2509 :     if (buf_state & BM_DIRTY)
    7927              :     {
    7928         1075 :         FlushUnlockedBuffer(desc, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    7929         1075 :         *buffer_flushed = true;
    7930              :     }
    7931              : 
    7932              :     /* This will return false if it becomes dirty or someone else pins it. */
    7933         2509 :     result = InvalidateVictimBuffer(desc);
    7934              : 
    7935         2509 :     UnpinBuffer(desc);
    7936              : 
    7937         2509 :     return result;
    7938              : }
    7939              : 
    7940              : /*
    7941              :  * Try to evict the current block in a shared buffer.
    7942              :  *
    7943              :  * This function is intended for testing/development use only!
    7944              :  *
    7945              :  * To succeed, the buffer must not be pinned on entry, so if the caller had a
    7946              :  * particular block in mind, it might already have been replaced by some other
    7947              :  * block by the time this function runs.  It's also unpinned on return, so the
    7948              :  * buffer might be occupied again by the time control is returned, potentially
    7949              :  * even by the same block.  This inherent raciness without other interlocking
    7950              :  * makes the function unsuitable for non-testing usage.
    7951              :  *
    7952              :  * *buffer_flushed is set to true if the buffer was dirty and has been
    7953              :  * flushed, false otherwise.  However, *buffer_flushed=true does not
    7954              :  * necessarily mean that we flushed the buffer, it could have been flushed by
    7955              :  * someone else.
    7956              :  *
    7957              :  * Returns true if the buffer was valid and it has now been made invalid.
    7958              :  * Returns false if it wasn't valid, if it couldn't be evicted due to a pin,
    7959              :  * or if the buffer becomes dirty again while we're trying to write it out.
    7960              :  */
    7961              : bool
    7962          137 : EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
    7963              : {
    7964              :     BufferDesc *desc;
    7965              : 
    7966              :     Assert(BufferIsValid(buf) && !BufferIsLocal(buf));
    7967              : 
    7968              :     /* Make sure we can pin the buffer. */
    7969          137 :     ResourceOwnerEnlarge(CurrentResourceOwner);
    7970          137 :     ReservePrivateRefCountEntry();
    7971              : 
    7972          137 :     desc = GetBufferDescriptor(buf - 1);
    7973          137 :     LockBufHdr(desc);
    7974              : 
    7975          137 :     return EvictUnpinnedBufferInternal(desc, buffer_flushed);
    7976              : }
    7977              : 
    7978              : /*
    7979              :  * Try to evict all the shared buffers.
    7980              :  *
    7981              :  * This function is intended for testing/development use only! See
    7982              :  * EvictUnpinnedBuffer().
    7983              :  *
    7984              :  * The buffers_* parameters are mandatory and indicate the total count of
    7985              :  * buffers that:
    7986              :  * - buffers_evicted - were evicted
    7987              :  * - buffers_flushed - were flushed
    7988              :  * - buffers_skipped - could not be evicted
    7989              :  */
    7990              : void
    7991            1 : EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed,
    7992              :                         int32 *buffers_skipped)
    7993              : {
    7994            1 :     *buffers_evicted = 0;
    7995            1 :     *buffers_skipped = 0;
    7996            1 :     *buffers_flushed = 0;
    7997              : 
    7998        16385 :     for (int buf = 1; buf <= NBuffers; buf++)
    7999              :     {
    8000        16384 :         BufferDesc *desc = GetBufferDescriptor(buf - 1);
    8001              :         uint64      buf_state;
    8002              :         bool        buffer_flushed;
    8003              : 
    8004        16384 :         CHECK_FOR_INTERRUPTS();
    8005              : 
    8006        16384 :         buf_state = pg_atomic_read_u64(&desc->state);
    8007        16384 :         if (!(buf_state & BM_VALID))
    8008        14358 :             continue;
    8009              : 
    8010         2026 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    8011         2026 :         ReservePrivateRefCountEntry();
    8012              : 
    8013         2026 :         LockBufHdr(desc);
    8014              : 
    8015         2026 :         if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
    8016         2026 :             (*buffers_evicted)++;
    8017              :         else
    8018            0 :             (*buffers_skipped)++;
    8019              : 
    8020         2026 :         if (buffer_flushed)
    8021          978 :             (*buffers_flushed)++;
    8022              :     }
    8023            1 : }
    8024              : 
    8025              : /*
    8026              :  * Try to evict all the shared buffers containing provided relation's pages.
    8027              :  *
    8028              :  * This function is intended for testing/development use only! See
    8029              :  * EvictUnpinnedBuffer().
    8030              :  *
    8031              :  * The caller must hold at least AccessShareLock on the relation to prevent
    8032              :  * the relation from being dropped.
    8033              :  *
    8034              :  * The buffers_* parameters are mandatory and indicate the total count of
    8035              :  * buffers that:
    8036              :  * - buffers_evicted - were evicted
    8037              :  * - buffers_flushed - were flushed
    8038              :  * - buffers_skipped - could not be evicted
    8039              :  */
    8040              : void
    8041           29 : EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted,
    8042              :                         int32 *buffers_flushed, int32 *buffers_skipped)
    8043              : {
    8044              :     Assert(!RelationUsesLocalBuffers(rel));
    8045              : 
    8046           29 :     *buffers_skipped = 0;
    8047           29 :     *buffers_evicted = 0;
    8048           29 :     *buffers_flushed = 0;
    8049              : 
    8050       475165 :     for (int buf = 1; buf <= NBuffers; buf++)
    8051              :     {
    8052       475136 :         BufferDesc *desc = GetBufferDescriptor(buf - 1);
    8053       475136 :         uint64      buf_state = pg_atomic_read_u64(&(desc->state));
    8054              :         bool        buffer_flushed;
    8055              : 
    8056       475136 :         CHECK_FOR_INTERRUPTS();
    8057              : 
    8058              :         /* An unlocked precheck should be safe and saves some cycles. */
    8059       475136 :         if ((buf_state & BM_VALID) == 0 ||
    8060        62749 :             !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
    8061       474790 :             continue;
    8062              : 
    8063              :         /* Make sure we can pin the buffer. */
    8064          346 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    8065          346 :         ReservePrivateRefCountEntry();
    8066              : 
    8067          346 :         buf_state = LockBufHdr(desc);
    8068              : 
    8069              :         /* recheck, could have changed without the lock */
    8070          346 :         if ((buf_state & BM_VALID) == 0 ||
    8071          346 :             !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
    8072              :         {
    8073            0 :             UnlockBufHdr(desc);
    8074            0 :             continue;
    8075              :         }
    8076              : 
    8077          346 :         if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
    8078          346 :             (*buffers_evicted)++;
    8079              :         else
    8080            0 :             (*buffers_skipped)++;
    8081              : 
    8082          346 :         if (buffer_flushed)
    8083           78 :             (*buffers_flushed)++;
    8084              :     }
    8085           29 : }
    8086              : 
    8087              : /*
    8088              :  * Helper function to mark dirty an unpinned buffer whose buffer header lock
    8089              :  * is already acquired.
    8090              :  */
    8091              : static bool
    8092           36 : MarkDirtyUnpinnedBufferInternal(Buffer buf, BufferDesc *desc,
    8093              :                                 bool *buffer_already_dirty)
    8094              : {
    8095              :     uint64      buf_state;
    8096           36 :     bool        result = false;
    8097              : 
    8098           36 :     *buffer_already_dirty = false;
    8099              : 
    8100           36 :     buf_state = pg_atomic_read_u64(&(desc->state));
    8101              :     Assert(buf_state & BM_LOCKED);
    8102              : 
    8103           36 :     if ((buf_state & BM_VALID) == 0)
    8104              :     {
    8105            1 :         UnlockBufHdr(desc);
    8106            1 :         return false;
    8107              :     }
    8108              : 
    8109              :     /* Check that it's not pinned already. */
    8110           35 :     if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
    8111              :     {
    8112            0 :         UnlockBufHdr(desc);
    8113            0 :         return false;
    8114              :     }
    8115              : 
    8116              :     /* Pin the buffer and then release the buffer spinlock */
    8117           35 :     PinBuffer_Locked(desc);
    8118              : 
    8119              :     /* If it was not already dirty, mark it as dirty. */
    8120           35 :     if (!(buf_state & BM_DIRTY))
    8121              :     {
    8122           16 :         BufferLockAcquire(buf, desc, BUFFER_LOCK_EXCLUSIVE);
    8123           16 :         MarkBufferDirty(buf);
    8124           16 :         result = true;
    8125           16 :         BufferLockUnlock(buf, desc);
    8126              :     }
    8127              :     else
    8128           19 :         *buffer_already_dirty = true;
    8129              : 
    8130           35 :     UnpinBuffer(desc);
    8131              : 
    8132           35 :     return result;
    8133              : }
    8134              : 
    8135              : /*
    8136              :  * Try to mark the provided shared buffer as dirty.
    8137              :  *
    8138              :  * This function is intended for testing/development use only!
    8139              :  *
    8140              :  * Same as EvictUnpinnedBuffer() but with MarkBufferDirty() call inside.
    8141              :  *
    8142              :  * The buffer_already_dirty parameter is mandatory and indicates whether the
    8143              :  * buffer could not be dirtied because it was already dirty.
    8144              :  *
    8145              :  * Returns true if the buffer has successfully been marked as dirty.
    8146              :  */
    8147              : bool
    8148            1 : MarkDirtyUnpinnedBuffer(Buffer buf, bool *buffer_already_dirty)
    8149              : {
    8150              :     BufferDesc *desc;
    8151            1 :     bool        buffer_dirtied = false;
    8152              : 
    8153              :     Assert(!BufferIsLocal(buf));
    8154              : 
    8155              :     /* Make sure we can pin the buffer. */
    8156            1 :     ResourceOwnerEnlarge(CurrentResourceOwner);
    8157            1 :     ReservePrivateRefCountEntry();
    8158              : 
    8159            1 :     desc = GetBufferDescriptor(buf - 1);
    8160            1 :     LockBufHdr(desc);
    8161              : 
    8162            1 :     buffer_dirtied = MarkDirtyUnpinnedBufferInternal(buf, desc, buffer_already_dirty);
    8163              :     /* Both can not be true at the same time */
    8164              :     Assert(!(buffer_dirtied && *buffer_already_dirty));
    8165              : 
    8166            1 :     return buffer_dirtied;
    8167              : }
    8168              : 
    8169              : /*
    8170              :  * Try to mark all the shared buffers containing provided relation's pages as
    8171              :  * dirty.
    8172              :  *
    8173              :  * This function is intended for testing/development use only! See
    8174              :  * MarkDirtyUnpinnedBuffer().
    8175              :  *
    8176              :  * The buffers_* parameters are mandatory and indicate the total count of
    8177              :  * buffers that:
    8178              :  * - buffers_dirtied - were dirtied
    8179              :  * - buffers_already_dirty - were already dirty
    8180              :  * - buffers_skipped - could not be dirtied for a reason other than the
    8181              :  * buffer already being dirty.
    8182              :  */
    8183              : void
    8184            1 : MarkDirtyRelUnpinnedBuffers(Relation rel,
    8185              :                             int32 *buffers_dirtied,
    8186              :                             int32 *buffers_already_dirty,
    8187              :                             int32 *buffers_skipped)
    8188              : {
    8189              :     Assert(!RelationUsesLocalBuffers(rel));
    8190              : 
    8191            1 :     *buffers_dirtied = 0;
    8192            1 :     *buffers_already_dirty = 0;
    8193            1 :     *buffers_skipped = 0;
    8194              : 
    8195        16385 :     for (int buf = 1; buf <= NBuffers; buf++)
    8196              :     {
    8197        16384 :         BufferDesc *desc = GetBufferDescriptor(buf - 1);
    8198        16384 :         uint64      buf_state = pg_atomic_read_u64(&(desc->state));
    8199              :         bool        buffer_already_dirty;
    8200              : 
    8201        16384 :         CHECK_FOR_INTERRUPTS();
    8202              : 
    8203              :         /* An unlocked precheck should be safe and saves some cycles. */
    8204        16384 :         if ((buf_state & BM_VALID) == 0 ||
    8205           27 :             !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
    8206        16384 :             continue;
    8207              : 
    8208              :         /* Make sure we can pin the buffer. */
    8209            0 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    8210            0 :         ReservePrivateRefCountEntry();
    8211              : 
    8212            0 :         buf_state = LockBufHdr(desc);
    8213              : 
    8214              :         /* recheck, could have changed without the lock */
    8215            0 :         if ((buf_state & BM_VALID) == 0 ||
    8216            0 :             !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
    8217              :         {
    8218            0 :             UnlockBufHdr(desc);
    8219            0 :             continue;
    8220              :         }
    8221              : 
    8222            0 :         if (MarkDirtyUnpinnedBufferInternal(buf, desc, &buffer_already_dirty))
    8223            0 :             (*buffers_dirtied)++;
    8224            0 :         else if (buffer_already_dirty)
    8225            0 :             (*buffers_already_dirty)++;
    8226              :         else
    8227            0 :             (*buffers_skipped)++;
    8228              :     }
    8229            1 : }
    8230              : 
    8231              : /*
    8232              :  * Try to mark all the shared buffers as dirty.
    8233              :  *
    8234              :  * This function is intended for testing/development use only! See
    8235              :  * MarkDirtyUnpinnedBuffer().
    8236              :  *
    8237              :  * See MarkDirtyRelUnpinnedBuffers() above for details about the buffers_*
    8238              :  * parameters.
    8239              :  */
    8240              : void
    8241            1 : MarkDirtyAllUnpinnedBuffers(int32 *buffers_dirtied,
    8242              :                             int32 *buffers_already_dirty,
    8243              :                             int32 *buffers_skipped)
    8244              : {
    8245            1 :     *buffers_dirtied = 0;
    8246            1 :     *buffers_already_dirty = 0;
    8247            1 :     *buffers_skipped = 0;
    8248              : 
    8249        16385 :     for (int buf = 1; buf <= NBuffers; buf++)
    8250              :     {
    8251        16384 :         BufferDesc *desc = GetBufferDescriptor(buf - 1);
    8252              :         uint64      buf_state;
    8253              :         bool        buffer_already_dirty;
    8254              : 
    8255        16384 :         CHECK_FOR_INTERRUPTS();
    8256              : 
    8257        16384 :         buf_state = pg_atomic_read_u64(&desc->state);
    8258        16384 :         if (!(buf_state & BM_VALID))
    8259        16349 :             continue;
    8260              : 
    8261           35 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    8262           35 :         ReservePrivateRefCountEntry();
    8263              : 
    8264           35 :         LockBufHdr(desc);
    8265              : 
    8266           35 :         if (MarkDirtyUnpinnedBufferInternal(buf, desc, &buffer_already_dirty))
    8267           16 :             (*buffers_dirtied)++;
    8268           19 :         else if (buffer_already_dirty)
    8269           19 :             (*buffers_already_dirty)++;
    8270              :         else
    8271            0 :             (*buffers_skipped)++;
    8272              :     }
    8273            1 : }
    8274              : 
    8275              : /*
    8276              :  * Generic implementation of the AIO handle staging callback for readv/writev
    8277              :  * on local/shared buffers.
    8278              :  *
    8279              :  * Each readv/writev can target multiple buffers. The buffers have already
    8280              :  * been registered with the IO handle.
    8281              :  *
    8282              :  * To make the IO ready for execution ("staging"), we need to ensure that the
    8283              :  * targeted buffers are in an appropriate state while the IO is ongoing. For
    8284              :  * that the AIO subsystem needs to have its own buffer pin, otherwise an error
    8285              :  * in this backend could lead to this backend's buffer pin being released as
    8286              :  * part of error handling, which in turn could lead to the buffer being
    8287              :  * replaced while IO is ongoing.
    8288              :  */
    8289              : static pg_attribute_always_inline void
    8290      1464927 : buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
    8291              : {
    8292              :     uint64     *io_data;
    8293              :     uint8       handle_data_len;
    8294              :     PgAioWaitRef io_ref;
    8295      1464927 :     BufferTag   first PG_USED_FOR_ASSERTS_ONLY = {0};
    8296              : 
    8297      1464927 :     io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
    8298              : 
    8299      1464927 :     pgaio_io_get_wref(ioh, &io_ref);
    8300              : 
    8301              :     /* iterate over all buffers affected by the vectored readv/writev */
    8302      3114846 :     for (int i = 0; i < handle_data_len; i++)
    8303              :     {
    8304      1649919 :         Buffer      buffer = (Buffer) io_data[i];
    8305      1649919 :         BufferDesc *buf_hdr = is_temp ?
    8306        11059 :             GetLocalBufferDescriptor(-buffer - 1)
    8307      1649919 :             : GetBufferDescriptor(buffer - 1);
    8308              :         uint64      buf_state;
    8309              : 
    8310              :         /*
    8311              :          * Check that all the buffers are actually ones that could conceivably
    8312              :          * be done in one IO, i.e. are sequential. This is the last
    8313              :          * buffer-aware code before IO is actually executed and confusion
    8314              :          * about which buffers are targeted by IO can be hard to debug, making
    8315              :          * it worth doing extra-paranoid checks.
    8316              :          */
    8317      1649919 :         if (i == 0)
    8318      1464927 :             first = buf_hdr->tag;
    8319              :         else
    8320              :         {
    8321              :             Assert(buf_hdr->tag.relNumber == first.relNumber);
    8322              :             Assert(buf_hdr->tag.blockNum == first.blockNum + i);
    8323              :         }
    8324              : 
    8325      1649919 :         if (is_temp)
    8326        11059 :             buf_state = pg_atomic_read_u64(&buf_hdr->state);
    8327              :         else
    8328      1638860 :             buf_state = LockBufHdr(buf_hdr);
    8329              : 
    8330              :         /* verify the buffer is in the expected state */
    8331              :         Assert(buf_state & BM_TAG_VALID);
    8332              :         if (is_write)
    8333              :         {
    8334              :             Assert(buf_state & BM_VALID);
    8335              :             Assert(buf_state & BM_DIRTY);
    8336              :         }
    8337              :         else
    8338              :         {
    8339              :             Assert(!(buf_state & BM_VALID));
    8340              :             Assert(!(buf_state & BM_DIRTY));
    8341              :         }
    8342              : 
    8343              :         /* temp buffers don't use BM_IO_IN_PROGRESS */
    8344      1649919 :         if (!is_temp)
    8345              :             Assert(buf_state & BM_IO_IN_PROGRESS);
    8346              : 
    8347              :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) >= 1);
    8348              : 
    8349              :         /*
    8350              :          * Reflect that the buffer is now owned by the AIO subsystem.
    8351              :          *
    8352              :          * For local buffers: This can't be done just via LocalRefCount, as
    8353              :          * one might initially think, as this backend could error out while
    8354              :          * AIO is still in progress, releasing all the pins by the backend
    8355              :          * itself.
    8356              :          *
    8357              :          * This pin is released again in TerminateBufferIO().
    8358              :          */
    8359      1649919 :         buf_hdr->io_wref = io_ref;
    8360              : 
    8361      1649919 :         if (is_temp)
    8362              :         {
    8363        11059 :             buf_state += BUF_REFCOUNT_ONE;
    8364        11059 :             pg_atomic_unlocked_write_u64(&buf_hdr->state, buf_state);
    8365              :         }
    8366              :         else
    8367      1638860 :             UnlockBufHdrExt(buf_hdr, buf_state, 0, 0, 1);
    8368              : 
    8369              :         /*
    8370              :          * Ensure the content lock that prevents buffer modifications while
    8371              :          * the buffer is being written out is not released early due to an
    8372              :          * error.
    8373              :          */
    8374      1649919 :         if (is_write && !is_temp)
    8375              :         {
    8376              :             Assert(BufferLockHeldByMe(buf_hdr));
    8377              : 
    8378              :             /*
    8379              :              * Lock is now owned by AIO subsystem.
    8380              :              */
    8381            0 :             BufferLockDisown(buffer, buf_hdr);
    8382              :         }
    8383              : 
    8384              :         /*
    8385              :          * Stop tracking this buffer via the resowner - the AIO system now
    8386              :          * keeps track.
    8387              :          */
    8388      1649919 :         if (!is_temp)
    8389      1638860 :             ResourceOwnerForgetBufferIO(CurrentResourceOwner, buffer);
    8390              :     }
    8391      1464927 : }
    8392              : 
    8393              : /*
    8394              :  * Decode readv errors as encoded by buffer_readv_encode_error().
    8395              :  */
    8396              : static inline void
    8397          454 : buffer_readv_decode_error(PgAioResult result,
    8398              :                           bool *zeroed_any,
    8399              :                           bool *ignored_any,
    8400              :                           uint8 *zeroed_or_error_count,
    8401              :                           uint8 *checkfail_count,
    8402              :                           uint8 *first_off)
    8403              : {
    8404          454 :     uint32      rem_error = result.error_data;
    8405              : 
    8406              :     /* see static asserts in buffer_readv_encode_error */
    8407              : #define READV_COUNT_BITS    7
    8408              : #define READV_COUNT_MASK    ((1 << READV_COUNT_BITS) - 1)
    8409              : 
    8410          454 :     *zeroed_any = rem_error & 1;
    8411          454 :     rem_error >>= 1;
    8412              : 
    8413          454 :     *ignored_any = rem_error & 1;
    8414          454 :     rem_error >>= 1;
    8415              : 
    8416          454 :     *zeroed_or_error_count = rem_error & READV_COUNT_MASK;
    8417          454 :     rem_error >>= READV_COUNT_BITS;
    8418              : 
    8419          454 :     *checkfail_count = rem_error & READV_COUNT_MASK;
    8420          454 :     rem_error >>= READV_COUNT_BITS;
    8421              : 
    8422          454 :     *first_off = rem_error & READV_COUNT_MASK;
    8423          454 :     rem_error >>= READV_COUNT_BITS;
    8424          454 : }
    8425              : 
    8426              : /*
    8427              :  * Helper to encode errors for buffer_readv_complete()
    8428              :  *
    8429              :  * Errors are encoded as follows:
    8430              :  * - bit 0 indicates whether any page was zeroed (1) or not (0)
    8431              :  * - bit 1 indicates whether any checksum failure was ignored (1) or not (0)
    8432              :  * - next READV_COUNT_BITS bits indicate the number of errored or zeroed pages
    8433              :  * - next READV_COUNT_BITS bits indicate the number of checksum failures
    8434              :  * - next READV_COUNT_BITS bits indicate the offset of the first page that
    8435              :  *   errored or, if none errored, was zeroed or, if none was zeroed either,
    8436              :  *   had its checksum failure ignored
    8437              :  */
    8438              : static inline void
    8439          194 : buffer_readv_encode_error(PgAioResult *result,
    8440              :                           bool is_temp,
    8441              :                           bool zeroed_any,
    8442              :                           bool ignored_any,
    8443              :                           uint8 error_count,
    8444              :                           uint8 zeroed_count,
    8445              :                           uint8 checkfail_count,
    8446              :                           uint8 first_error_off,
    8447              :                           uint8 first_zeroed_off,
    8448              :                           uint8 first_ignored_off)
    8449              : {
    8450              : 
    8451          194 :     uint8       shift = 0;
    8452          194 :     uint8       zeroed_or_error_count =
    8453              :         error_count > 0 ? error_count : zeroed_count;
    8454              :     uint8       first_off;
    8455              : 
    8456              :     StaticAssertDecl(PG_IOV_MAX <= 1 << READV_COUNT_BITS,
    8457              :                      "PG_IOV_MAX is bigger than reserved space for error data");
    8458              :     StaticAssertDecl((1 + 1 + 3 * READV_COUNT_BITS) <= PGAIO_RESULT_ERROR_BITS,
    8459              :                      "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");
    8460              : 
    8461              :     /*
    8462              :      * We only have space to encode one offset - but luckily that's good
    8463              :      * enough. If there is an error, the error is the interesting offset, same
    8464              :      * with a zeroed buffer vs an ignored buffer.
    8465              :      */
    8466          194 :     if (error_count > 0)
    8467           94 :         first_off = first_error_off;
    8468          100 :     else if (zeroed_count > 0)
    8469           82 :         first_off = first_zeroed_off;
    8470              :     else
    8471           18 :         first_off = first_ignored_off;
    8472              : 
    8473              :     Assert(!zeroed_any || error_count == 0);
    8474              : 
    8475          194 :     result->error_data = 0;
    8476              : 
    8477          194 :     result->error_data |= zeroed_any << shift;
    8478          194 :     shift += 1;
    8479              : 
    8480          194 :     result->error_data |= ignored_any << shift;
    8481          194 :     shift += 1;
    8482              : 
    8483          194 :     result->error_data |= ((uint32) zeroed_or_error_count) << shift;
    8484          194 :     shift += READV_COUNT_BITS;
    8485              : 
    8486          194 :     result->error_data |= ((uint32) checkfail_count) << shift;
    8487          194 :     shift += READV_COUNT_BITS;
    8488              : 
    8489          194 :     result->error_data |= ((uint32) first_off) << shift;
    8490          194 :     shift += READV_COUNT_BITS;
    8491              : 
    8492          194 :     result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
    8493              :         PGAIO_HCB_SHARED_BUFFER_READV;
    8494              : 
    8495          194 :     if (error_count > 0)
    8496           94 :         result->status = PGAIO_RS_ERROR;
    8497              :     else
    8498          100 :         result->status = PGAIO_RS_WARNING;
    8499              : 
    8500              :     /*
    8501              :      * The encoding is complicated enough to warrant cross-checking it against
    8502              :      * the decode function.
    8503              :      */
    8504              : #ifdef USE_ASSERT_CHECKING
    8505              :     {
    8506              :         bool        zeroed_any_2,
    8507              :                     ignored_any_2;
    8508              :         uint8       zeroed_or_error_count_2,
    8509              :                     checkfail_count_2,
    8510              :                     first_off_2;
    8511              : 
    8512              :         buffer_readv_decode_error(*result,
    8513              :                                   &zeroed_any_2, &ignored_any_2,
    8514              :                                   &zeroed_or_error_count_2,
    8515              :                                   &checkfail_count_2,
    8516              :                                   &first_off_2);
    8517              :         Assert(zeroed_any == zeroed_any_2);
    8518              :         Assert(ignored_any == ignored_any_2);
    8519              :         Assert(zeroed_or_error_count == zeroed_or_error_count_2);
    8520              :         Assert(checkfail_count == checkfail_count_2);
    8521              :         Assert(first_off == first_off_2);
    8522              :     }
    8523              : #endif
    8524              : 
    8525              : #undef READV_COUNT_BITS
    8526              : #undef READV_COUNT_MASK
    8527          194 : }
    8528              : 
    8529              : /*
    8530              :  * Helper for AIO readv completion callbacks, supporting both shared and temp
    8531              :  * buffers. Gets called once for each buffer in a multi-page read.
    8532              :  */
    8533              : static pg_attribute_always_inline void
    8534      1479247 : buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer,
    8535              :                           uint8 flags, bool failed, bool is_temp,
    8536              :                           bool *buffer_invalid,
    8537              :                           bool *failed_checksum,
    8538              :                           bool *ignored_checksum,
    8539              :                           bool *zeroed_buffer)
    8540              : {
    8541      1479247 :     BufferDesc *buf_hdr = is_temp ?
    8542        11059 :         GetLocalBufferDescriptor(-buffer - 1)
    8543      1479247 :         : GetBufferDescriptor(buffer - 1);
    8544      1479247 :     BufferTag   tag = buf_hdr->tag;
    8545      1479247 :     char       *bufdata = BufferGetBlock(buffer);
    8546              :     uint64      set_flag_bits;
    8547              :     int         piv_flags;
    8548              : 
    8549              :     /* check that the buffer is in the expected state for a read */
    8550              : #ifdef USE_ASSERT_CHECKING
    8551              :     {
    8552              :         uint64      buf_state = pg_atomic_read_u64(&buf_hdr->state);
    8553              : 
    8554              :         Assert(buf_state & BM_TAG_VALID);
    8555              :         Assert(!(buf_state & BM_VALID));
    8556              :         /* temp buffers don't use BM_IO_IN_PROGRESS */
    8557              :         if (!is_temp)
    8558              :             Assert(buf_state & BM_IO_IN_PROGRESS);
    8559              :         Assert(!(buf_state & BM_DIRTY));
    8560              :     }
    8561              : #endif
    8562              : 
    8563      1479247 :     *buffer_invalid = false;
    8564      1479247 :     *failed_checksum = false;
    8565      1479247 :     *ignored_checksum = false;
    8566      1479247 :     *zeroed_buffer = false;
    8567              : 
    8568              :     /*
    8569              :      * We ask PageIsVerified() to only log the message about checksum errors,
    8570              :      * as the completion might be run in any backend (or IO workers). We will
    8571              :      * report checksum errors in buffer_readv_report().
    8572              :      */
    8573      1479247 :     piv_flags = PIV_LOG_LOG;
    8574              : 
    8575              :     /* the local zero_damaged_pages may differ from the definer's */
    8576      1479247 :     if (flags & READ_BUFFERS_IGNORE_CHECKSUM_FAILURES)
    8577           38 :         piv_flags |= PIV_IGNORE_CHECKSUM_FAILURE;
    8578              : 
    8579              :     /*
    8580              :      * If the buffers are marked for zero on error, we want to log that in
    8581              :      * case of a checksum failure.
    8582              :      */
    8583      1479247 :     if (flags & READ_BUFFERS_ZERO_ON_ERROR)
    8584        46539 :         piv_flags |= PIV_ZERO_BUFFERS_ON_ERROR;
    8585              : 
    8586              :     /* Check for garbage data. */
    8587      1479247 :     if (!failed)
    8588              :     {
    8589              :         /*
    8590              :          * If the buffer is not currently pinned by this backend, e.g. because
    8591              :          * we're completing this IO after an error, the buffer data will have
    8592              :          * been marked as inaccessible when the buffer was unpinned. The AIO
    8593              :          * subsystem holds a pin, but that doesn't prevent the buffer from
    8594              :          * having been marked as inaccessible. The completion might also be
    8595              :          * executed in a different process.
    8596              :          */
    8597              : #ifdef USE_VALGRIND
    8598              :         if (!BufferIsPinned(buffer))
    8599              :             VALGRIND_MAKE_MEM_DEFINED(bufdata, BLCKSZ);
    8600              : #endif
    8601              : 
    8602      1478475 :         if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
    8603              :                             failed_checksum))
    8604              :         {
    8605           97 :             if (flags & READ_BUFFERS_ZERO_ON_ERROR)
    8606              :             {
    8607           47 :                 memset(bufdata, 0, BLCKSZ);
    8608           47 :                 *zeroed_buffer = true;
    8609              :             }
    8610              :             else
    8611              :             {
    8612           50 :                 *buffer_invalid = true;
    8613              :                 /* mark buffer as having failed */
    8614           50 :                 failed = true;
    8615              :             }
    8616              :         }
    8617      1478378 :         else if (*failed_checksum)
    8618           12 :             *ignored_checksum = true;
    8619              : 
    8620              :         /* undo what we did above */
    8621              : #ifdef USE_VALGRIND
    8622              :         if (!BufferIsPinned(buffer))
    8623              :             VALGRIND_MAKE_MEM_NOACCESS(bufdata, BLCKSZ);
    8624              : #endif
    8625              : 
    8626              :         /*
    8627              :          * Immediately log a message about the invalid page, but only to the
    8628              :          * server log. The reason to log only to the server log is that this
    8629              :          * may be executed in a different backend than the one that originated
    8630              :          * the request. The reason to do so immediately is that the originator
    8631              :          * might not process the query result immediately (because it is busy
    8632              :          * doing another part of query processing) or at all (e.g. if it was
    8633              :          * cancelled or errored out due to another IO also failing). The
    8634              :          * definer of the IO will emit an ERROR or WARNING when processing the
    8635              :          * IO's results.
    8636              :          *
    8637              :          * To avoid duplicating the code to emit these log messages, we reuse
    8638              :          * buffer_readv_report().
    8639              :          */
    8640      1478475 :         if (*buffer_invalid || *failed_checksum || *zeroed_buffer)
    8641              :         {
    8642          109 :             PgAioResult result_one = {0};
    8643              : 
    8644          109 :             buffer_readv_encode_error(&result_one, is_temp,
    8645          109 :                                       *zeroed_buffer,
    8646          109 :                                       *ignored_checksum,
    8647          109 :                                       *buffer_invalid,
    8648          109 :                                       *zeroed_buffer ? 1 : 0,
    8649          109 :                                       *failed_checksum ? 1 : 0,
    8650              :                                       buf_off, buf_off, buf_off);
    8651          109 :             pgaio_result_report(result_one, td, LOG_SERVER_ONLY);
    8652              :         }
    8653              :     }
    8654              : 
    8655              :     /* Terminate I/O, setting BM_VALID on success or BM_IO_ERROR on failure. */
    8656      1479247 :     set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
    8657      1479247 :     if (is_temp)
    8658        11059 :         TerminateLocalBufferIO(buf_hdr, false, set_flag_bits, true);
    8659              :     else
    8660      1468188 :         TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);
    8661              : 
    8662              :     /*
    8663              :      * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
    8664              :      * callback may not be executed in the same backend that called
    8665              :      * BUFFER_READ_START. The alternative would be to defer calling the
    8666              :      * tracepoint to a later point (e.g. the local completion callback for
    8667              :      * shared buffer reads), which seems even less helpful.
    8668              :      */
    8669              :     TRACE_POSTGRESQL_BUFFER_READ_DONE(tag.forkNum,
    8670              :                                       tag.blockNum,
    8671              :                                       tag.spcOid,
    8672              :                                       tag.dbOid,
    8673              :                                       tag.relNumber,
    8674              :                                       is_temp ? MyProcNumber : INVALID_PROC_NUMBER,
    8675              :                                       false);
    8676      1479247 : }
    8677              : 
    8678              : /*
    8679              :  * Perform completion handling of a single AIO read. This read may cover
    8680              :  * multiple blocks / buffers.
    8681              :  *
    8682              :  * Shared between shared and local buffers, to reduce code duplication.
    8683              :  */
    8684              : static pg_attribute_always_inline PgAioResult
    8685      1330166 : buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
    8686              :                       uint8 cb_data, bool is_temp)
    8687              : {
    8688      1330166 :     PgAioResult result = prior_result;
    8689      1330166 :     PgAioTargetData *td = pgaio_io_get_target_data(ioh);
    8690      1330166 :     uint8       first_error_off = 0;
    8691      1330166 :     uint8       first_zeroed_off = 0;
    8692      1330166 :     uint8       first_ignored_off = 0;
    8693      1330166 :     uint8       error_count = 0;
    8694      1330166 :     uint8       zeroed_count = 0;
    8695      1330166 :     uint8       ignored_count = 0;
    8696      1330166 :     uint8       checkfail_count = 0;
    8697              :     uint64     *io_data;
    8698              :     uint8       handle_data_len;
    8699              : 
    8700              :     if (is_temp)
    8701              :     {
    8702              :         Assert(td->smgr.is_temp);
    8703              :         Assert(pgaio_io_get_owner(ioh) == MyProcNumber);
    8704              :     }
    8705              :     else
    8706              :         Assert(!td->smgr.is_temp);
    8707              : 
    8708              :     /*
    8709              :      * Iterate over all the buffers affected by this IO and call the
    8710              :      * per-buffer completion function for each buffer.
    8711              :      */
    8712      1330166 :     io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
    8713      2809413 :     for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
    8714              :     {
    8715      1479247 :         Buffer      buf = io_data[buf_off];
    8716              :         bool        failed;
    8717      1479247 :         bool        failed_verification = false;
    8718      1479247 :         bool        failed_checksum = false;
    8719      1479247 :         bool        zeroed_buffer = false;
    8720      1479247 :         bool        ignored_checksum = false;
    8721              : 
    8722              :         Assert(BufferIsValid(buf));
    8723              : 
    8724              :         /*
    8725              :          * If the entire I/O failed at a lower level, each buffer needs to be
    8726              :          * marked as failed. In case of a partial read, the first few buffers
    8727              :          * may be ok.
    8728              :          */
    8729      1479247 :         failed =
    8730      1479247 :             prior_result.status == PGAIO_RS_ERROR
    8731      1479247 :             || prior_result.result <= buf_off;
    8732              : 
    8733      1479247 :         buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
    8734              :                                   &failed_verification,
    8735              :                                   &failed_checksum,
    8736              :                                   &ignored_checksum,
    8737              :                                   &zeroed_buffer);
    8738              : 
    8739              :         /*
    8740              :          * Track information about the number of different kinds of error
    8741              :          * conditions across all pages, as there can be multiple pages failing
    8742              :          * verification as part of one IO.
    8743              :          */
    8744      1479247 :         if (failed_verification && !zeroed_buffer && error_count++ == 0)
    8745           44 :             first_error_off = buf_off;
    8746      1479247 :         if (zeroed_buffer && zeroed_count++ == 0)
    8747           35 :             first_zeroed_off = buf_off;
    8748      1479247 :         if (ignored_checksum && ignored_count++ == 0)
    8749           10 :             first_ignored_off = buf_off;
    8750      1479247 :         if (failed_checksum)
    8751           33 :             checkfail_count++;
    8752              :     }
    8753              : 
    8754              :     /*
    8755              :      * If the smgr read succeeded [partially] and page verification failed for
    8756              :      * some of the pages, adjust the IO's result state appropriately.
    8757              :      */
    8758      1330166 :     if (prior_result.status != PGAIO_RS_ERROR &&
    8759      1330111 :         (error_count > 0 || ignored_count > 0 || zeroed_count > 0))
    8760              :     {
    8761           85 :         buffer_readv_encode_error(&result, is_temp,
    8762              :                                   zeroed_count > 0, ignored_count > 0,
    8763              :                                   error_count, zeroed_count, checkfail_count,
    8764              :                                   first_error_off, first_zeroed_off,
    8765              :                                   first_ignored_off);
    8766           85 :         pgaio_result_report(result, td, DEBUG1);
    8767              :     }
    8768              : 
    8769              :     /*
    8770              :      * For shared relations this reporting is done in
    8771              :      * shared_buffer_readv_complete_local().
    8772              :      */
    8773      1330166 :     if (is_temp && checkfail_count > 0)
    8774            2 :         pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid,
    8775              :                                               checkfail_count);
    8776              : 
    8777      1330166 :     return result;
    8778              : }
    8779              : 
    8780              : /*
    8781              :  * AIO error reporting callback for aio_shared_buffer_readv_cb and
    8782              :  * aio_local_buffer_readv_cb.
    8783              :  *
    8784              :  * The error is encoded / decoded in buffer_readv_encode_error() /
    8785              :  * buffer_readv_decode_error().
    8786              :  */
    8787              : static void
    8788          275 : buffer_readv_report(PgAioResult result, const PgAioTargetData *td,
    8789              :                     int elevel)
    8790              : {
    8791          275 :     int         nblocks = td->smgr.nblocks;
    8792          275 :     BlockNumber first = td->smgr.blockNum;
    8793          275 :     BlockNumber last = first + nblocks - 1;
    8794          275 :     ProcNumber  errProc =
    8795          275 :         td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER;
    8796              :     RelPathStr  rpath =
    8797          275 :         relpathbackend(td->smgr.rlocator, errProc, td->smgr.forkNum);
    8798              :     bool        zeroed_any,
    8799              :                 ignored_any;
    8800              :     uint8       zeroed_or_error_count,
    8801              :                 checkfail_count,
    8802              :                 first_off;
    8803              :     uint8       affected_count;
    8804              :     const char *msg_one,
    8805              :                *msg_mult,
    8806              :                *det_mult,
    8807              :                *hint_mult;
    8808              : 
    8809          275 :     buffer_readv_decode_error(result, &zeroed_any, &ignored_any,
    8810              :                               &zeroed_or_error_count,
    8811              :                               &checkfail_count,
    8812              :                               &first_off);
    8813              : 
    8814              :     /*
    8815              :      * Treat a read that had both zeroed buffers *and* ignored checksums as a
    8816              :      * special case, it's too irregular to be emitted the same way as the
    8817              :      * other cases.
    8818              :      */
    8819          275 :     if (zeroed_any && ignored_any)
    8820              :     {
    8821              :         Assert(zeroed_any && ignored_any);
    8822              :         Assert(nblocks > 1); /* same block can't be both zeroed and ignored */
    8823              :         Assert(result.status != PGAIO_RS_ERROR);
    8824            4 :         affected_count = zeroed_or_error_count;
    8825              : 
    8826            4 :         ereport(elevel,
    8827              :                 errcode(ERRCODE_DATA_CORRUPTED),
    8828              :                 errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation \"%s\"",
    8829              :                        affected_count, checkfail_count, first, last, rpath.str),
    8830              :                 affected_count > 1 ?
    8831              :                 errdetail("Block %u held the first zeroed page.",
    8832              :                           first + first_off) : 0,
    8833              :                 errhint_plural("See server log for details about the other %d invalid block.",
    8834              :                                "See server log for details about the other %d invalid blocks.",
    8835              :                                affected_count + checkfail_count - 1,
    8836              :                                affected_count + checkfail_count - 1));
    8837            4 :         return;
    8838              :     }
    8839              : 
    8840              :     /*
    8841              :      * The other messages are highly repetitive. To avoid duplicating a long
    8842              :      * and complicated ereport(), gather the translated format strings
    8843              :      * separately and then do one common ereport.
    8844              :      */
    8845          271 :     if (result.status == PGAIO_RS_ERROR)
    8846              :     {
    8847              :         Assert(!zeroed_any);    /* can't have invalid pages when zeroing them */
    8848          136 :         affected_count = zeroed_or_error_count;
    8849          136 :         msg_one = _("invalid page in block %u of relation \"%s\"");
    8850          136 :         msg_mult = _("%u invalid pages among blocks %u..%u of relation \"%s\"");
    8851          136 :         det_mult = _("Block %u held the first invalid page.");
    8852          136 :         hint_mult = _("See server log for the other %u invalid block(s).");
    8853              :     }
    8854          135 :     else if (zeroed_any && !ignored_any)
    8855              :     {
    8856          111 :         affected_count = zeroed_or_error_count;
    8857          111 :         msg_one = _("invalid page in block %u of relation \"%s\"; zeroing out page");
    8858          111 :         msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation \"%s\"");
    8859          111 :         det_mult = _("Block %u held the first zeroed page.");
    8860          111 :         hint_mult = _("See server log for the other %u zeroed block(s).");
    8861              :     }
    8862           24 :     else if (!zeroed_any && ignored_any)
    8863              :     {
    8864           24 :         affected_count = checkfail_count;
    8865           24 :         msg_one = _("ignoring checksum failure in block %u of relation \"%s\"");
    8866           24 :         msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation \"%s\"");
    8867           24 :         det_mult = _("Block %u held the first ignored page.");
    8868           24 :         hint_mult = _("See server log for the other %u ignored block(s).");
    8869              :     }
    8870              :     else
    8871            0 :         pg_unreachable();
    8872              : 
    8873          271 :     ereport(elevel,
    8874              :             errcode(ERRCODE_DATA_CORRUPTED),
    8875              :             affected_count == 1 ?
    8876              :             errmsg_internal(msg_one, first + first_off, rpath.str) :
    8877              :             errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
    8878              :             affected_count > 1 ? errdetail_internal(det_mult, first + first_off) : 0,
    8879              :             affected_count > 1 ? errhint_internal(hint_mult, affected_count - 1) : 0);
    8880              : }
    8881              : 
    8882              : static void
    8883      1462084 : shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
    8884              : {
    8885      1462084 :     buffer_stage_common(ioh, false, false);
    8886      1462084 : }
    8887              : 
    8888              : static PgAioResult
    8889      1327323 : shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
    8890              :                              uint8 cb_data)
    8891              : {
    8892      1327323 :     return buffer_readv_complete(ioh, prior_result, cb_data, false);
    8893              : }
    8894              : 
    8895              : /*
    8896              :  * We need a backend-local completion callback for shared buffers, to be able
    8897              :  * to report checksum errors correctly. Unfortunately that can only safely
    8898              :  * happen if the reporting backend has previously called
    8899              :  * pgstat_prepare_report_checksum_failure(), which we can only guarantee in
    8900              :  * the backend that started the IO. Hence this callback.
    8901              :  */
    8902              : static PgAioResult
    8903      1462084 : shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result,
    8904              :                                    uint8 cb_data)
    8905              : {
    8906              :     bool        zeroed_any,
    8907              :                 ignored_any;
    8908              :     uint8       zeroed_or_error_count,
    8909              :                 checkfail_count,
    8910              :                 first_off;
    8911              : 
    8912      1462084 :     if (prior_result.status == PGAIO_RS_OK)
    8913      1461905 :         return prior_result;
    8914              : 
    8915          179 :     buffer_readv_decode_error(prior_result,
    8916              :                               &zeroed_any,
    8917              :                               &ignored_any,
    8918              :                               &zeroed_or_error_count,
    8919              :                               &checkfail_count,
    8920              :                               &first_off);
    8921              : 
    8922          179 :     if (checkfail_count)
    8923              :     {
    8924           25 :         PgAioTargetData *td = pgaio_io_get_target_data(ioh);
    8925              : 
    8926           25 :         pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid,
    8927              :                                               checkfail_count);
    8928              :     }
    8929              : 
    8930          179 :     return prior_result;
    8931              : }
    8932              : 
    8933              : static void
    8934         2843 : local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
    8935              : {
    8936         2843 :     buffer_stage_common(ioh, false, true);
    8937         2843 : }
    8938              : 
    8939              : static PgAioResult
    8940         2843 : local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
    8941              :                             uint8 cb_data)
    8942              : {
    8943         2843 :     return buffer_readv_complete(ioh, prior_result, cb_data, true);
    8944              : }
    8945              : 
    8946              : /* readv callback is passed READ_BUFFERS_* flags as callback data */
    8947              : const PgAioHandleCallbacks aio_shared_buffer_readv_cb = {
    8948              :     .stage = shared_buffer_readv_stage,
    8949              :     .complete_shared = shared_buffer_readv_complete,
    8950              :     /* need a local callback to report checksum failures */
    8951              :     .complete_local = shared_buffer_readv_complete_local,
    8952              :     .report = buffer_readv_report,
    8953              : };
    8954              : 
    8955              : /* readv callback is passed READ_BUFFERS_* flags as callback data */
    8956              : const PgAioHandleCallbacks aio_local_buffer_readv_cb = {
    8957              :     .stage = local_buffer_readv_stage,
    8958              : 
    8959              :     /*
    8960              :      * Note that this, in contrast to the shared_buffers case, uses
    8961              :      * complete_local, as only the issuing backend has access to the required
    8962              :      * datastructures. This is important in case the IO completion may be
    8963              :      * consumed incidentally by another backend.
    8964              :      */
    8965              :     .complete_local = local_buffer_readv_complete,
    8966              :     .report = buffer_readv_report,
    8967              : };
        

Generated by: LCOV version 2.0-1