LCOV - code coverage report
Current view: top level - src/backend/access/transam - slru.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 81.6 % 534 436
Test Date: 2026-03-24 01:16:09 Functions: 100.0 % 30 30
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * slru.c
       4              :  *      Simple LRU buffering for wrap-around-able permanent metadata
       5              :  *
       6              :  * This module is used to maintain various pieces of transaction status
       7              :  * indexed by TransactionId (such as commit status, parent transaction ID,
       8              :  * commit timestamp), as well as storage for multixacts, serializable
       9              :  * isolation locks and NOTIFY traffic.  Extensions can define their own
      10              :  * SLRUs, too.
      11              :  *
      12              :  * Under ordinary circumstances we expect that write traffic will occur
      13              :  * mostly to the latest page (and to the just-prior page, soon after a
      14              :  * page transition).  Read traffic will probably touch a larger span of
      15              :  * pages, but a relatively small number of buffers should be sufficient.
      16              :  *
      17              :  * We use a simple least-recently-used scheme to manage a pool of shared
      18              :  * page buffers, split in banks by the lowest bits of the page number, and
      19              :  * the management algorithm only processes the bank to which the desired
      20              :  * page belongs, so a linear search is sufficient; there's no need for a
      21              :  * hashtable or anything fancy.  The algorithm is straight LRU except that
      22              :  * we will never swap out the latest page (since we know it's going to be
      23              :  * hit again eventually).
      24              :  *
      25              :  * We use per-bank control LWLocks to protect the shared data structures,
      26              :  * plus per-buffer LWLocks that synchronize I/O for each buffer.  The
      27              :  * bank's control lock must be held to examine or modify any of the bank's
      28              :  * shared state.  A process that is reading in or writing out a page
      29              :  * buffer does not hold the control lock, only the per-buffer lock for the
      30              :  * buffer it is working on.  One exception is latest_page_number, which is
      31              :  * read and written using atomic ops.
      32              :  *
      33              :  * "Holding the bank control lock" means exclusive lock in all cases
      34              :  * except for SimpleLruReadPage_ReadOnly(); see comments for
      35              :  * SlruRecentlyUsed() for the implications of that.
      36              :  *
      37              :  * When initiating I/O on a buffer, we acquire the per-buffer lock exclusively
      38              :  * before releasing the control lock.  The per-buffer lock is released after
      39              :  * completing the I/O, re-acquiring the control lock, and updating the shared
      40              :  * state.  (Deadlock is not possible here, because we never try to initiate
      41              :  * I/O when someone else is already doing I/O on the same buffer.)
      42              :  * To wait for I/O to complete, release the control lock, acquire the
      43              :  * per-buffer lock in shared mode, immediately release the per-buffer lock,
      44              :  * reacquire the control lock, and then recheck state (since arbitrary things
      45              :  * could have happened while we didn't have the lock).
      46              :  *
      47              :  * As with the regular buffer manager, it is possible for another process
      48              :  * to re-dirty a page that is currently being written out.  This is handled
      49              :  * by re-setting the page's page_dirty flag.
      50              :  *
      51              :  *
      52              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      53              :  * Portions Copyright (c) 1994, Regents of the University of California
      54              :  *
      55              :  * src/backend/access/transam/slru.c
      56              :  *
      57              :  *-------------------------------------------------------------------------
      58              :  */
      59              : #include "postgres.h"
      60              : 
      61              : #include <fcntl.h>
      62              : #include <sys/stat.h>
      63              : #include <unistd.h>
      64              : 
      65              : #include "access/slru.h"
      66              : #include "access/transam.h"
      67              : #include "access/xlog.h"
      68              : #include "access/xlogutils.h"
      69              : #include "miscadmin.h"
      70              : #include "pgstat.h"
      71              : #include "storage/fd.h"
      72              : #include "storage/shmem.h"
      73              : #include "utils/guc.h"
      74              : #include "utils/wait_event.h"
      75              : 
      76              : /*
      77              :  * Converts segment number to the filename of the segment.
      78              :  *
      79              :  * "path" should point to a buffer at least MAXPGPATH characters long.
      80              :  *
      81              :  * If ctl->long_segment_names is true, segno can be in the range [0, 2^60-1].
      82              :  * The resulting file name is made of 15 characters, e.g. dir/123456789ABCDEF.
      83              :  *
      84              :  * If ctl->long_segment_names is false, segno can be in the range [0, 2^24-1].
      85              :  * The resulting file name is made of 4 to 6 characters, as follows:
      86              :  *
      87              :  *  dir/1234   for [0, 2^16-1]
      88              :  *  dir/12345  for [2^16, 2^20-1]
      89              :  *  dir/123456 for [2^20, 2^24-1]
      90              :  */
      91              : static inline int
      92      7506023 : SlruFileName(SlruCtl ctl, char *path, int64 segno)
      93              : {
      94      7506023 :     if (ctl->long_segment_names)
      95              :     {
      96              :         /*
      97              :          * We could use 16 characters here but the disadvantage would be that
      98              :          * the SLRU segments will be hard to distinguish from WAL segments.
      99              :          *
     100              :          * For this reason we use 15 characters. It is enough but also means
     101              :          * that in the future we can't decrease SLRU_PAGES_PER_SEGMENT easily.
     102              :          */
     103              :         Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFFFFFFFFFFF));
     104        16616 :         return snprintf(path, MAXPGPATH, "%s/%015" PRIX64, ctl->Dir, segno);
     105              :     }
     106              :     else
     107              :     {
     108              :         /*
     109              :          * Although the %04X format string is used, integers of up to 24 bits
     110              :          * are allowed.  See SlruCorrectSegmentFilenameLength().
     111              :          */
     112              :         Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFF));
     113      7489407 :         return snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir,
     114              :                         (unsigned int) segno);
     115              :     }
     116              : }
     117              : 
     118              : /*
     119              :  * During SimpleLruWriteAll(), we will usually not need to write more than one
     120              :  * or two physical files, but we may need to write several pages per file.  We
     121              :  * can consolidate the I/O requests by leaving files open until control returns
     122              :  * to SimpleLruWriteAll().  This data structure remembers which files are open.
     123              :  */
     124              : #define MAX_WRITEALL_BUFFERS    16
     125              : 
     126              : typedef struct SlruWriteAllData
     127              : {
     128              :     int         num_files;      /* # files actually open */
     129              :     int         fd[MAX_WRITEALL_BUFFERS];   /* their FD's */
     130              :     int64       segno[MAX_WRITEALL_BUFFERS];    /* their log seg#s */
     131              : } SlruWriteAllData;
     132              : 
     133              : typedef struct SlruWriteAllData *SlruWriteAll;
     134              : 
     135              : 
     136              : /*
     137              :  * Bank size for the slot array.  Pages are assigned a bank according to their
     138              :  * page number, with each bank being this size.  We want a power of 2 so that
     139              :  * we can determine the bank number for a page with just bit shifting; we also
     140              :  * want to keep the bank size small so that LRU victim search is fast.  16
     141              :  * buffers per bank seems a good number.
     142              :  */
     143              : #define SLRU_BANK_BITSHIFT      4
     144              : #define SLRU_BANK_SIZE          (1 << SLRU_BANK_BITSHIFT)
     145              : 
     146              : /*
     147              :  * Macro to get the bank number to which the slot belongs.
     148              :  */
     149              : #define SlotGetBankNumber(slotno)   ((slotno) >> SLRU_BANK_BITSHIFT)
     150              : 
     151              : 
     152              : /*
     153              :  * Populate a file tag describing a segment file.  We only use the segment
     154              :  * number, since we can derive everything else we need by having separate
     155              :  * sync handler functions for clog, multixact etc.
     156              :  */
     157              : #define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \
     158              : ( \
     159              :     memset(&(a), 0, sizeof(FileTag)), \
     160              :     (a).handler = (xx_handler), \
     161              :     (a).segno = (xx_segno) \
     162              : )
     163              : 
     164              : /* Saved info for SlruReportIOError */
     165              : typedef enum
     166              : {
     167              :     SLRU_OPEN_FAILED,
     168              :     SLRU_SEEK_FAILED,
     169              :     SLRU_READ_FAILED,
     170              :     SLRU_WRITE_FAILED,
     171              :     SLRU_FSYNC_FAILED,
     172              :     SLRU_CLOSE_FAILED,
     173              : } SlruErrorCause;
     174              : 
     175              : static SlruErrorCause slru_errcause;
     176              : static int  slru_errno;
     177              : 
     178              : 
     179              : static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
     180              : static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
     181              : static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata);
     182              : static bool SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno);
     183              : static bool SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno,
     184              :                                   SlruWriteAll fdata);
     185              : static void SlruReportIOError(SlruCtl ctl, int64 pageno,
     186              :                               const void *opaque_data);
     187              : static int  SlruSelectLRUPage(SlruCtl ctl, int64 pageno);
     188              : 
     189              : static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename,
     190              :                                       int64 segpage, void *data);
     191              : static void SlruInternalDeleteSegment(SlruCtl ctl, int64 segno);
     192              : static inline void SlruRecentlyUsed(SlruShared shared, int slotno);
     193              : 
     194              : 
     195              : /*
     196              :  * Initialization of shared memory
     197              :  */
     198              : 
     199              : Size
     200        23717 : SimpleLruShmemSize(int nslots, int nlsns)
     201              : {
     202        23717 :     int         nbanks = nslots / SLRU_BANK_SIZE;
     203              :     Size        sz;
     204              : 
     205              :     Assert(nslots <= SLRU_MAX_ALLOWED_BUFFERS);
     206              :     Assert(nslots % SLRU_BANK_SIZE == 0);
     207              : 
     208              :     /* we assume nslots isn't so large as to risk overflow */
     209        23717 :     sz = MAXALIGN(sizeof(SlruSharedData));
     210        23717 :     sz += MAXALIGN(nslots * sizeof(char *));    /* page_buffer[] */
     211        23717 :     sz += MAXALIGN(nslots * sizeof(SlruPageStatus));    /* page_status[] */
     212        23717 :     sz += MAXALIGN(nslots * sizeof(bool));  /* page_dirty[] */
     213        23717 :     sz += MAXALIGN(nslots * sizeof(int64)); /* page_number[] */
     214        23717 :     sz += MAXALIGN(nslots * sizeof(int));   /* page_lru_count[] */
     215        23717 :     sz += MAXALIGN(nslots * sizeof(LWLockPadded));  /* buffer_locks[] */
     216        23717 :     sz += MAXALIGN(nbanks * sizeof(LWLockPadded));  /* bank_locks[] */
     217        23717 :     sz += MAXALIGN(nbanks * sizeof(int));   /* bank_cur_lru_count[] */
     218              : 
     219        23717 :     if (nlsns > 0)
     220         3387 :         sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));    /* group_lsn[] */
     221              : 
     222        23717 :     return BUFFERALIGN(sz) + BLCKSZ * nslots;
     223              : }
     224              : 
     225              : /*
     226              :  * Determine the number of SLRU buffers to use.
     227              :  *
     228              :  * We simply divide shared_buffers by the divisor given and cap
     229              :  * that at the maximum given; but always at least SLRU_BANK_SIZE.
     230              :  * Round down to the nearest multiple of SLRU_BANK_SIZE.
     231              :  */
     232              : int
     233        10110 : SimpleLruAutotuneBuffers(int divisor, int max)
     234              : {
     235        10110 :     return Min(max - (max % SLRU_BANK_SIZE),
     236              :                Max(SLRU_BANK_SIZE,
     237              :                    NBuffers / divisor - (NBuffers / divisor) % SLRU_BANK_SIZE));
     238              : }
     239              : 
     240              : /*
     241              :  * Initialize, or attach to, a simple LRU cache in shared memory.
     242              :  *
     243              :  * ctl: address of local (unshared) control structure.
     244              :  * name: name of SLRU.  (This is user-visible, pick with care!)
     245              :  * nslots: number of page slots to use.
     246              :  * nlsns: number of LSN groups per page (set to zero if not relevant).
     247              :  * subdir: PGDATA-relative subdirectory that will contain the files.
     248              :  * buffer_tranche_id: tranche ID to use for the SLRU's per-buffer LWLocks.
     249              :  * bank_tranche_id: tranche ID to use for the bank LWLocks.
     250              :  * sync_handler: which set of functions to use to handle sync requests
     251              :  * long_segment_names: true to use long (15-character) segment names, false for short ones
     252              :  */
     253              : void
     254         8264 : SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
     255              :               const char *subdir, int buffer_tranche_id, int bank_tranche_id,
     256              :               SyncRequestHandler sync_handler, bool long_segment_names)
     257              : {
     258              :     SlruShared  shared;
     259              :     bool        found;
     260         8264 :     int         nbanks = nslots / SLRU_BANK_SIZE;
     261              : 
     262              :     Assert(nslots <= SLRU_MAX_ALLOWED_BUFFERS);
     263              : 
     264              :     Assert(ctl->PagePrecedes != NULL);
     265              :     Assert(ctl->errdetail_for_io_error != NULL);
     266              : 
     267         8264 :     shared = (SlruShared) ShmemInitStruct(name,
     268              :                                           SimpleLruShmemSize(nslots, nlsns),
     269              :                                           &found);
     270              : 
     271         8264 :     if (!IsUnderPostmaster)
     272              :     {
     273              :         /* Initialize locks and shared memory area */
     274              :         char       *ptr;
     275              :         Size        offset;
     276              : 
     277              :         Assert(!found);
     278              : 
     279         8264 :         memset(shared, 0, sizeof(SlruSharedData));
     280              : 
     281         8264 :         shared->num_slots = nslots;
     282         8264 :         shared->lsn_groups_per_page = nlsns;
     283              : 
     284         8264 :         pg_atomic_init_u64(&shared->latest_page_number, 0);
     285              : 
     286         8264 :         shared->slru_stats_idx = pgstat_get_slru_index(name);
     287              : 
     288         8264 :         ptr = (char *) shared;
     289         8264 :         offset = MAXALIGN(sizeof(SlruSharedData));
     290         8264 :         shared->page_buffer = (char **) (ptr + offset);
     291         8264 :         offset += MAXALIGN(nslots * sizeof(char *));
     292         8264 :         shared->page_status = (SlruPageStatus *) (ptr + offset);
     293         8264 :         offset += MAXALIGN(nslots * sizeof(SlruPageStatus));
     294         8264 :         shared->page_dirty = (bool *) (ptr + offset);
     295         8264 :         offset += MAXALIGN(nslots * sizeof(bool));
     296         8264 :         shared->page_number = (int64 *) (ptr + offset);
     297         8264 :         offset += MAXALIGN(nslots * sizeof(int64));
     298         8264 :         shared->page_lru_count = (int *) (ptr + offset);
     299         8264 :         offset += MAXALIGN(nslots * sizeof(int));
     300              : 
     301              :         /* Initialize LWLocks */
     302         8264 :         shared->buffer_locks = (LWLockPadded *) (ptr + offset);
     303         8264 :         offset += MAXALIGN(nslots * sizeof(LWLockPadded));
     304         8264 :         shared->bank_locks = (LWLockPadded *) (ptr + offset);
     305         8264 :         offset += MAXALIGN(nbanks * sizeof(LWLockPadded));
     306         8264 :         shared->bank_cur_lru_count = (int *) (ptr + offset);
     307         8264 :         offset += MAXALIGN(nbanks * sizeof(int));
     308              : 
     309         8264 :         if (nlsns > 0)
     310              :         {
     311         1180 :             shared->group_lsn = (XLogRecPtr *) (ptr + offset);
     312         1180 :             offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
     313              :         }
     314              : 
     315         8264 :         ptr += BUFFERALIGN(offset);
     316       210904 :         for (int slotno = 0; slotno < nslots; slotno++)
     317              :         {
     318       202640 :             LWLockInitialize(&shared->buffer_locks[slotno].lock,
     319              :                              buffer_tranche_id);
     320              : 
     321       202640 :             shared->page_buffer[slotno] = ptr;
     322       202640 :             shared->page_status[slotno] = SLRU_PAGE_EMPTY;
     323       202640 :             shared->page_dirty[slotno] = false;
     324       202640 :             shared->page_lru_count[slotno] = 0;
     325       202640 :             ptr += BLCKSZ;
     326              :         }
     327              : 
     328              :         /* Initialize the slot banks. */
     329        20929 :         for (int bankno = 0; bankno < nbanks; bankno++)
     330              :         {
     331        12665 :             LWLockInitialize(&shared->bank_locks[bankno].lock, bank_tranche_id);
     332        12665 :             shared->bank_cur_lru_count[bankno] = 0;
     333              :         }
     334              : 
     335              :         /* Should fit within the estimated shmem size */
     336              :         Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
     337              :     }
     338              :     else
     339              :     {
     340              :         Assert(found);
     341              :         Assert(shared->num_slots == nslots);
     342              :     }
     343              : 
     344              :     /*
     345              :      * Initialize the unshared control struct, including directory path. We
     346              :      * assume caller set PagePrecedes.
     347              :      */
     348         8264 :     ctl->shared = shared;
     349         8264 :     ctl->sync_handler = sync_handler;
     350         8264 :     ctl->long_segment_names = long_segment_names;
     351         8264 :     ctl->nbanks = nbanks;
     352         8264 :     strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir));
     353         8264 : }
     354              : 
     355              : /*
     356              :  * Helper function for GUC check_hook to check whether slru buffers are in
     357              :  * multiples of SLRU_BANK_SIZE.
     358              :  */
     359              : bool
     360        12098 : check_slru_buffers(const char *name, int *newval)
     361              : {
     362              :     /* Valid values are multiples of SLRU_BANK_SIZE */
     363        12098 :     if (*newval % SLRU_BANK_SIZE == 0)
     364        12098 :         return true;
     365              : 
     366            0 :     GUC_check_errdetail("\"%s\" must be a multiple of %d.", name,
     367              :                         SLRU_BANK_SIZE);
     368            0 :     return false;
     369              : }
     370              : 
     371              : /*
     372              :  * Initialize (or reinitialize) a page to zeroes.
     373              :  *
     374              :  * The page is not actually written, just set up in shared memory.
     375              :  * The slot number of the new page is returned.
     376              :  *
     377              :  * Bank lock must be held at entry, and will be held at exit.
     378              :  */
     379              : int
     380      7345894 : SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
     381              : {
     382      7345894 :     SlruShared  shared = ctl->shared;
     383              :     int         slotno;
     384              : 
     385              :     Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(ctl, pageno), LW_EXCLUSIVE));
     386              : 
     387              :     /* Find a suitable buffer slot for the page */
     388      7345894 :     slotno = SlruSelectLRUPage(ctl, pageno);
     389              :     Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
     390              :            (shared->page_status[slotno] == SLRU_PAGE_VALID &&
     391              :             !shared->page_dirty[slotno]) ||
     392              :            shared->page_number[slotno] == pageno);
     393              : 
     394              :     /* Mark the slot as containing this page */
     395      7345894 :     shared->page_number[slotno] = pageno;
     396      7345894 :     shared->page_status[slotno] = SLRU_PAGE_VALID;
     397      7345894 :     shared->page_dirty[slotno] = true;
     398      7345894 :     SlruRecentlyUsed(shared, slotno);
     399              : 
     400              :     /* Set the buffer to zeroes */
     401      7345894 :     MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
     402              : 
     403              :     /* Set the LSNs for this new page to zero */
     404      7345894 :     SimpleLruZeroLSNs(ctl, slotno);
     405              : 
     406              :     /*
     407              :      * Assume this page is now the latest active page.
     408              :      *
     409              :      * Note that because both this routine and SlruSelectLRUPage run with a
     410              :      * SLRU bank lock held, it is not possible for this to be zeroing a page
     411              :      * that SlruSelectLRUPage is going to evict simultaneously.  Therefore,
     412              :      * there's no memory barrier here.
     413              :      */
     414      7345894 :     pg_atomic_write_u64(&shared->latest_page_number, pageno);
     415              : 
     416              :     /* update the stats counter of zeroed pages */
     417      7345894 :     pgstat_count_slru_blocks_zeroed(shared->slru_stats_idx);
     418              : 
     419      7345894 :     return slotno;
     420              : }
     421              : 
     422              : /*
     423              :  * Zero all the LSNs we store for this slru page.
     424              :  *
     425              :  * This should be called each time we create a new page, and each time we read
     426              :  * in a page from disk into an existing buffer.  (Such an old page cannot
     427              :  * have any interesting LSNs, since we'd have flushed them before writing
     428              :  * the page in the first place.)
     429              :  *
     430              :  * This assumes that InvalidXLogRecPtr is bitwise-all-0.
     431              :  */
     432              : static void
     433      7363490 : SimpleLruZeroLSNs(SlruCtl ctl, int slotno)
     434              : {
     435      7363490 :     SlruShared  shared = ctl->shared;
     436              : 
     437      7363490 :     if (shared->lsn_groups_per_page > 0)
     438       433260 :         MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0,
     439              :                shared->lsn_groups_per_page * sizeof(XLogRecPtr));
     440      7363490 : }
     441              : 
     442              : /*
     443              :  * This is a convenience wrapper for the common case of zeroing a page and
     444              :  * immediately flushing it to disk.
     445              :  *
     446              :  * SLRU bank lock is acquired and released here.
     447              :  */
     448              : void
     449          218 : SimpleLruZeroAndWritePage(SlruCtl ctl, int64 pageno)
     450              : {
     451              :     int         slotno;
     452              :     LWLock     *lock;
     453              : 
     454          218 :     lock = SimpleLruGetBankLock(ctl, pageno);
     455          218 :     LWLockAcquire(lock, LW_EXCLUSIVE);
     456              : 
     457              :     /* Create and zero the page */
     458          218 :     slotno = SimpleLruZeroPage(ctl, pageno);
     459              : 
     460              :     /* Make sure it's written out */
     461          218 :     SimpleLruWritePage(ctl, slotno);
     462              :     Assert(!ctl->shared->page_dirty[slotno]);
     463              : 
     464          218 :     LWLockRelease(lock);
     465          218 : }
     466              : 
     467              : /*
     468              :  * Wait for any active I/O on a page slot to finish.  (This does not
     469              :  * guarantee that new I/O hasn't been started before we return, though.
     470              :  * In fact the slot might not even contain the same page anymore.)
     471              :  *
     472              :  * Bank lock must be held at entry, and will be held at exit.
     473              :  */
     474              : static void
     475            2 : SimpleLruWaitIO(SlruCtl ctl, int slotno)
     476              : {
     477            2 :     SlruShared  shared = ctl->shared;
     478            2 :     int         bankno = SlotGetBankNumber(slotno);
     479              : 
     480              :     Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
     481              : 
     482              :     /* See notes at top of file */
     483            2 :     LWLockRelease(&shared->bank_locks[bankno].lock);
     484            2 :     LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED);
     485            2 :     LWLockRelease(&shared->buffer_locks[slotno].lock);
     486            2 :     LWLockAcquire(&shared->bank_locks[bankno].lock, LW_EXCLUSIVE);
     487              : 
     488              :     /*
     489              :      * If the slot is still in an io-in-progress state, then either someone
     490              :      * already started a new I/O on the slot, or a previous I/O failed and
     491              :      * neglected to reset the page state.  That shouldn't happen, really, but
     492              :      * it seems worth a few extra cycles to check and recover from it. We can
     493              :      * cheaply test for failure by seeing if the buffer lock is still held (we
     494              :      * assume that transaction abort would release the lock).
     495              :      */
     496            2 :     if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
     497            2 :         shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS)
     498              :     {
     499            0 :         if (LWLockConditionalAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED))
     500              :         {
     501              :             /* indeed, the I/O must have failed */
     502            0 :             if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS)
     503            0 :                 shared->page_status[slotno] = SLRU_PAGE_EMPTY;
     504              :             else                /* write_in_progress */
     505              :             {
     506            0 :                 shared->page_status[slotno] = SLRU_PAGE_VALID;
     507            0 :                 shared->page_dirty[slotno] = true;
     508              :             }
     509            0 :             LWLockRelease(&shared->buffer_locks[slotno].lock);
     510              :         }
     511              :     }
     512            2 : }
     513              : 
     514              : /*
     515              :  * Find a page in a shared buffer, reading it in if necessary.
     516              :  * The page number must correspond to an already-initialized page.
     517              :  *
     518              :  * If write_ok is true then it is OK to return a page that is in
     519              :  * WRITE_IN_PROGRESS state; it is the caller's responsibility to be sure
     520              :  * that modification of the page is safe.  If write_ok is false then we
     521              :  * will not return the page until it is not undergoing active I/O.
     522              :  *
     523              :  * If the read fails, the slot is set back to EMPTY and the error is raised
     524              :  * via SlruReportIOError; 'opaque_data' is handed to it only so that the
     525              :  * 'errdetail_for_io_error' callback can describe the failed operation.
     526              :  *
     527              :  * Return value is the shared-buffer slot number now holding the page.
     528              :  * The buffer's LRU access info and the SLRU hit/read stats are updated.
     529              :  *
     530              :  * The bank lock must be held exclusively at entry, and will be held at exit.
     531              :  */
     532              : int
     533       388906 : SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok,
     534              :                   const void *opaque_data)
     535              : {
     536       388906 :     SlruShared  shared = ctl->shared;
     537       388906 :     LWLock     *banklock = SimpleLruGetBankLock(ctl, pageno);
     538              : 
     539              :     Assert(LWLockHeldByMeInMode(banklock, LW_EXCLUSIVE));
     540              : 
     541              :     /* Outer loop handles restart if we must wait for someone else's I/O */
     542              :     for (;;)
     543            2 :     {
     544              :         int         slotno;
     545              :         bool        ok;
     546              : 
     547              :         /* See if page already is in memory; if not, pick victim slot */
     548       388908 :         slotno = SlruSelectLRUPage(ctl, pageno);
     549              : 
     550              :         /* Did we find the page in memory? */
     551       388908 :         if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
     552       387816 :             shared->page_number[slotno] == pageno)
     553              :         {
     554              :             /*
     555              :              * If page is still being read in, we must wait for I/O.  Likewise
     556              :              * if the page is being written and the caller said that's not OK.
     557              :              */
     558       371312 :             if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
     559       371312 :                 (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
     560            3 :                  !write_ok))
     561              :             {
     562            2 :                 SimpleLruWaitIO(ctl, slotno);
     563              :                 /* Now we must recheck state from the top */
     564            2 :                 continue;
     565              :             }
     566              :             /* Otherwise, it's ready to use */
     567       371310 :             SlruRecentlyUsed(shared, slotno);
     568              : 
     569              :             /* update the stats counter of pages found in the SLRU */
     570       371310 :             pgstat_count_slru_blocks_hit(shared->slru_stats_idx);
     571              : 
     572       371310 :             return slotno;
     573              :         }
     574              : 
     575              :         /* We found no match; assert we selected a freeable slot */
     576              :         Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
     577              :                (shared->page_status[slotno] == SLRU_PAGE_VALID &&
     578              :                 !shared->page_dirty[slotno]));
     579              : 
     580              :         /* Claim the slot for this page and mark it read-busy */
     581        17596 :         shared->page_number[slotno] = pageno;
     582        17596 :         shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS;
     583        17596 :         shared->page_dirty[slotno] = false;
     584              : 
     585              :         /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
     586        17596 :         LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
     587              : 
     588              :         /* Release bank lock while doing I/O */
     589        17596 :         LWLockRelease(banklock);
     590              : 
     591              :         /* Do the read; on failure the cause is saved for SlruReportIOError */
     592        17596 :         ok = SlruPhysicalReadPage(ctl, pageno, slotno);
     593              : 
     594              :         /* Set the LSNs for this newly read-in page to zero */
     595        17596 :         SimpleLruZeroLSNs(ctl, slotno);
     596              : 
     597              :         /* Re-acquire bank control lock and update page state */
     598        17596 :         LWLockAcquire(banklock, LW_EXCLUSIVE);
     599              : 
     600              :         Assert(shared->page_number[slotno] == pageno &&
     601              :                shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS &&
     602              :                !shared->page_dirty[slotno]);
     603              : 
     604        17596 :         shared->page_status[slotno] = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY;
     605              : 
     606        17596 :         LWLockRelease(&shared->buffer_locks[slotno].lock);
     607              : 
     608              :         /* Shared state is consistent again, so it's okay to ereport if we failed */
     609        17596 :         if (!ok)
     610            1 :             SlruReportIOError(ctl, pageno, opaque_data);
     611              : 
     612        17595 :         SlruRecentlyUsed(shared, slotno);
     613              : 
     614              :         /* update the stats counter of pages not found in SLRU */
     615        17595 :         pgstat_count_slru_blocks_read(shared->slru_stats_idx);
     616              : 
     617        17595 :         return slotno;
     618              :     }
     619              : }
     620              : 
     621              : /*
     622              :  * Find a page in a shared buffer, reading it in if necessary.
     623              :  * The page number must correspond to an already-initialized page.
     624              :  * The caller must intend only read-only access to the page.
     625              :  *
     626              :  * The fast path only scans the page's bank under a shared lock; on a miss
     627              :  * (or a read still in progress) we retake the lock exclusively and call
     628              :  * SimpleLruReadPage, which uses 'opaque_data' for error reporting only.
     629              :  *
     630              :  * Return value is the shared-buffer slot number now holding the page.
     631              :  * The buffer's LRU access info is updated.
     632              :  *
     633              :  * Bank control lock must NOT be held at entry, but will be held at exit.
     634              :  * It is unspecified whether the lock will be shared or exclusive.
     635              :  */
     636              : int
     637       869510 : SimpleLruReadPage_ReadOnly(SlruCtl ctl, int64 pageno, const void *opaque_data)
     638              : {
     639       869510 :     SlruShared  shared = ctl->shared;
     640       869510 :     LWLock     *banklock = SimpleLruGetBankLock(ctl, pageno);
     641       869510 :     int         bankno = pageno % ctl->nbanks;
     642       869510 :     int         bankstart = bankno * SLRU_BANK_SIZE;
     643       869510 :     int         bankend = bankstart + SLRU_BANK_SIZE;
     644              : 
     645              :     /* Try to find the page while holding only shared lock */
     646       869510 :     LWLockAcquire(banklock, LW_SHARED);
     647              : 
     648              :     /* See if page is already in a buffer of this bank */
     649       876218 :     for (int slotno = bankstart; slotno < bankend; slotno++)
     650              :     {
     651       875996 :         if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
     652       875022 :             shared->page_number[slotno] == pageno &&
     653       869288 :             shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
     654              :         {
     655              :             /* See comments for SlruRecentlyUsed() */
     656       869288 :             SlruRecentlyUsed(shared, slotno);
     657              : 
     658              :             /* update the stats counter of pages found in the SLRU */
     659       869288 :             pgstat_count_slru_blocks_hit(shared->slru_stats_idx);
     660              : 
     661       869288 :             return slotno;
     662              :         }
     663              :     }
     664              : 
     665              :     /* No luck, so switch to normal exclusive lock and do regular read */
     666          222 :     LWLockRelease(banklock);
     667          222 :     LWLockAcquire(banklock, LW_EXCLUSIVE);
     668              : 
     669          222 :     return SimpleLruReadPage(ctl, pageno, true, opaque_data);
     670              : }
     671              : 
     672              : /*
     673              :  * Write a page from a shared buffer, if necessary; does nothing if the slot
     674              :  * is not dirty.  A failed write is reported through SlruReportIOError.
     675              :  *
     676              :  * NOTE: only one write attempt is made here.  Hence, it is possible that
     677              :  * the page is still dirty at exit (if someone else re-dirtied it during
     678              :  * the write).  However, we *do* attempt a fresh write even if the page
     679              :  * is already being written; this is for checkpoints.
     680              :  *
     681              :  * Bank lock must be held exclusively at entry, and will be held at exit.
     682              :  */
     683              : static void
     684      7349916 : SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata)
     685              : {
     686      7349916 :     SlruShared  shared = ctl->shared;
     687      7349916 :     int64       pageno = shared->page_number[slotno];
     688      7349916 :     int         bankno = SlotGetBankNumber(slotno);
     689              :     bool        ok;
     690              : 
     691              :     Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
     692              :     Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(ctl, pageno), LW_EXCLUSIVE));
     693              : 
     694              :     /* If a write is in progress, wait for it to finish */
     695      7349916 :     while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
     696            0 :            shared->page_number[slotno] == pageno)
     697              :     {
     698            0 :         SimpleLruWaitIO(ctl, slotno);
     699              :     }
     700              : 
     701              :     /*
     702              :      * Do nothing if page is not dirty, or if buffer no longer contains the
     703              :      * same page we were called for.
     704              :      */
     705      7349916 :     if (!shared->page_dirty[slotno] ||
     706      7346594 :         shared->page_status[slotno] != SLRU_PAGE_VALID ||
     707      7346594 :         shared->page_number[slotno] != pageno)
     708         3322 :         return;
     709              : 
     710              :     /*
     711              :      * Mark the slot write-busy, and clear the dirtybit.  After this point, a
     712              :      * transaction status update on this page will mark it dirty again.
     713              :      */
     714      7346594 :     shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
     715      7346594 :     shared->page_dirty[slotno] = false;
     716              : 
     717              :     /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
     718      7346594 :     LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
     719              : 
     720              :     /* Release bank lock while doing I/O */
     721      7346594 :     LWLockRelease(&shared->bank_locks[bankno].lock);
     722              : 
     723              :     /* Do the write */
     724      7346594 :     ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata);
     725              : 
     726              :     /* If we failed, and we're in a flush, close every file tracked in fdata */
     727      7346594 :     if (!ok && fdata)
     728              :     {
     729            0 :         for (int i = 0; i < fdata->num_files; i++)
     730            0 :             CloseTransientFile(fdata->fd[i]);
     731              :     }
     732              : 
     733              :     /* Re-acquire bank lock and update page state */
     734      7346594 :     LWLockAcquire(&shared->bank_locks[bankno].lock, LW_EXCLUSIVE);
     735              : 
     736              :     Assert(shared->page_number[slotno] == pageno &&
     737              :            shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS);
     738              : 
     739              :     /* If we failed to write, mark the page dirty again */
     740      7346594 :     if (!ok)
     741            0 :         shared->page_dirty[slotno] = true;
     742              : 
     743      7346594 :     shared->page_status[slotno] = SLRU_PAGE_VALID;
     744              : 
     745      7346594 :     LWLockRelease(&shared->buffer_locks[slotno].lock);
     746              : 
     747              :     /* Now it's okay to ereport if we failed */
     748      7346594 :     if (!ok)
     749            0 :         SlruReportIOError(ctl, pageno, NULL);
     750              : 
     751              :     /* If part of a flush (fdata given), count this as a SLRU buffer written. */
     752      7346594 :     if (fdata)
     753              :     {
     754         3004 :         CheckpointStats.ckpt_slru_written++;
     755         3004 :         PendingCheckpointerStats.slru_written++;
     756              :     }
     757              : }
     758              : 
     759              : /*
     760              :  * Write a non-empty slot via SlruInternalWritePage, for external callers.
     761              :  * fdata is always passed as NULL here, making it a standalone write.
     762              :  */
     763              : void
     764          316 : SimpleLruWritePage(SlruCtl ctl, int slotno)
     765              : {
     766              :     Assert(ctl->shared->page_status[slotno] != SLRU_PAGE_EMPTY);
     767              : 
     768          316 :     SlruInternalWritePage(ctl, slotno, NULL);
     769          316 : }
     770              : 
     771              : /*
     772              :  * Return whether the given page exists on disk.
     773              :  *
     774              :  * A false return means that either the file does not exist, or that it's not
     775              :  * large enough to contain the given page (or, as coded, that close failed).
     776              :  */
     777              : bool
     778           66 : SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int64 pageno)
     779              : {
     780           66 :     int64       segno = pageno / SLRU_PAGES_PER_SEGMENT;
     781           66 :     int         rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
     782           66 :     int         offset = rpageno * BLCKSZ;
     783              :     char        path[MAXPGPATH];
     784              :     int         fd;
     785              :     bool        result;
     786              :     off_t       endpos;
     787              : 
     788              :     /* update the stats counter of checked pages */
     789           66 :     pgstat_count_slru_blocks_exists(ctl->shared->slru_stats_idx);
     790              : 
     791           66 :     SlruFileName(ctl, path, segno);
     792              : 
     793           66 :     fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
     794           66 :     if (fd < 0)
     795              :     {
     796              :         /* expected: file doesn't exist */
     797           26 :         if (errno == ENOENT)
     798           26 :             return false;
     799              : 
     800              :         /* any other open failure is reported as an error */
     801            0 :         slru_errcause = SLRU_OPEN_FAILED;
     802            0 :         slru_errno = errno;
     803            0 :         SlruReportIOError(ctl, pageno, NULL);
     804              :     }
     805              : 
     806           40 :     if ((endpos = lseek(fd, 0, SEEK_END)) < 0)
     807              :     {
     808            0 :         slru_errcause = SLRU_SEEK_FAILED;
     809            0 :         slru_errno = errno;
     810            0 :         SlruReportIOError(ctl, pageno, NULL);
     811              :     }
     812              : 
     813           40 :     result = endpos >= (off_t) (offset + BLCKSZ);
     814              : 
     815           40 :     if (CloseTransientFile(fd) != 0)
     816              :     {
     817            0 :         slru_errcause = SLRU_CLOSE_FAILED;
     818            0 :         slru_errno = errno;
     819            0 :         return false;
     820              :     }
     821              : 
     822           40 :     return result;
     823              : }
     824              : 
     825              : /*
     826              :  * Physical read of a (previously existing) page into a buffer slot
     827              :  *
     828              :  * On failure, we cannot just ereport(ERROR) since caller has put state in
     829              :  * shared memory that must be undone.  So, we return false and leave the
     830              :  * cause and errno in slru_errcause/slru_errno for SlruReportIOError.
     831              :  *
     832              :  * For now, assume it's not worth keeping a file pointer open across
     833              :  * read/write operations.  We could cache one virtual file pointer ...
     834              :  */
     835              : static bool
     836        17596 : SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
     837              : {
     838        17596 :     SlruShared  shared = ctl->shared;
     839        17596 :     int64       segno = pageno / SLRU_PAGES_PER_SEGMENT;
     840        17596 :     int         rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
     841        17596 :     off_t       offset = rpageno * BLCKSZ;
     842              :     char        path[MAXPGPATH];
     843              :     int         fd;
     844              : 
     845        17596 :     SlruFileName(ctl, path, segno);
     846              : 
     847              :     /*
     848              :      * In a crash-and-restart situation, it's possible for us to receive
     849              :      * commands to set the commit status of transactions whose bits are in
     850              :      * already-truncated segments of the commit log (see notes in
     851              :      * SlruPhysicalWritePage).  Hence, if we are InRecovery, allow the case
     852              :      * where the file doesn't exist, and fill the buffer with zeroes instead.
     853              :      */
     854        17596 :     fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
     855        17596 :     if (fd < 0)
     856              :     {
     857            1 :         if (errno != ENOENT || !InRecovery)
     858              :         {
     859            1 :             slru_errcause = SLRU_OPEN_FAILED;
     860            1 :             slru_errno = errno;
     861            1 :             return false;
     862              :         }
     863              : 
     864            0 :         ereport(LOG,
     865              :                 (errmsg("file \"%s\" doesn't exist, reading as zeroes",
     866              :                         path)));
     867            0 :         MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
     868            0 :         return true;
     869              :     }
     870              : 
     871        17595 :     errno = 0;
     872        17595 :     pgstat_report_wait_start(WAIT_EVENT_SLRU_READ);
     873        17595 :     if (pg_pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
     874              :     {
     875            0 :         pgstat_report_wait_end();
     876            0 :         slru_errcause = SLRU_READ_FAILED;
     877            0 :         slru_errno = errno;
     878            0 :         CloseTransientFile(fd);
     879            0 :         return false;
     880              :     }
     881        17595 :     pgstat_report_wait_end();
     882              : 
     883        17595 :     if (CloseTransientFile(fd) != 0)
     884              :     {
     885            0 :         slru_errcause = SLRU_CLOSE_FAILED;
     886            0 :         slru_errno = errno;
     887            0 :         return false;
     888              :     }
     889              : 
     890        17595 :     return true;
     891              : }
     892              : 
     893              : /*
     894              :  * Physical write of a page from a buffer slot
     895              :  *
     896              :  * On failure, we cannot just ereport(ERROR) since caller has put state in
     897              :  * shared memory that must be undone.  So, we return false and save enough
     898              :  * info in static variables to let SlruReportIOError make the report.
     899              :  *
     900              :  * For now, assume it's not worth keeping a file pointer open across
     901              :  * independent read/write operations.  We do batch operations during
     902              :  * SimpleLruWriteAll, though.
     903              :  *
     904              :  * fdata is NULL for a standalone write, pointer to open-file info during
     905              :  * SimpleLruWriteAll.  A descriptor recorded in fdata is never closed here,
                      :  * not even on failure; SlruInternalWritePage closes all of fdata's files
                      :  * when we return false, so closing one here would close it twice.
     906              :  */
     907              : static bool
     908      7346594 : SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
     909              : {
     910      7346594 :     SlruShared  shared = ctl->shared;
     911      7346594 :     int64       segno = pageno / SLRU_PAGES_PER_SEGMENT;
     912      7346594 :     int         rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
     913      7346594 :     off_t       offset = rpageno * BLCKSZ;
     914              :     char        path[MAXPGPATH];
     915      7346594 :     int         fd = -1;
     916              : 
     917              :     /* update the stats counter of written pages */
     918      7346594 :     pgstat_count_slru_blocks_written(shared->slru_stats_idx);
     919              : 
     920              :     /*
     921              :      * Honor the write-WAL-before-data rule, if appropriate, so that we do not
     922              :      * write out data before associated WAL records.  This is the same action
     923              :      * performed during FlushBuffer() in the main buffer manager.
     924              :      */
     925      7346594 :     if (shared->group_lsn != NULL)
     926              :     {
     927              :         /*
     928              :          * We must determine the largest async-commit LSN for the page. This
     929              :          * is a bit tedious, but since this entire function is a slow path
     930              :          * anyway, it seems better to do this here than to maintain a per-page
     931              :          * LSN variable (which'd need an extra comparison in the
     932              :          * transaction-commit path).
     933              :          */
     934              :         XLogRecPtr  max_lsn;
     935              :         int         lsnindex;
     936              : 
     937       433377 :         lsnindex = slotno * shared->lsn_groups_per_page;
     938       433377 :         max_lsn = shared->group_lsn[lsnindex++];
     939    443778048 :         for (int lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
     940              :         {
     941    443344671 :             XLogRecPtr  this_lsn = shared->group_lsn[lsnindex++];
     942              : 
     943    443344671 :             if (max_lsn < this_lsn)
     944        50772 :                 max_lsn = this_lsn;
     945              :         }
     946              : 
     947       433377 :         if (XLogRecPtrIsValid(max_lsn))
     948              :         {
     949              :             /*
     950              :              * As noted above, elog(ERROR) is not acceptable here, so if
     951              :              * XLogFlush were to fail, we must PANIC.  This isn't much of a
     952              :              * restriction because XLogFlush is just about all critical
     953              :              * section anyway, but let's make sure.
     954              :              */
     955          546 :             START_CRIT_SECTION();
     956          546 :             XLogFlush(max_lsn);
     957          546 :             END_CRIT_SECTION();
     958              :         }
     959              :     }
     960              : 
     961              :     /*
     962              :      * During a SimpleLruWriteAll, we may already have the desired file open.
     963              :      */
     964      7346594 :     if (fdata)
     965              :     {
     966         3090 :         for (int i = 0; i < fdata->num_files; i++)
     967              :         {
     968          325 :             if (fdata->segno[i] == segno)
     969              :             {
     970          239 :                 fd = fdata->fd[i];
     971          239 :                 break;
     972              :             }
     973              :         }
     974              :     }
     975              : 
     976      7346594 :     if (fd < 0)
     977              :     {
     978              :         /*
     979              :          * If the file doesn't already exist, we should create it.  It is
     980              :          * possible for this to need to happen when writing a page that's not
     981              :          * first in its segment; we assume the OS can cope with that. (Note:
     982              :          * it might seem that it'd be okay to create files only when
     983              :          * SimpleLruZeroPage is called for the first page of a segment.
     984              :          * However, if after a crash and restart the REDO logic elects to
     985              :          * replay the log from a checkpoint before the latest one, then it's
     986              :          * possible that we will get commands to set transaction status of
     987              :          * transactions that have already been truncated from the commit log.
     988              :          * Easiest way to deal with that is to accept references to
     989              :          * nonexistent files here and in SlruPhysicalReadPage.)
     990              :          *
     991              :          * Note: it is possible for more than one backend to be executing this
     992              :          * code simultaneously for different pages of the same file. Hence,
     993              :          * don't use O_EXCL or O_TRUNC or anything like that.
     994              :          */
     995      7346355 :         SlruFileName(ctl, path, segno);
     996      7346355 :         fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY);
     997      7346355 :         if (fd < 0)
     998              :         {
     999            0 :             slru_errcause = SLRU_OPEN_FAILED;
    1000            0 :             slru_errno = errno;
    1001            0 :             return false;
    1002              :         }
    1003              : 
    1004      7346355 :         if (fdata)
    1005              :         {
    1006         2765 :             if (fdata->num_files < MAX_WRITEALL_BUFFERS)
    1007              :             {
    1008         2765 :                 fdata->fd[fdata->num_files] = fd;
    1009         2765 :                 fdata->segno[fdata->num_files] = segno;
    1010         2765 :                 fdata->num_files++;
    1011              :             }
    1012              :             else
    1013              :             {
    1014              :                 /*
    1015              :                  * In the unlikely event that we exceed MAX_WRITEALL_BUFFERS,
    1016              :                  * fall back to treating it as a standalone write.
    1017              :                  */
    1018            0 :                 fdata = NULL;
    1019              :             }
    1020              :         }
    1021              :     }
    1022              : 
    1023      7346594 :     errno = 0;
    1024      7346594 :     pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
    1025      7346594 :     if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
    1026              :     {
    1027            0 :         pgstat_report_wait_end();
    1028              :         /* if write didn't set errno, assume problem is no disk space */
    1029            0 :         if (errno == 0)
    1030            0 :             errno = ENOSPC;
    1031            0 :         slru_errcause = SLRU_WRITE_FAILED;
    1032            0 :         slru_errno = errno;
    1033            0 :         if (!fdata)
    1034            0 :             CloseTransientFile(fd);
    1035            0 :         return false;
    1036              :     }
    1037      7346594 :     pgstat_report_wait_end();
    1038              : 
    1039              :     /* Queue up a sync request for the checkpointer. */
    1040      7346594 :     if (ctl->sync_handler != SYNC_HANDLER_NONE)
    1041              :     {
    1042              :         FileTag     tag;
    1043              : 
    1044       434279 :         INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
    1045       434279 :         if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
    1046              :         {
    1047              :             /* No space to enqueue sync request.  Do it synchronously. */
    1048            6 :             pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC);
    1049            6 :             if (pg_fsync(fd) != 0)
    1050              :             {
    1051            0 :                 pgstat_report_wait_end();
    1052            0 :                 slru_errcause = SLRU_FSYNC_FAILED;
    1053            0 :                 slru_errno = errno;
                      :                 /* files tracked in fdata are closed by our caller */
                      :                 if (!fdata)
    1054            0 :                     CloseTransientFile(fd);
    1055            0 :                 return false;
    1056              :             }
    1057            6 :             pgstat_report_wait_end();
    1058              :         }
    1059              :     }
    1060              : 
    1061              :     /* Close file, unless part of flush request. */
    1062      7346594 :     if (!fdata)
    1063              :     {
    1064      7343590 :         if (CloseTransientFile(fd) != 0)
    1065              :         {
    1066            0 :             slru_errcause = SLRU_CLOSE_FAILED;
    1067            0 :             slru_errno = errno;
    1068            0 :             return false;
    1069              :         }
    1070              :     }
    1071              : 
    1072      7346594 :     return true;
    1073              : }
    1074              : 
    1075              : /*
    1076              :  * Issue the error message after failure of SlruPhysicalReadPage or
    1077              :  * SlruPhysicalWritePage.  Call this after cleaning up shared-memory state.
    1078              :  */
    1079              : static void
    1080            1 : SlruReportIOError(SlruCtl ctl, int64 pageno, const void *opaque_data)
    1081              : {
    1082            1 :     int64       segno = pageno / SLRU_PAGES_PER_SEGMENT;
    1083            1 :     int         rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
    1084            1 :     int         offset = rpageno * BLCKSZ;
    1085              :     char        path[MAXPGPATH];
    1086              : 
    1087            1 :     SlruFileName(ctl, path, segno);
    1088            1 :     errno = slru_errno;
    1089            1 :     switch (slru_errcause)
    1090              :     {
    1091            1 :         case SLRU_OPEN_FAILED:
    1092            1 :             ereport(ERROR,
    1093              :                     (errcode_for_file_access(),
    1094              :                      errmsg("could not open file \"%s\": %m", path),
    1095              :                      opaque_data ? ctl->errdetail_for_io_error(opaque_data) : 0));
    1096              :             break;
    1097            0 :         case SLRU_SEEK_FAILED:
    1098            0 :             ereport(ERROR,
    1099              :                     (errcode_for_file_access(),
    1100              :                      errmsg("could not seek in file \"%s\" to offset %d: %m",
    1101              :                             path, offset),
    1102              :                      opaque_data ? ctl->errdetail_for_io_error(opaque_data) : 0));
    1103              :             break;
    1104            0 :         case SLRU_READ_FAILED:
    1105            0 :             if (errno)
    1106            0 :                 ereport(ERROR,
    1107              :                         (errcode_for_file_access(),
    1108              :                          errmsg("could not read from file \"%s\" at offset %d: %m",
    1109              :                                 path, offset),
    1110              :                          opaque_data ? ctl->errdetail_for_io_error(opaque_data) : 0));
    1111              :             else
    1112            0 :                 ereport(ERROR,
    1113              :                         (errmsg("could not read from file \"%s\" at offset %d: read too few bytes",
    1114              :                                 path, offset),
    1115              :                          opaque_data ? ctl->errdetail_for_io_error(opaque_data) : 0));
    1116              :             break;
    1117            0 :         case SLRU_WRITE_FAILED:
    1118            0 :             if (errno)
    1119            0 :                 ereport(ERROR,
    1120              :                         (errcode_for_file_access(),
    1121              :                          errmsg("could not write to file \"%s\" at offset %d: %m",
    1122              :                                 path, offset),
    1123              :                          opaque_data ? ctl->errdetail_for_io_error(opaque_data) : 0));
    1124              :             else
    1125            0 :                 ereport(ERROR,
    1126              :                         (errmsg("could not write to file \"%s\" at offset %d: wrote too few bytes",
    1127              :                                 path, offset),
    1128              :                          opaque_data ? ctl->errdetail_for_io_error(opaque_data) : 0));
    1129              :             break;
    1130            0 :         case SLRU_FSYNC_FAILED:
    1131            0 :             ereport(data_sync_elevel(ERROR),
    1132              :                     (errcode_for_file_access(),
    1133              :                      errmsg("could not fsync file \"%s\": %m",
    1134              :                             path),
    1135              :                      opaque_data ? ctl->errdetail_for_io_error(opaque_data) : 0));
    1136            0 :             break;
    1137            0 :         case SLRU_CLOSE_FAILED:
    1138            0 :             ereport(ERROR,
    1139              :                     (errcode_for_file_access(),
    1140              :                      errmsg("could not close file \"%s\": %m",
    1141              :                             path),
    1142              :                      opaque_data ? ctl->errdetail_for_io_error(opaque_data) : 0));
    1143              :             break;
    1144            0 :         default:
    1145              :             /* can't get here, we trust */
    1146            0 :             elog(ERROR, "unrecognized SimpleLru error cause: %d",
    1147              :                  (int) slru_errcause);
    1148              :             break;
    1149              :     }
    1150            0 : }
    1151              : 
    1152              : /*
    1153              :  * Mark a buffer slot "most recently used".
    1154              :  */
    1155              : static inline void
    1156      8604087 : SlruRecentlyUsed(SlruShared shared, int slotno)
    1157              : {
    1158      8604087 :     int         bankno = SlotGetBankNumber(slotno);
    1159      8604087 :     int         new_lru_count = shared->bank_cur_lru_count[bankno];
    1160              : 
    1161              :     Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
    1162              : 
    1163              :     /*
    1164              :      * The reason for the if-test is that there are often many consecutive
    1165              :      * accesses to the same page (particularly the latest page).  By
    1166              :      * suppressing useless increments of bank_cur_lru_count, we reduce the
    1167              :      * probability that old pages' counts will "wrap around" and make them
    1168              :      * appear recently used.
    1169              :      *
    1170              :      * We allow this code to be executed concurrently by multiple processes
    1171              :      * within SimpleLruReadPage_ReadOnly().  As long as int reads and writes
    1172              :      * are atomic, this should not cause any completely-bogus values to enter
    1173              :      * the computation.  However, it is possible for either bank_cur_lru_count
    1174              :      * or individual page_lru_count entries to be "reset" to lower values than
    1175              :      * they should have, in case a process is delayed while it executes this
    1176              :      * function.  With care in SlruSelectLRUPage(), this does little harm, and
    1177              :      * in any case the absolute worst possible consequence is a nonoptimal
    1178              :      * choice of page to evict.  The gain from allowing concurrent reads of
    1179              :      * SLRU pages seems worth it.
    1180              :      */
    1181      8604087 :     if (new_lru_count != shared->page_lru_count[slotno])
    1182              :     {
    1183      7484084 :         shared->bank_cur_lru_count[bankno] = ++new_lru_count;
    1184      7484084 :         shared->page_lru_count[slotno] = new_lru_count;
    1185              :     }
    1186      8604087 : }
    1187              : 
    1188              : /*
    1189              :  * Select the slot to re-use when we need a free slot for the given page.
    1190              :  *
    1191              :  * The target page number is passed not only because we need to know the
    1192              :  * correct bank to use, but also because we need to consider the possibility
    1193              :  * that some other process reads in the target page while we are doing I/O to
    1194              :  * free a slot.  Hence, check or recheck to see if any slot already holds the
    1195              :  * target page, and return that slot if so.  Thus, the returned slot is
    1196              :  * *either* a slot already holding the pageno (could be any state except
    1197              :  * EMPTY), *or* a freeable slot (state EMPTY or CLEAN).
    1198              :  *
    1199              :  * The correct bank lock must be held at entry, and will be held at exit.
    1200              :  */
    1201              : static int
    1202      7734802 : SlruSelectLRUPage(SlruCtl ctl, int64 pageno)
    1203              : {
    1204      7734802 :     SlruShared  shared = ctl->shared;
    1205              : 
    1206              :     /* Outer loop handles restart after I/O */
    1207              :     for (;;)
    1208      7343191 :     {
    1209              :         int         cur_count;
    1210     15077993 :         int         bestvalidslot = 0;  /* keep compiler quiet */
    1211     15077993 :         int         best_valid_delta = -1;
    1212     15077993 :         int64       best_valid_page_number = 0; /* keep compiler quiet */
    1213     15077993 :         int         bestinvalidslot = 0;    /* keep compiler quiet */
    1214     15077993 :         int         best_invalid_delta = -1;
    1215     15077993 :         int64       best_invalid_page_number = 0;   /* keep compiler quiet */
    1216     15077993 :         int         bankno = pageno % ctl->nbanks;
    1217     15077993 :         int         bankstart = bankno * SLRU_BANK_SIZE;
    1218     15077993 :         int         bankend = bankstart + SLRU_BANK_SIZE;
    1219              : 
    1220              :         Assert(LWLockHeldByMe(SimpleLruGetBankLock(ctl, pageno)));
    1221              : 
    1222              :         /* See if page already has a buffer assigned */
    1223    251020458 :         for (int slotno = bankstart; slotno < bankend; slotno++)
    1224              :         {
    1225    236313978 :             if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
    1226    236259506 :                 shared->page_number[slotno] == pageno)
    1227       371513 :                 return slotno;
    1228              :         }
    1229              : 
    1230              :         /*
    1231              :          * If we find any EMPTY slot, just select that one. Else choose a
    1232              :          * victim page to replace.  We normally take the least recently used
    1233              :          * valid page, but we will never take the slot containing
    1234              :          * latest_page_number, even if it appears least recently used.  We
    1235              :          * will select a slot that is already I/O busy only if there is no
    1236              :          * other choice: a read-busy slot will not be least recently used once
    1237              :          * the read finishes, and waiting for an I/O on a write-busy slot is
    1238              :          * inferior to just picking some other slot.  Testing shows the slot
    1239              :          * we pick instead will often be clean, allowing us to begin a read at
    1240              :          * once.
    1241              :          *
    1242              :          * Normally the page_lru_count values will all be different and so
    1243              :          * there will be a well-defined LRU page.  But since we allow
    1244              :          * concurrent execution of SlruRecentlyUsed() within
    1245              :          * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages
    1246              :          * acquire the same lru_count values.  In that case we break ties by
    1247              :          * choosing the furthest-back page.
    1248              :          *
    1249              :          * Notice that this next line forcibly advances cur_lru_count to a
    1250              :          * value that is certainly beyond any value that will be in the
    1251              :          * page_lru_count array after the loop finishes.  This ensures that
    1252              :          * the next execution of SlruRecentlyUsed will mark the page newly
    1253              :          * used, even if it's for a page that has the current counter value.
    1254              :          * That gets us back on the path to having good data when there are
    1255              :          * multiple pages with the same lru_count.
    1256              :          */
    1257     14706480 :         cur_count = (shared->bank_cur_lru_count[bankno])++;
    1258    249955757 :         for (int slotno = bankstart; slotno < bankend; slotno++)
    1259              :         {
    1260              :             int         this_delta;
    1261              :             int64       this_page_number;
    1262              : 
    1263    235252869 :             if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
    1264         3592 :                 return slotno;
    1265              : 
    1266    235249277 :             this_delta = cur_count - shared->page_lru_count[slotno];
    1267    235249277 :             if (this_delta < 0)
    1268              :             {
    1269              :                 /*
    1270              :                  * Clean up in case shared updates have caused cur_count
    1271              :                  * increments to get "lost".  We back off the page counts,
    1272              :                  * rather than trying to increase cur_count, to avoid any
    1273              :                  * question of infinite loops or failure in the presence of
    1274              :                  * wrapped-around counts.
    1275              :                  */
    1276            0 :                 shared->page_lru_count[slotno] = cur_count;
    1277            0 :                 this_delta = 0;
    1278              :             }
    1279              : 
    1280              :             /*
    1281              :              * If this page is the one most recently zeroed, don't consider it
    1282              :              * an eviction candidate. See comments in SimpleLruZeroPage for an
    1283              :              * explanation about the lack of a memory barrier here.
    1284              :              */
    1285    235249277 :             this_page_number = shared->page_number[slotno];
    1286    235249277 :             if (this_page_number ==
    1287    235249277 :                 pg_atomic_read_u64(&shared->latest_page_number))
    1288         8911 :                 continue;
    1289              : 
    1290    235240366 :             if (shared->page_status[slotno] == SLRU_PAGE_VALID)
    1291              :             {
    1292    235239910 :                 if (this_delta > best_valid_delta ||
    1293            0 :                     (this_delta == best_valid_delta &&
    1294            0 :                      ctl->PagePrecedes(this_page_number,
    1295              :                                        best_valid_page_number)))
    1296              :                 {
    1297     33198436 :                     bestvalidslot = slotno;
    1298     33198436 :                     best_valid_delta = this_delta;
    1299     33198436 :                     best_valid_page_number = this_page_number;
    1300              :                 }
    1301              :             }
    1302              :             else
    1303              :             {
    1304          456 :                 if (this_delta > best_invalid_delta ||
    1305            0 :                     (this_delta == best_invalid_delta &&
    1306            0 :                      ctl->PagePrecedes(this_page_number,
    1307              :                                        best_invalid_page_number)))
    1308              :                 {
    1309          456 :                     bestinvalidslot = slotno;
    1310          456 :                     best_invalid_delta = this_delta;
    1311          456 :                     best_invalid_page_number = this_page_number;
    1312              :                 }
    1313              :             }
    1314              :         }
    1315              : 
    1316              :         /*
    1317              :          * If all pages (except possibly the latest one) are I/O busy, we'll
    1318              :          * have to wait for an I/O to complete and then retry.  In that
    1319              :          * unhappy case, we choose to wait for the I/O on the least recently
    1320              :          * used slot, on the assumption that it was likely initiated first of
    1321              :          * all the I/Os in progress and may therefore finish first.
    1322              :          */
    1323     14702888 :         if (best_valid_delta < 0)
    1324              :         {
    1325            0 :             SimpleLruWaitIO(ctl, bestinvalidslot);
    1326            0 :             continue;
    1327              :         }
    1328              : 
    1329              :         /*
    1330              :          * If the selected page is clean, we're set.
    1331              :          */
    1332     14702888 :         if (!shared->page_dirty[bestvalidslot])
    1333      7359697 :             return bestvalidslot;
    1334              : 
    1335              :         /*
    1336              :          * Write the page.
    1337              :          */
    1338      7343191 :         SlruInternalWritePage(ctl, bestvalidslot, NULL);
    1339              : 
    1340              :         /*
    1341              :          * Now loop back and try again.  This is the easiest way of dealing
    1342              :          * with corner cases such as the victim page being re-dirtied while we
    1343              :          * wrote it.
    1344              :          */
    1345              :     }
    1346              : }
    1347              : 
    1348              : /*
    1349              :  * Write dirty pages to disk during checkpoint or database shutdown.  Flushing
    1350              :  * is deferred until the next call to ProcessSyncRequests(), though we do fsync
    1351              :  * the containing directory here to make sure that newly created directory
    1352              :  * entries are on disk.
    1353              :  */
    1354              : void
    1355         9227 : SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied)
    1356              : {
    1357         9227 :     SlruShared  shared = ctl->shared;
    1358              :     SlruWriteAllData fdata;
    1359         9227 :     int64       pageno = 0;
    1360         9227 :     int         prevbank = SlotGetBankNumber(0);
    1361              :     bool        ok;
    1362              : 
    1363              :     /* update the stats counter of flushes */
    1364         9227 :     pgstat_count_slru_flush(shared->slru_stats_idx);
    1365              : 
    1366              :     /*
    1367              :      * Find and write dirty pages
    1368              :      */
    1369         9227 :     fdata.num_files = 0;
    1370              : 
    1371         9227 :     LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
    1372              : 
    1373       224251 :     for (int slotno = 0; slotno < shared->num_slots; slotno++)
    1374              :     {
    1375       215024 :         int         curbank = SlotGetBankNumber(slotno);
    1376              : 
    1377              :         /*
    1378              :          * If the current bank lock is not the same as the previous bank lock then
    1379              :          * release the previous lock and acquire the new lock.
    1380              :          */
    1381       215024 :         if (curbank != prevbank)
    1382              :         {
    1383         4212 :             LWLockRelease(&shared->bank_locks[prevbank].lock);
    1384         4212 :             LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
    1385         4212 :             prevbank = curbank;
    1386              :         }
    1387              : 
    1388              :         /* Do nothing if slot is unused */
    1389       215024 :         if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
    1390       208698 :             continue;
    1391              : 
    1392         6326 :         SlruInternalWritePage(ctl, slotno, &fdata);
    1393              : 
    1394              :         /*
    1395              :          * In some places (e.g. checkpoints), we cannot assert that the slot
    1396              :          * is clean now, since another process might have re-dirtied it
    1397              :          * already.  That's okay.
    1398              :          */
    1399              :         Assert(allow_redirtied ||
    1400              :                shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
    1401              :                (shared->page_status[slotno] == SLRU_PAGE_VALID &&
    1402              :                 !shared->page_dirty[slotno]));
    1403              :     }
    1404              : 
    1405         9227 :     LWLockRelease(&shared->bank_locks[prevbank].lock);
    1406              : 
    1407              :     /*
    1408              :      * Now close any files that were open
    1409              :      */
    1410         9227 :     ok = true;
    1411        11992 :     for (int i = 0; i < fdata.num_files; i++)
    1412              :     {
    1413         2765 :         if (CloseTransientFile(fdata.fd[i]) != 0)
    1414              :         {
    1415            0 :             slru_errcause = SLRU_CLOSE_FAILED;
    1416            0 :             slru_errno = errno;
    1417            0 :             pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
    1418            0 :             ok = false;
    1419              :         }
    1420              :     }
    1421         9227 :     if (!ok)
    1422            0 :         SlruReportIOError(ctl, pageno, NULL);
    1423              : 
    1424              :     /* Ensure that directory entries for new files are on disk. */
    1425         9227 :     if (ctl->sync_handler != SYNC_HANDLER_NONE)
    1426         7388 :         fsync_fname(ctl->Dir, true);
    1427         9227 : }
    1428              : 
    1429              : /*
    1430              :  * Remove all segments before the one holding the passed page number
    1431              :  *
    1432              :  * All SLRUs prevent concurrent calls to this function, either with an LWLock
    1433              :  * or by calling it only as part of a checkpoint.  Mutual exclusion must begin
    1434              :  * before computing cutoffPage.  Mutual exclusion must end after any limit
    1435              :  * update that would permit other backends to write fresh data into the
    1436              :  * segment immediately preceding the one containing cutoffPage.  Otherwise,
    1437              :  * when the SLRU is quite full, SimpleLruTruncate() might delete that segment
    1438              :  * after it has accrued freshly-written data.
    1439              :  */
    1440              : void
    1441         1911 : SimpleLruTruncate(SlruCtl ctl, int64 cutoffPage)
    1442              : {
    1443         1911 :     SlruShared  shared = ctl->shared;
    1444              :     int         prevbank;
    1445              : 
    1446              :     /* update the stats counter of truncates */
    1447         1911 :     pgstat_count_slru_truncate(shared->slru_stats_idx);
    1448              : 
    1449              :     /*
    1450              :      * Scan shared memory and remove any pages preceding the cutoff page, to
    1451              :      * ensure we won't rewrite them later.  (Since this is normally called in
    1452              :      * or just after a checkpoint, any dirty pages should have been flushed
    1453              :      * already ... we're just being extra careful here.)
    1454              :      */
    1455         1994 : restart:
    1456              : 
    1457              :     /*
    1458              :      * An important safety check: the current endpoint page must not be
    1459              :      * eligible for removal.  This check is just a backstop against wraparound
    1460              :      * bugs elsewhere in SLRU handling, so we don't care if we read a slightly
    1461              :      * outdated value; therefore we don't add a memory barrier.
    1462              :      */
    1463         1994 :     if (ctl->PagePrecedes(pg_atomic_read_u64(&shared->latest_page_number),
    1464              :                           cutoffPage))
    1465              :     {
    1466            0 :         ereport(LOG,
    1467              :                 (errmsg("could not truncate directory \"%s\": apparent wraparound",
    1468              :                         ctl->Dir)));
    1469            0 :         return;
    1470              :     }
    1471              : 
    1472         1994 :     prevbank = SlotGetBankNumber(0);
    1473         1994 :     LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
    1474        47831 :     for (int slotno = 0; slotno < shared->num_slots; slotno++)
    1475              :     {
    1476        45920 :         int         curbank = SlotGetBankNumber(slotno);
    1477              : 
    1478              :         /*
    1479              :          * If the current bank lock is not the same as the previous bank lock then
    1480              :          * release the previous lock and acquire the new lock.
    1481              :          */
    1482        45920 :         if (curbank != prevbank)
    1483              :         {
    1484          917 :             LWLockRelease(&shared->bank_locks[prevbank].lock);
    1485          917 :             LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
    1486          917 :             prevbank = curbank;
    1487              :         }
    1488              : 
    1489        45920 :         if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
    1490        40497 :             continue;
    1491         5423 :         if (!ctl->PagePrecedes(shared->page_number[slotno], cutoffPage))
    1492         5116 :             continue;
    1493              : 
    1494              :         /*
    1495              :          * If page is clean, just change state to EMPTY (expected case).
    1496              :          */
    1497          307 :         if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
    1498          307 :             !shared->page_dirty[slotno])
    1499              :         {
    1500          224 :             shared->page_status[slotno] = SLRU_PAGE_EMPTY;
    1501          224 :             continue;
    1502              :         }
    1503              : 
    1504              :         /*
    1505              :          * Hmm, we have (or may have) I/O operations acting on the page, so
    1506              :          * we've got to wait for them to finish and then start again. This is
    1507              :          * the same logic as in SlruSelectLRUPage.  (XXX if page is dirty,
    1508              :          * wouldn't it be OK to just discard it without writing it?
    1509              :          * SlruMayDeleteSegment() uses a stricter qualification, so we might
    1510              :          * not delete this page in the end; even if we don't delete it, we
    1511              :          * won't have cause to read its data again.  For now, keep the logic
    1512              :          * the same as it was.)
    1513              :          */
    1514           83 :         if (shared->page_status[slotno] == SLRU_PAGE_VALID)
    1515           83 :             SlruInternalWritePage(ctl, slotno, NULL);
    1516              :         else
    1517            0 :             SimpleLruWaitIO(ctl, slotno);
    1518              : 
    1519           83 :         LWLockRelease(&shared->bank_locks[prevbank].lock);
    1520           83 :         goto restart;
    1521              :     }
    1522              : 
    1523         1911 :     LWLockRelease(&shared->bank_locks[prevbank].lock);
    1524              : 
    1525              :     /* Now we can remove the old segment(s) */
    1526         1911 :     (void) SlruScanDirectory(ctl, SlruScanDirCbDeleteCutoff, &cutoffPage);
    1527              : }
    1528              : 
    1529              : /*
    1530              :  * Delete an individual SLRU segment.
    1531              :  *
    1532              :  * NB: This does not touch the SLRU buffers themselves, callers have to ensure
    1533              :  * they either can't yet contain anything, or have already been cleaned out.
    1534              :  */
    1535              : static void
    1536       142003 : SlruInternalDeleteSegment(SlruCtl ctl, int64 segno)
    1537              : {
    1538              :     char        path[MAXPGPATH];
    1539              : 
    1540              :     /* Forget any fsync requests queued for this segment. */
    1541       142003 :     if (ctl->sync_handler != SYNC_HANDLER_NONE)
    1542              :     {
    1543              :         FileTag     tag;
    1544              : 
    1545        13224 :         INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
    1546        13224 :         RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true);
    1547              :     }
    1548              : 
    1549              :     /* Unlink the file. */
    1550       142003 :     SlruFileName(ctl, path, segno);
    1551       142003 :     ereport(DEBUG2, (errmsg_internal("removing file \"%s\"", path)));
    1552       142003 :     unlink(path);
    1553       142003 : }
    1554              : 
    1555              : /*
    1556              :  * Delete an individual SLRU segment, identified by the segment number.
    1557              :  */
    1558              : void
    1559            2 : SlruDeleteSegment(SlruCtl ctl, int64 segno)
    1560              : {
    1561            2 :     SlruShared  shared = ctl->shared;
    1562            2 :     int         prevbank = SlotGetBankNumber(0);
    1563              :     bool        did_write;
    1564              : 
    1565              :     /* Clean out any possibly existing references to the segment. */
    1566            2 :     LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
    1567            2 : restart:
    1568            2 :     did_write = false;
    1569           34 :     for (int slotno = 0; slotno < shared->num_slots; slotno++)
    1570              :     {
    1571              :         int64       pagesegno;
    1572           32 :         int         curbank = SlotGetBankNumber(slotno);
    1573              : 
    1574              :         /*
    1575              :          * If the current bank lock is not the same as the previous bank lock then
    1576              :          * release the previous lock and acquire the new lock.
    1577              :          */
    1578           32 :         if (curbank != prevbank)
    1579              :         {
    1580            0 :             LWLockRelease(&shared->bank_locks[prevbank].lock);
    1581            0 :             LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
    1582            0 :             prevbank = curbank;
    1583              :         }
    1584              : 
    1585           32 :         if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
    1586            0 :             continue;
    1587              : 
    1588           32 :         pagesegno = shared->page_number[slotno] / SLRU_PAGES_PER_SEGMENT;
    1589              :         /* not the segment we're looking for */
    1590           32 :         if (pagesegno != segno)
    1591            7 :             continue;
    1592              : 
    1593              :         /* If page is clean, just change state to EMPTY (expected case). */
    1594           25 :         if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
    1595           25 :             !shared->page_dirty[slotno])
    1596              :         {
    1597           25 :             shared->page_status[slotno] = SLRU_PAGE_EMPTY;
    1598           25 :             continue;
    1599              :         }
    1600              : 
    1601              :         /* Same logic as SimpleLruTruncate() */
    1602            0 :         if (shared->page_status[slotno] == SLRU_PAGE_VALID)
    1603            0 :             SlruInternalWritePage(ctl, slotno, NULL);
    1604              :         else
    1605            0 :             SimpleLruWaitIO(ctl, slotno);
    1606              : 
    1607            0 :         did_write = true;
    1608              :     }
    1609              : 
    1610              :     /*
    1611              :      * Be extra careful and re-check. The IO functions release the bank
    1612              :      * lock, so new pages could have been read in.
    1613              :      */
    1614            2 :     if (did_write)
    1615            0 :         goto restart;
    1616              : 
    1617            2 :     SlruInternalDeleteSegment(ctl, segno);
    1618              : 
    1619            2 :     LWLockRelease(&shared->bank_locks[prevbank].lock);
    1620            2 : }
    1621              : 
    1622              : /*
    1623              :  * Determine whether a segment is okay to delete.
    1624              :  *
    1625              :  * segpage is the first page of the segment, and cutoffPage is the oldest (in
    1626              :  * PagePrecedes order) page in the SLRU containing still-useful data.  Since
    1627              :  * every core PagePrecedes callback implements "wrap around", check the
    1628              :  * segment's first and last pages:
    1629              :  *
    1630              :  * first<cutoff  && last<cutoff:  yes
    1631              :  * first<cutoff  && last>=cutoff: no; cutoff falls inside this segment
    1632              :  * first>=cutoff && last<cutoff:  no; wrap point falls inside this segment
    1633              :  * first>=cutoff && last>=cutoff: no; every page of this segment is too young
    1634              :  */
    1635              : static bool
    1636      1044096 : SlruMayDeleteSegment(SlruCtl ctl, int64 segpage, int64 cutoffPage)
    1637              : {
    1638      1044096 :     int64       seg_last_page = segpage + SLRU_PAGES_PER_SEGMENT - 1;
    1639              : 
    1640              :     Assert(segpage % SLRU_PAGES_PER_SEGMENT == 0);
    1641              : 
    1642      1186803 :     return (ctl->PagePrecedes(segpage, cutoffPage) &&
    1643       142707 :             ctl->PagePrecedes(seg_last_page, cutoffPage));
    1644              : }
    1645              : 
    1646              : #ifdef USE_ASSERT_CHECKING
    1647              : static void
    1648              : SlruPagePrecedesTestOffset(SlruCtl ctl, int per_page, uint32 offset)
    1649              : {
    1650              :     TransactionId lhs,
    1651              :                 rhs;
    1652              :     int64       newestPage,
    1653              :                 oldestPage;
    1654              :     TransactionId newestXact,
    1655              :                 oldestXact;
    1656              : 
    1657              :     /*
    1658              :      * Compare an XID pair having undefined order (see RFC 1982), a pair at
    1659              :      * "opposite ends" of the XID space.  TransactionIdPrecedes() treats each
    1660              :      * as preceding the other.  If RHS is oldestXact, LHS is the first XID we
    1661              :      * must not assign.
    1662              :      */
    1663              :     lhs = per_page + offset;    /* skip first page to avoid non-normal XIDs */
    1664              :     rhs = lhs + (1U << 31);
    1665              :     Assert(TransactionIdPrecedes(lhs, rhs));
    1666              :     Assert(TransactionIdPrecedes(rhs, lhs));
    1667              :     Assert(!TransactionIdPrecedes(lhs - 1, rhs));
    1668              :     Assert(TransactionIdPrecedes(rhs, lhs - 1));
    1669              :     Assert(TransactionIdPrecedes(lhs + 1, rhs));
    1670              :     Assert(!TransactionIdPrecedes(rhs, lhs + 1));
    1671              :     Assert(!TransactionIdFollowsOrEquals(lhs, rhs));
    1672              :     Assert(!TransactionIdFollowsOrEquals(rhs, lhs));
    1673              :     Assert(!ctl->PagePrecedes(lhs / per_page, lhs / per_page));
    1674              :     Assert(!ctl->PagePrecedes(lhs / per_page, rhs / per_page));
    1675              :     Assert(!ctl->PagePrecedes(rhs / per_page, lhs / per_page));
    1676              :     Assert(!ctl->PagePrecedes((lhs - per_page) / per_page, rhs / per_page));
    1677              :     Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 3 * per_page) / per_page));
    1678              :     Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 2 * per_page) / per_page));
    1679              :     Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 1 * per_page) / per_page)
    1680              :            || (1U << 31) % per_page != 0);    /* See CommitTsPagePrecedes() */
    1681              :     Assert(ctl->PagePrecedes((lhs + 1 * per_page) / per_page, rhs / per_page)
    1682              :            || (1U << 31) % per_page != 0);
    1683              :     Assert(ctl->PagePrecedes((lhs + 2 * per_page) / per_page, rhs / per_page));
    1684              :     Assert(ctl->PagePrecedes((lhs + 3 * per_page) / per_page, rhs / per_page));
    1685              :     Assert(!ctl->PagePrecedes(rhs / per_page, (lhs + per_page) / per_page));
    1686              : 
    1687              :     /*
    1688              :      * GetNewTransactionId() has assigned the last XID it can safely use, and
    1689              :      * that XID is in the *LAST* page of the second segment.  We must not
    1690              :      * delete that segment.
    1691              :      */
    1692              :     newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1;
    1693              :     newestXact = newestPage * per_page + offset;
    1694              :     Assert(newestXact / per_page == newestPage);
    1695              :     oldestXact = newestXact + 1;
    1696              :     oldestXact -= 1U << 31;
    1697              :     oldestPage = oldestXact / per_page;
    1698              :     Assert(!SlruMayDeleteSegment(ctl,
    1699              :                                  (newestPage -
    1700              :                                   newestPage % SLRU_PAGES_PER_SEGMENT),
    1701              :                                  oldestPage));
    1702              : 
    1703              :     /*
    1704              :      * GetNewTransactionId() has assigned the last XID it can safely use, and
    1705              :      * that XID is in the *FIRST* page of the second segment.  We must not
    1706              :      * delete that segment.
    1707              :      */
    1708              :     newestPage = SLRU_PAGES_PER_SEGMENT;
    1709              :     newestXact = newestPage * per_page + offset;
    1710              :     Assert(newestXact / per_page == newestPage);
    1711              :     oldestXact = newestXact + 1;
    1712              :     oldestXact -= 1U << 31;
    1713              :     oldestPage = oldestXact / per_page;
    1714              :     Assert(!SlruMayDeleteSegment(ctl,
    1715              :                                  (newestPage -
    1716              :                                   newestPage % SLRU_PAGES_PER_SEGMENT),
    1717              :                                  oldestPage));
    1718              : }
    1719              : 
    1720              : /*
    1721              :  * Unit-test a PagePrecedes function.
    1722              :  *
    1723              :  * This assumes every uint32 >= FirstNormalTransactionId is a valid key.  It
    1724              :  * assumes each value occupies a contiguous, fixed-size region of SLRU bytes.
    1725              :  * (MultiXactMemberCtl separates flags from XIDs.  NotifyCtl has
    1726              :  * variable-length entries, no keys, and no random access.  These unit tests
    1727              :  * do not apply to them.)
    1728              :  */
    1729              : void
    1730              : SlruPagePrecedesUnitTests(SlruCtl ctl, int per_page)
    1731              : {
    1732              :     /* Test first, middle and last entries of a page. */
    1733              :     SlruPagePrecedesTestOffset(ctl, per_page, 0);
    1734              :     SlruPagePrecedesTestOffset(ctl, per_page, per_page / 2);
    1735              :     SlruPagePrecedesTestOffset(ctl, per_page, per_page - 1);
    1736              : }
    1737              : #endif
    1738              : 
    1739              : /*
    1740              :  * SlruScanDirectory callback
    1741              :  *      This callback reports true if there's any segment wholly prior to the
    1742              :  *      one containing the page passed as "data".
    1743              :  */
    1744              : bool
    1745       834300 : SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, int64 segpage,
    1746              :                             void *data)
    1747              : {
    1748       834300 :     int64       cutoffPage = *(int64 *) data;
    1749              : 
    1750       834300 :     if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
    1751          101 :         return true;            /* found one; don't iterate any more */
    1752              : 
    1753       834199 :     return false;               /* keep going */
    1754              : }
    1755              : 
    1756              : /*
    1757              :  * SlruScanDirectory callback.
    1758              :  *      This callback deletes segments wholly prior to the page passed as "data".
    1759              :  */
    1760              : static bool
    1761       209796 : SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int64 segpage,
    1762              :                           void *data)
    1763              : {
    1764       209796 :     int64       cutoffPage = *(int64 *) data;
    1765              : 
    1766       209796 :     if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
    1767       141993 :         SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT);
    1768              : 
    1769       209796 :     return false;               /* keep going */
    1770              : }
    1771              : 
    1772              : /*
    1773              :  * SlruScanDirectory callback.
    1774              :  *      This callback deletes all segments.
    1775              :  */
    1776              : bool
    1777            8 : SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int64 segpage, void *data)
    1778              : {
    1779            8 :     SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT);
    1780              : 
    1781            8 :     return false;               /* keep going */
    1782              : }
    1783              : 
    1784              : /*
    1785              :  * An internal function used by SlruScanDirectory().
    1786              :  *
    1787              :  * Returns true if a file with a name of a given length may be a correct
    1788              :  * SLRU segment.
    1789              :  */
    1790              : static inline bool
    1791      1056318 : SlruCorrectSegmentFilenameLength(SlruCtl ctl, size_t len)
    1792              : {
    1793      1056318 :     if (ctl->long_segment_names)
    1794         2375 :         return (len == 15);     /* see SlruFileName() */
    1795              :     else
    1796              : 
    1797              :         /*
    1798              :          * Commit 638cf09e76d allowed 5-character lengths. Later commit
    1799              :          * 73c986adde5 allowed 6-character lengths.
    1800              :          *
    1801              :          * Note: There is an ongoing plan to migrate all SLRUs to 64-bit page
    1802              :          * numbers, and the corresponding 15-character file names, which may
    1803              :          * eventually deprecate the support for 4, 5, and 6-character names.
    1804              :          */
    1805      1053943 :         return (len == 4 || len == 5 || len == 6);
    1806              : }
    1807              : 
    1808              : /*
    1809              :  * Scan the SimpleLru directory and apply a callback to each file found in it.
    1810              :  *
    1811              :  * If the callback returns true, the scan is stopped.  The last return value
    1812              :  * from the callback is returned.
    1813              :  *
    1814              :  * The callback receives the following arguments: 1. the SlruCtl struct for the
    1815              :  * slru being truncated; 2. the filename being considered; 3. the page number
    1816              :  * for the first page of that file; 4. a pointer to the opaque data given to us
    1817              :  * by the caller.
    1818              :  *
    1819              :  * Note that the ordering in which the directory is scanned is not guaranteed.
    1820              :  *
    1821              :  * Note that no locking is applied.
    1822              :  */
    1823              : bool
    1824         6174 : SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
    1825              : {
    1826         6174 :     bool        retval = false;
    1827              :     DIR        *cldir;
    1828              :     struct dirent *clde;
    1829              :     int64       segno;
    1830              :     int64       segpage;
    1831              : 
    1832         6174 :     cldir = AllocateDir(ctl->Dir);
    1833      1062391 :     while ((clde = ReadDir(cldir, ctl->Dir)) != NULL)
    1834              :     {
    1835              :         size_t      len;
    1836              : 
    1837      1056318 :         len = strlen(clde->d_name);
    1838              : 
    1839      1056318 :         if (SlruCorrectSegmentFilenameLength(ctl, len) &&
    1840      1044104 :             strspn(clde->d_name, "0123456789ABCDEF") == len)
    1841              :         {
    1842      1044104 :             segno = strtoi64(clde->d_name, NULL, 16);
    1843      1044104 :             segpage = segno * SLRU_PAGES_PER_SEGMENT;
    1844              : 
    1845      1044104 :             elog(DEBUG2, "SlruScanDirectory invoking callback on %s/%s",
    1846              :                  ctl->Dir, clde->d_name);
    1847      1044104 :             retval = callback(ctl, clde->d_name, segpage, data);
    1848      1044104 :             if (retval)
    1849          101 :                 break;
    1850              :         }
    1851              :     }
    1852         6174 :     FreeDir(cldir);
    1853              : 
    1854         6174 :     return retval;
    1855              : }
    1856              : 
    1857              : /*
    1858              :  * Individual SLRUs (clog, ...) have to provide a sync.c handler function so
    1859              :  * that they can provide the correct "SlruCtl" (otherwise we don't know how to
    1860              :  * build the path), but they just forward to this common implementation that
    1861              :  * performs the fsync.
    1862              :  */
    1863              : int
    1864            2 : SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path)
    1865              : {
    1866              :     int         fd;
    1867              :     int         save_errno;
    1868              :     int         result;
    1869              : 
    1870            2 :     SlruFileName(ctl, path, ftag->segno);
    1871              : 
    1872            2 :     fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
    1873            2 :     if (fd < 0)
    1874            0 :         return -1;
    1875              : 
    1876            2 :     pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC);
    1877            2 :     result = pg_fsync(fd);
    1878            2 :     pgstat_report_wait_end();
    1879            2 :     save_errno = errno;
    1880              : 
    1881            2 :     CloseTransientFile(fd);
    1882              : 
    1883            2 :     errno = save_errno;
    1884            2 :     return result;
    1885              : }
        

Generated by: LCOV version 2.0-1