LCOV - code coverage report
Current view: top level - src/backend/access/transam - slru.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 81.1 % 557 452
Test Date: 2026-04-16 15:16:24 Functions: 96.9 % 32 31
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * slru.c
       4              :  *      Simple LRU buffering for wrap-around-able permanent metadata
       5              :  *
       6              :  * This module is used to maintain various pieces of transaction status
       7              :  * indexed by TransactionId (such as commit status, parent transaction ID,
       8              :  * commit timestamp), as well as storage for multixacts, serializable
       9              :  * isolation locks and NOTIFY traffic.  Extensions can define their own
      10              :  * SLRUs, too.
      11              :  *
      12              :  * Under ordinary circumstances we expect that write traffic will occur
      13              :  * mostly to the latest page (and to the just-prior page, soon after a
      14              :  * page transition).  Read traffic will probably touch a larger span of
      15              :  * pages, but a relatively small number of buffers should be sufficient.
      16              :  *
      17              :  * We use a simple least-recently-used scheme to manage a pool of shared
      18              :  * page buffers, split in banks by the lowest bits of the page number, and
      19              :  * the management algorithm only processes the bank to which the desired
      20              :  * page belongs, so a linear search is sufficient; there's no need for a
      21              :  * hashtable or anything fancy.  The algorithm is straight LRU except that
      22              :  * we will never swap out the latest page (since we know it's going to be
      23              :  * hit again eventually).
      24              :  *
      25              :  * We use per-bank control LWLocks to protect the shared data structures,
      26              :  * plus per-buffer LWLocks that synchronize I/O for each buffer.  The
      27              :  * bank's control lock must be held to examine or modify any of the bank's
      28              :  * shared state.  A process that is reading in or writing out a page
      29              :  * buffer does not hold the control lock, only the per-buffer lock for the
      30              :  * buffer it is working on.  One exception is latest_page_number, which is
      31              :  * read and written using atomic ops.
      32              :  *
      33              :  * "Holding the bank control lock" means exclusive lock in all cases
      34              :  * except for SimpleLruReadPage_ReadOnly(); see comments for
      35              :  * SlruRecentlyUsed() for the implications of that.
      36              :  *
      37              :  * When initiating I/O on a buffer, we acquire the per-buffer lock exclusively
      38              :  * before releasing the control lock.  The per-buffer lock is released after
      39              :  * completing the I/O, re-acquiring the control lock, and updating the shared
      40              :  * state.  (Deadlock is not possible here, because we never try to initiate
      41              :  * I/O when someone else is already doing I/O on the same buffer.)
      42              :  * To wait for I/O to complete, release the control lock, acquire the
      43              :  * per-buffer lock in shared mode, immediately release the per-buffer lock,
      44              :  * reacquire the control lock, and then recheck state (since arbitrary things
      45              :  * could have happened while we didn't have the lock).
      46              :  *
      47              :  * As with the regular buffer manager, it is possible for another process
      48              :  * to re-dirty a page that is currently being written out.  This is handled
      49              :  * by re-setting the page's page_dirty flag.
      50              :  *
      51              :  *
      52              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      53              :  * Portions Copyright (c) 1994, Regents of the University of California
      54              :  *
      55              :  * src/backend/access/transam/slru.c
      56              :  *
      57              :  *-------------------------------------------------------------------------
      58              :  */
      59              : #include "postgres.h"
      60              : 
      61              : #include <fcntl.h>
      62              : #include <sys/stat.h>
      63              : #include <unistd.h>
      64              : 
      65              : #include "access/slru.h"
      66              : #include "access/transam.h"
      67              : #include "access/xlog.h"
      68              : #include "access/xlogutils.h"
      69              : #include "miscadmin.h"
      70              : #include "pgstat.h"
      71              : #include "storage/fd.h"
      72              : #include "storage/shmem.h"
      73              : #include "storage/shmem_internal.h"
      74              : #include "utils/guc.h"
      75              : #include "utils/memutils.h"
      76              : #include "utils/wait_event.h"
      77              : 
      78              : /*
      79              :  * Converts segment number to the filename of the segment, stored in "path".
      80              :  *
      81              :  * "path" should point to a buffer at least MAXPGPATH characters long.  The value returned is snprintf()'s result.
      82              :  *
      83              :  * If ctl->options.long_segment_names is true, segno can be in the range [0, 2^60-1].
      84              :  * The resulting file name is made of 15 characters, e.g. dir/123456789ABCDEF.
      85              :  *
      86              :  * If ctl->options.long_segment_names is false, segno can be in the range [0, 2^24-1].
      87              :  * The resulting file name is made of 4 to 6 characters, as of:
      88              :  *
      89              :  *  dir/1234   for [0, 2^16-1]
      90              :  *  dir/12345  for [2^16, 2^20-1]
      91              :  *  dir/123456 for [2^20, 2^24-1]
      92              :  */
      93              : static inline int
      94      7506235 : SlruFileName(SlruDesc *ctl, char *path, int64 segno)
      95              : {
      96      7506235 :     if (ctl->options.long_segment_names)
      97              :     {
      98              :         /*
      99              :          * We could use 16 characters here but the disadvantage would be that
     100              :          * the SLRU segments will be hard to distinguish from WAL segments.
     101              :          *
     102              :          * For this reason we use 15 characters. It is enough but also means
     103              :          * that in the future we can't decrease SLRU_PAGES_PER_SEGMENT easily.
     104              :          */
     105              :         Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFFFFFFFFFFF));
     106        16622 :         return snprintf(path, MAXPGPATH, "%s/%015" PRIX64, ctl->options.Dir, segno);
     107              :     }
     108              :     else
     109              :     {
     110              :         /*
     111              :          * Although the %04X format string is used, segment numbers of up to
     112              :          * 24 bits are allowed.  See SlruCorrectSegmentFilenameLength()
     113              :          */
     114              :         Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFF));
     115      7489613 :         return snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->options.Dir,
     116              :                         (unsigned int) segno);
     117              :     }
     118              : }
     119              : 
     120              : /*
     121              :  * During SimpleLruWriteAll(), we will usually not need to write more than one
     122              :  * or two physical files, but we may need to write several pages per file.  We
     123              :  * can consolidate the I/O requests by leaving files open until control returns
     124              :  * to SimpleLruWriteAll().  This data structure remembers which files are open.
     125              :  */
     126              : #define MAX_WRITEALL_BUFFERS    16  /* capacity of fd[] and segno[] below */
     127              : 
     128              : typedef struct SlruWriteAllData
     129              : {
     130              :     int         num_files;      /* # files actually open */
     131              :     int         fd[MAX_WRITEALL_BUFFERS];   /* their FD's (file descriptors) */
     132              :     int64       segno[MAX_WRITEALL_BUFFERS];    /* their log seg#s */
     133              : } SlruWriteAllData;
     134              : 
     135              : typedef struct SlruWriteAllData *SlruWriteAll;  /* pointer form; the "fdata" argument of the write routines */
     136              : 
     137              : 
     138              : /*
     139              :  * Bank size for the slot array.  Pages are assigned a bank according to their
     140              :  * page number, with each bank being this size.  We want a power of 2 so that
     141              :  * we can determine the bank number for a page with just bit shifting; we also
     142              :  * want to keep the bank size small so that LRU victim search is fast.  16
     143              :  * buffers per bank seems a good number.
     144              :  */
     145              : #define SLRU_BANK_BITSHIFT      4   /* log2 of SLRU_BANK_SIZE */
     146              : #define SLRU_BANK_SIZE          (1 << SLRU_BANK_BITSHIFT)   /* = 16 slots per bank */
     147              : 
     148              : /*
     149              :  * Macro to get the bank number to which the slot belongs (slotno / SLRU_BANK_SIZE, done as a right shift).
     150              :  */
     151              : #define SlotGetBankNumber(slotno)   ((slotno) >> SLRU_BANK_BITSHIFT)
     152              : 
     153              : 
     154              : /*
     155              :  * Populate a file tag describing a segment file: the whole FileTag is zeroed, then handler and segno are set.  We only use the segment
     156              :  * number, since we can derive everything else we need by having separate
     157              :  * sync handler functions for clog, multixact etc.  "a" must be a FileTag lvalue, since sizeof(FileTag) bytes of it are cleared.
     158              :  */
     159              : #define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \
     160              : ( \
     161              :     memset(&(a), 0, sizeof(FileTag)), \
     162              :     (a).handler = (xx_handler), \
     163              :     (a).segno = (xx_segno) \
     164              : )
     165              : 
     166              : /* Saved info for SlruReportIOError: which kind of operation failed, plus an errno value */
     167              : typedef enum
     168              : {
     169              :     SLRU_OPEN_FAILED,
     170              :     SLRU_SEEK_FAILED,
     171              :     SLRU_READ_FAILED,
     172              :     SLRU_WRITE_FAILED,
     173              :     SLRU_FSYNC_FAILED,
     174              :     SLRU_CLOSE_FAILED,
     175              : } SlruErrorCause;
     176              : 
     177              : static SlruErrorCause slru_errcause;
     178              : static int  slru_errno;     /* presumably errno saved at the point of failure; it is set outside this excerpt -- confirm */
     179              : 
     180              : 
     181              : static void SimpleLruZeroLSNs(SlruDesc *ctl, int slotno);   /* zero the LSNs stored for a slot */
     182              : static void SimpleLruWaitIO(SlruDesc *ctl, int slotno); /* wait for active I/O on a slot to finish */
     183              : static void SlruInternalWritePage(SlruDesc *ctl, int slotno, SlruWriteAll fdata);
     184              : static bool SlruPhysicalReadPage(SlruDesc *ctl, int64 pageno, int slotno);
     185              : static bool SlruPhysicalWritePage(SlruDesc *ctl, int64 pageno, int slotno,
     186              :                                   SlruWriteAll fdata);
     187              : static void SlruReportIOError(SlruDesc *ctl, int64 pageno,
     188              :                               const void *opaque_data);
     189              : static int  SlruSelectLRUPage(SlruDesc *ctl, int64 pageno);
     190              : 
     191              : static bool SlruScanDirCbDeleteCutoff(SlruDesc *ctl, char *filename,
     192              :                                       int64 segpage, void *data);
     193              : static void SlruInternalDeleteSegment(SlruDesc *ctl, int64 segno);
     194              : static inline void SlruRecentlyUsed(SlruShared shared, int slotno);
     195              : 
     196              : 
     197              : /*
     198              :  * Initialization of shared memory
     199              :  */
     200              : /* Compute the bytes of shared memory needed for an SLRU with nslots page buffers and nlsns LSN groups per page. */
     201              : static Size
     202         8670 : SimpleLruShmemSize(int nslots, int nlsns)
     203              : {
     204         8670 :     int         nbanks = nslots / SLRU_BANK_SIZE;
     205              :     Size        sz;
     206              : 
     207              :     Assert(nslots <= SLRU_MAX_ALLOWED_BUFFERS);
     208              :     Assert(nslots % SLRU_BANK_SIZE == 0);
     209              : 
     210              :     /* we assume nslots isn't so large as to risk overflow; the pieces are summed in the order shmem_slru_init() lays them out */
     211         8670 :     sz = MAXALIGN(sizeof(SlruSharedData));
     212         8670 :     sz += MAXALIGN(nslots * sizeof(char *));    /* page_buffer[] */
     213         8670 :     sz += MAXALIGN(nslots * sizeof(SlruPageStatus));    /* page_status[] */
     214         8670 :     sz += MAXALIGN(nslots * sizeof(bool));  /* page_dirty[] */
     215         8670 :     sz += MAXALIGN(nslots * sizeof(int64)); /* page_number[] */
     216         8670 :     sz += MAXALIGN(nslots * sizeof(int));   /* page_lru_count[] */
     217         8670 :     sz += MAXALIGN(nslots * sizeof(LWLockPadded));  /* buffer_locks[] */
     218         8670 :     sz += MAXALIGN(nbanks * sizeof(LWLockPadded));  /* bank_locks[] */
     219         8670 :     sz += MAXALIGN(nbanks * sizeof(int));   /* bank_cur_lru_count[] */
     220              : 
     221         8670 :     if (nlsns > 0)
     222         1238 :         sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));    /* group_lsn[] */
     223              : 
     224         8670 :     return BUFFERALIGN(sz) + BLCKSZ * nslots;   /* one BLCKSZ page per slot, placed after the buffer-aligned arrays */
     225              : }
     226              : 
     227              : /*
     228              :  * Determine a number of SLRU buffers to use.
     229              :  *
     230              :  * We simply divide shared_buffers (NBuffers) by the divisor given and cap
     231              :  * that at the maximum given; but always at least SLRU_BANK_SIZE.
     232              :  * Both the quotient and the cap are rounded down to a multiple of SLRU_BANK_SIZE; divisor must be nonzero.
     233              :  */
     234              : int
     235         3692 : SimpleLruAutotuneBuffers(int divisor, int max)
     236              : {
     237         3692 :     return Min(max - (max % SLRU_BANK_SIZE),
     238              :                Max(SLRU_BANK_SIZE,
     239              :                    NBuffers / divisor - (NBuffers / divisor) % SLRU_BANK_SIZE));
     240              : }
     241              : 
     242              : /*
     243              :  * Register a simple LRU cache in shared memory: *options is copied into TopMemoryContext and the copy is handed to ShmemRequestInternal().
     244              :  */
     245              : void
     246         8670 : SimpleLruRequestWithOpts(const SlruOpts *options)
     247              : {
     248              :     SlruOpts   *options_copy;
     249              : 
     250              :     Assert(options->name != NULL);
     251              :     Assert(options->nslots > 0);
     252              :     Assert(options->PagePrecedes != NULL);
     253              :     Assert(options->errdetail_for_io_error != NULL);
     254              : 
     255         8670 :     options_copy = MemoryContextAlloc(TopMemoryContext,
     256              :                                       sizeof(SlruOpts));
     257         8670 :     memcpy(options_copy, options, sizeof(SlruOpts));
     258              : 
     259         8670 :     options_copy->base.name = options->name;    /* the shmem request carries the SLRU's name */
     260         8670 :     options_copy->base.size = SimpleLruShmemSize(options_copy->nslots, options_copy->nlsns);    /* ... and its computed size */
     261              : 
     262         8670 :     ShmemRequestInternal(&options_copy->base, SHMEM_KIND_SLRU);
     263         8670 : }
     264              : 
     265              : /* Initialize locks and shared memory area: fill in *desc, lay out the arrays within "location", and mark every slot empty */
     266              : void
     267         8649 : shmem_slru_init(void *location, ShmemStructOpts *base_options)
     268              : {
     269         8649 :     SlruOpts   *options = (SlruOpts *) base_options;
     270         8649 :     SlruDesc   *desc = (SlruDesc *) options->desc;
     271              :     char        namebuf[NAMEDATALEN];
     272              :     SlruShared  shared;
     273         8649 :     int         nslots = options->nslots;
     274         8649 :     int         nbanks = nslots / SLRU_BANK_SIZE;
     275         8649 :     int         nlsns = options->nlsns;
     276              :     char       *ptr;
     277              :     Size        offset;
     278              : 
     279         8649 :     shared = (SlruShared) location;
     280         8649 :     desc->shared = shared;
     281         8649 :     desc->nbanks = nbanks;
     282         8649 :     memcpy(&desc->options, options, sizeof(SlruOpts));
     283              : 
     284              :     /* assign new tranche IDs, if not given (zero means "not given") */
     285         8649 :     if (desc->options.buffer_tranche_id == 0)
     286              :     {
     287            4 :         snprintf(namebuf, sizeof(namebuf), "%s buffer", desc->options.name);
     288            4 :         desc->options.buffer_tranche_id = LWLockNewTrancheId(namebuf);
     289              :     }
     290         8649 :     if (desc->options.bank_tranche_id == 0)
     291              :     {
     292            4 :         snprintf(namebuf, sizeof(namebuf), "%s bank", desc->options.name);
     293            4 :         desc->options.bank_tranche_id = LWLockNewTrancheId(namebuf);
     294              :     }
     295              : 
     296              :     Assert(nslots <= SLRU_MAX_ALLOWED_BUFFERS);
     297              : 
     298         8649 :     memset(shared, 0, sizeof(SlruSharedData));  /* clears only the fixed-size struct, not the arrays behind it */
     299              : 
     300         8649 :     shared->num_slots = nslots;
     301         8649 :     shared->lsn_groups_per_page = nlsns;
     302              : 
     303         8649 :     pg_atomic_init_u64(&shared->latest_page_number, 0);
     304              : 
     305         8649 :     shared->slru_stats_idx = pgstat_get_slru_index(desc->options.name);
     306              : 
     307         8649 :     ptr = (char *) shared;  /* the arrays follow the struct, each MAXALIGN'ed, in the same order SimpleLruShmemSize() sums them */
     308         8649 :     offset = MAXALIGN(sizeof(SlruSharedData));
     309         8649 :     shared->page_buffer = (char **) (ptr + offset);
     310         8649 :     offset += MAXALIGN(nslots * sizeof(char *));
     311         8649 :     shared->page_status = (SlruPageStatus *) (ptr + offset);
     312         8649 :     offset += MAXALIGN(nslots * sizeof(SlruPageStatus));
     313         8649 :     shared->page_dirty = (bool *) (ptr + offset);
     314         8649 :     offset += MAXALIGN(nslots * sizeof(bool));
     315         8649 :     shared->page_number = (int64 *) (ptr + offset);
     316         8649 :     offset += MAXALIGN(nslots * sizeof(int64));
     317         8649 :     shared->page_lru_count = (int *) (ptr + offset);
     318         8649 :     offset += MAXALIGN(nslots * sizeof(int));
     319              : 
     320              :     /* Carve out the LWLock arrays and per-bank LRU counters (the locks themselves are initialized in the loops below) */
     321         8649 :     shared->buffer_locks = (LWLockPadded *) (ptr + offset);
     322         8649 :     offset += MAXALIGN(nslots * sizeof(LWLockPadded));
     323         8649 :     shared->bank_locks = (LWLockPadded *) (ptr + offset);
     324         8649 :     offset += MAXALIGN(nbanks * sizeof(LWLockPadded));
     325         8649 :     shared->bank_cur_lru_count = (int *) (ptr + offset);
     326         8649 :     offset += MAXALIGN(nbanks * sizeof(int));
     327              : 
     328         8649 :     if (nlsns > 0)
     329              :     {
     330         1235 :         shared->group_lsn = (XLogRecPtr *) (ptr + offset);
     331         1235 :         offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
     332              :     }
     333              : 
     334         8649 :     ptr += BUFFERALIGN(offset); /* page buffers start at the buffer-aligned end of the arrays */
     335       221129 :     for (int slotno = 0; slotno < nslots; slotno++)
     336              :     {
     337       212480 :         LWLockInitialize(&shared->buffer_locks[slotno].lock,
     338              :                          desc->options.buffer_tranche_id);
     339              : 
     340       212480 :         shared->page_buffer[slotno] = ptr;
     341       212480 :         shared->page_status[slotno] = SLRU_PAGE_EMPTY;
     342       212480 :         shared->page_dirty[slotno] = false;
     343       212480 :         shared->page_lru_count[slotno] = 0;
     344       212480 :         ptr += BLCKSZ;
     345              :     }
     346              : 
     347              :     /* Initialize the slot banks. */
     348        21929 :     for (int bankno = 0; bankno < nbanks; bankno++)
     349              :     {
     350        13280 :         LWLockInitialize(&shared->bank_locks[bankno].lock, desc->options.bank_tranche_id);
     351        13280 :         shared->bank_cur_lru_count[bankno] = 0;
     352              :     }
     353              : 
     354              :     /* The layout built above should fit within the estimated shmem size */
     355              :     Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
     356         8649 : }
     357              : 
     358              : void
     359            0 : shmem_slru_attach(void *location, ShmemStructOpts *base_options)    /* fills in *desc only; writes nothing to shared memory */
     360              : {
     361            0 :     SlruOpts   *options = (SlruOpts *) base_options;
     362            0 :     SlruDesc   *desc = (SlruDesc *) options->desc;
     363            0 :     int         nslots = options->nslots;
     364            0 :     int         nbanks = nslots / SLRU_BANK_SIZE;
     365              : 
     366            0 :     desc->shared = (SlruShared) location;   /* just point at the shared area */
     367            0 :     desc->nbanks = nbanks;
     368            0 :     memcpy(&desc->options, options, sizeof(SlruOpts));  /* same local copy of the options that shmem_slru_init() makes */
     369            0 : }
     370              : 
     371              : 
     372              : /*
     373              :  * Helper function for GUC check_hook to check whether slru buffers are in
     374              :  * multiples of SLRU_BANK_SIZE.  Returns true if so; otherwise sets a GUC errdetail naming the parameter and returns false.
     375              :  */
     376              : bool
     377        12669 : check_slru_buffers(const char *name, int *newval)
     378              : {
     379              :     /* Valid values are multiples of SLRU_BANK_SIZE (zero passes this test too) */
     380        12669 :     if (*newval % SLRU_BANK_SIZE == 0)
     381        12669 :         return true;
     382              : 
     383            0 :     GUC_check_errdetail("\"%s\" must be a multiple of %d.", name,
     384              :                         SLRU_BANK_SIZE);
     385            0 :     return false;
     386              : }
     387              : 
     388              : /*
     389              :  * Initialize (or reinitialize) a page to zeroes.
     390              :  *
     391              :  * The page is not actually written, just set up in shared memory and marked dirty.
     392              :  * The slot number of the new page is returned.
     393              :  *
     394              :  * Bank lock must be held (exclusively) at entry, and will be held at exit.
     395              :  */
     396              : int
     397      7346014 : SimpleLruZeroPage(SlruDesc *ctl, int64 pageno)
     398              : {
     399      7346014 :     SlruShared  shared = ctl->shared;
     400              :     int         slotno;
     401              : 
     402              :     Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(ctl, pageno), LW_EXCLUSIVE));
     403              : 
     404              :     /* Find a suitable buffer slot for the page */
     405      7346014 :     slotno = SlruSelectLRUPage(ctl, pageno);
     406              :     Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
     407              :            (shared->page_status[slotno] == SLRU_PAGE_VALID &&
     408              :             !shared->page_dirty[slotno]) ||
     409              :            shared->page_number[slotno] == pageno);
     410              : 
     411              :     /* Mark the slot as containing this page */
     412      7346014 :     shared->page_number[slotno] = pageno;
     413      7346014 :     shared->page_status[slotno] = SLRU_PAGE_VALID;
     414      7346014 :     shared->page_dirty[slotno] = true;  /* the zeroed contents exist only in memory so far */
     415      7346014 :     SlruRecentlyUsed(shared, slotno);
     416              : 
     417              :     /* Set the buffer to zeroes */
     418      7346014 :     MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
     419              : 
     420              :     /* Set the LSNs for this new page to zero */
     421      7346014 :     SimpleLruZeroLSNs(ctl, slotno);
     422              : 
     423              :     /*
     424              :      * Assume this page is now the latest active page.
     425              :      *
     426              :      * Note that because both this routine and SlruSelectLRUPage run with an
     427              :      * SLRU bank lock held, it is not possible for this to be zeroing a page
     428              :      * that SlruSelectLRUPage is going to evict simultaneously.  Therefore,
     429              :      * there's no memory barrier here.
     430              :      */
     431      7346014 :     pg_atomic_write_u64(&shared->latest_page_number, pageno);
     432              : 
     433              :     /* update the stats counter of zeroed pages */
     434      7346014 :     pgstat_count_slru_blocks_zeroed(shared->slru_stats_idx);
     435              : 
     436      7346014 :     return slotno;
     437              : }
     438              : 
     439              : /*
     440              :  * Zero all the LSNs we store for this slru page (a no-op when lsn_groups_per_page is zero).
     441              :  *
     442              :  * This should be called each time we create a new page, and each time we read
     443              :  * in a page from disk into an existing buffer.  (Such an old page cannot
     444              :  * have any interesting LSNs, since we'd have flushed them before writing
     445              :  * the page in the first place.)
     446              :  *
     447              :  * This assumes that InvalidXLogRecPtr is bitwise-all-0.
     448              :  */
     449              : static void
     450      7363644 : SimpleLruZeroLSNs(SlruDesc *ctl, int slotno)
     451              : {
     452      7363644 :     SlruShared  shared = ctl->shared;
     453              : 
     454      7363644 :     if (shared->lsn_groups_per_page > 0)    /* shmem_slru_init() sets up group_lsn[] only when nlsns > 0 */
     455       433306 :         MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0,
     456              :                shared->lsn_groups_per_page * sizeof(XLogRecPtr));
     457      7363644 : }
     458              : 
     459              : /*
     460              :  * This is a convenience wrapper for the common case of zeroing a page and
     461              :  * immediately flushing it to disk.
     462              :  *
     463              :  * SLRU bank lock is acquired (exclusively) and released here, so the caller must not already hold it.
     464              :  */
     465              : void
     466          242 : SimpleLruZeroAndWritePage(SlruDesc *ctl, int64 pageno)
     467              : {
     468              :     int         slotno;
     469              :     LWLock     *lock;
     470              : 
     471          242 :     lock = SimpleLruGetBankLock(ctl, pageno);
     472          242 :     LWLockAcquire(lock, LW_EXCLUSIVE);
     473              : 
     474              :     /* Create and zero the page */
     475          242 :     slotno = SimpleLruZeroPage(ctl, pageno);
     476              : 
     477              :     /* Make sure it's written out */
     478          242 :     SimpleLruWritePage(ctl, slotno);
     479              :     Assert(!ctl->shared->page_dirty[slotno]);   /* the write is expected to have cleared the dirty flag */
     480              : 
     481          242 :     LWLockRelease(lock);
     482          242 : }
     483              : 
     484              : /*
     485              :  * Wait for any active I/O on a page slot to finish.  (This does not
     486              :  * guarantee that new I/O hasn't been started before we return, though.
     487              :  * In fact the slot might not even contain the same page anymore.)
     488              :  *
     489              :  * Bank lock must be held at entry, and will be held at exit; it is released while waiting and re-taken in exclusive mode.
     490              :  */
     491              : static void
     492            2 : SimpleLruWaitIO(SlruDesc *ctl, int slotno)
     493              : {
     494            2 :     SlruShared  shared = ctl->shared;
     495            2 :     int         bankno = SlotGetBankNumber(slotno);
     496              : 
     497              :     Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
     498              : 
     499              :     /* See notes at top of file */
     500            2 :     LWLockRelease(&shared->bank_locks[bankno].lock);
     501            2 :     LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED);   /* sleeps until the process doing the I/O releases it */
     502            2 :     LWLockRelease(&shared->buffer_locks[slotno].lock);
     503            2 :     LWLockAcquire(&shared->bank_locks[bankno].lock, LW_EXCLUSIVE);
     504              : 
     505              :     /*
     506              :      * If the slot is still in an io-in-progress state, then either someone
     507              :      * already started a new I/O on the slot, or a previous I/O failed and
     508              :      * neglected to reset the page state.  That shouldn't happen, really, but
     509              :      * it seems worth a few extra cycles to check and recover from it. We can
     510              :      * cheaply test for failure by seeing if the buffer lock is still held (we
     511              :      * assume that transaction abort would release the lock).
     512              :      */
     513            2 :     if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
     514            2 :         shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS)
     515              :     {
     516            0 :         if (LWLockConditionalAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED))
     517              :         {
     518              :             /* indeed, the I/O must have failed */
     519            0 :             if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS)
     520            0 :                 shared->page_status[slotno] = SLRU_PAGE_EMPTY;
     521              :             else                /* write_in_progress */
     522              :             {
     523            0 :                 shared->page_status[slotno] = SLRU_PAGE_VALID;
     524            0 :                 shared->page_dirty[slotno] = true;  /* failed write: keep the page marked dirty */
     525              :             }
     526            0 :             LWLockRelease(&shared->buffer_locks[slotno].lock);
     527              :         }
     528              :     }
     529            2 : }
     530              : 
     531              : /*
     532              :  * Find a page in a shared buffer, reading it in if necessary.
     533              :  * The page number must correspond to an already-initialized page.
     534              :  *
     535              :  * If write_ok is true then it is OK to return a page that is in
     536              :  * WRITE_IN_PROGRESS state; it is the caller's responsibility to be sure
     537              :  * that modification of the page is safe.  If write_ok is false then we
     538              :  * will not return the page until it is no longer undergoing active I/O.
     539              :  *
     540              :  * On error, the passed-in 'opaque_data' is passed to the
     541              :  * 'errdetail_for_io_error' callback, to provide details on the operation that
     542              :  * failed.  It is only used for error reporting.
     543              :  *
     544              :  * Return value is the shared-buffer slot number now holding the page.
     545              :  * The buffer's LRU access info is updated.
     546              :  *
     547              :  * The correct bank lock must be held exclusively at entry, and will be held at exit (it is released while the physical read is in progress).
     548              :  */
     549              : int
     550       396634 : SimpleLruReadPage(SlruDesc *ctl, int64 pageno, bool write_ok,
     551              :                   const void *opaque_data)
     552              : {
     553       396634 :     SlruShared  shared = ctl->shared;
     554       396634 :     LWLock     *banklock = SimpleLruGetBankLock(ctl, pageno);
     555              : 
     556              :     Assert(LWLockHeldByMeInMode(banklock, LW_EXCLUSIVE));
     557              : 
     558              :     /* Outer loop handles restart if we must wait for someone else's I/O */
     559              :     for (;;)
     560            1 :     {
     561              :         int         slotno;
     562              :         bool        ok;
     563              : 
     564              :         /* See if page already is in memory; if not, pick victim slot */
     565       396635 :         slotno = SlruSelectLRUPage(ctl, pageno);
     566              : 
     567              :         /* Did we find the page in memory? */
     568       396635 :         if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
     569       395503 :             shared->page_number[slotno] == pageno)
     570              :         {
     571              :             /*
     572              :              * If page is still being read in, we must wait for I/O.  Likewise
     573              :              * if the page is being written and the caller said that's not OK.
     574              :              */
     575       379005 :             if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
     576       379005 :                 (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
     577            2 :                  !write_ok))
     578              :             {
     579            1 :                 SimpleLruWaitIO(ctl, slotno);
     580              :                 /* Now we must recheck state from the top */
     581            1 :                 continue;
     582              :             }
     583              :             /* Otherwise, it's ready to use */
     584       379004 :             SlruRecentlyUsed(shared, slotno);
     585              : 
     586              :             /* update the stats counter of pages found in the SLRU */
     587       379004 :             pgstat_count_slru_blocks_hit(shared->slru_stats_idx);
     588              : 
     589       379004 :             return slotno;
     590              :         }
     591              : 
     592              :         /* We found no match; assert we selected a freeable slot (empty, or valid and not dirty) */
     593              :         Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
     594              :                (shared->page_status[slotno] == SLRU_PAGE_VALID &&
     595              :                 !shared->page_dirty[slotno]));
     596              : 
     597              :         /* Mark the slot read-busy */
     598        17630 :         shared->page_number[slotno] = pageno;
     599        17630 :         shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS;
     600        17630 :         shared->page_dirty[slotno] = false;
     601              : 
     602              :         /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
     603        17630 :         LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
     604              : 
     605              :         /* Release bank lock while doing I/O */
     606        17630 :         LWLockRelease(banklock);
     607              : 
     608              :         /* Do the read */
     609        17630 :         ok = SlruPhysicalReadPage(ctl, pageno, slotno);
     610              : 
     611              :         /* Set the LSNs for this newly read-in page to zero (done whether or not the read succeeded) */
     612        17630 :         SimpleLruZeroLSNs(ctl, slotno);
     613              : 
     614              :         /* Re-acquire bank control lock and update page state */
     615        17630 :         LWLockAcquire(banklock, LW_EXCLUSIVE);
     616              : 
     617              :         Assert(shared->page_number[slotno] == pageno &&
     618              :                shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS &&
     619              :                !shared->page_dirty[slotno]);
     620              : 
     621        17630 :         shared->page_status[slotno] = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY;
     622              : 
     623        17630 :         LWLockRelease(&shared->buffer_locks[slotno].lock);
     624              : 
     625              :         /* Slot state has been reset and the buffer lock released, so now it's okay to ereport if we failed */
     626        17630 :         if (!ok)
     627            1 :             SlruReportIOError(ctl, pageno, opaque_data);
     628              : 
     629        17629 :         SlruRecentlyUsed(shared, slotno);
     630              : 
     631              :         /* update the stats counter of pages not found in SLRU */
     632        17629 :         pgstat_count_slru_blocks_read(shared->slru_stats_idx);
     633              : 
     634        17629 :         return slotno;
     635              :     }
     636              : }
     637              : 
     638              : /*
     639              :  * Find a page in a shared buffer, reading it in if necessary.
     640              :  * The page number must correspond to an already-initialized page.
     641              :  * The caller must intend only read-only access to the page.
     642              :  *
     643              :  * On error, the passed-in 'opaque_data' is passed to the
     644              :  * 'errdetail_for_io_error' callback, to provide details on the operation that
     645              :  * failed.  It is only used for error reporting.
     646              :  *
     647              :  * Return value is the shared-buffer slot number now holding the page.
     648              :  * The buffer's LRU access info is updated.
     649              :  *
     650              :  * Bank control lock must NOT be held at entry, but will be held at exit.
     651              :  * It is unspecified whether the lock will be shared or exclusive (as coded: shared after a fast-path hit, exclusive after falling back to SimpleLruReadPage).
     652              :  */
     653              : int
     654       898382 : SimpleLruReadPage_ReadOnly(SlruDesc *ctl, int64 pageno, const void *opaque_data)
     655              : {
     656       898382 :     SlruShared  shared = ctl->shared;
     657       898382 :     LWLock     *banklock = SimpleLruGetBankLock(ctl, pageno);
     658       898382 :     int         bankno = pageno % ctl->nbanks;
     659       898382 :     int         bankstart = bankno * SLRU_BANK_SIZE;
     660       898382 :     int         bankend = bankstart + SLRU_BANK_SIZE;
     661              : 
     662              :     /* Try to find the page while holding only shared lock */
     663       898382 :     LWLockAcquire(banklock, LW_SHARED);
     664              : 
     665              :     /* See if page is already in a buffer; only the slots of this page's bank are searched */
     666       904923 :     for (int slotno = bankstart; slotno < bankend; slotno++)
     667              :     {
     668       904704 :         if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
     669       903681 :             shared->page_number[slotno] == pageno &&
     670       898163 :             shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS) /* a WRITE_IN_PROGRESS page is accepted here */
     671              :         {
     672              :             /* See comments for SlruRecentlyUsed() */
     673       898163 :             SlruRecentlyUsed(shared, slotno);
     674              : 
     675              :             /* update the stats counter of pages found in the SLRU */
     676       898163 :             pgstat_count_slru_blocks_hit(shared->slru_stats_idx);
     677              : 
     678       898163 :             return slotno;
     679              :         }
     680              :     }
     681              : 
     682              :     /* No luck, so switch to normal exclusive lock and do regular read; the lock is dropped in between, and SimpleLruReadPage looks for the page again */
     683          219 :     LWLockRelease(banklock);
     684          219 :     LWLockAcquire(banklock, LW_EXCLUSIVE);
     685              : 
     686          219 :     return SimpleLruReadPage(ctl, pageno, true, opaque_data);
     687              : }
     688              : 
     689              : /*
     690              :  * Write a page from a shared buffer, if necessary.
     691              :  * Does nothing if the specified slot is not dirty.
     692              :  *
     693              :  * NOTE: only one write attempt is made here.  Hence, it is possible that
     694              :  * the page is still dirty at exit (if someone else re-dirtied it during
     695              :  * the write).  However, we *do* attempt a fresh write even if the page
     696              :  * is already being written; this is for checkpoints.
     697              :  *
     698              :  * Bank lock must be held exclusively at entry, and will be held at exit (it is released while the physical write is in progress).
     699              :  */
     700              : static void
     701      7350274 : SlruInternalWritePage(SlruDesc *ctl, int slotno, SlruWriteAll fdata)
     702              : {
     703      7350274 :     SlruShared  shared = ctl->shared;
     704      7350274 :     int64       pageno = shared->page_number[slotno];
     705      7350274 :     int         bankno = SlotGetBankNumber(slotno);
     706              :     bool        ok;
     707              : 
     708              :     Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
     709              :     Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(ctl, pageno), LW_EXCLUSIVE));
     710              : 
     711              :     /* If a write is in progress, wait for it to finish (stop waiting if the slot no longer holds our page) */
     712      7350275 :     while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
     713            1 :            shared->page_number[slotno] == pageno)
     714              :     {
     715            1 :         SimpleLruWaitIO(ctl, slotno);
     716              :     }
     717              : 
     718              :     /*
     719              :      * Do nothing if page is not dirty, or if buffer no longer contains the
     720              :      * same page we were called for.
     721              :      */
     722      7350274 :     if (!shared->page_dirty[slotno] ||
     723      7346772 :         shared->page_status[slotno] != SLRU_PAGE_VALID ||
     724      7346772 :         shared->page_number[slotno] != pageno)
     725         3502 :         return;
     726              : 
     727              :     /*
     728              :      * Mark the slot write-busy, and clear the dirtybit.  After this point, a
     729              :      * transaction status update on this page will mark it dirty again.
     730              :      */
     731      7346772 :     shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
     732      7346772 :     shared->page_dirty[slotno] = false;
     733              : 
     734              :     /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
     735      7346772 :     LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
     736              : 
     737              :     /* Release bank lock while doing I/O */
     738      7346772 :     LWLockRelease(&shared->bank_locks[bankno].lock);
     739              : 
     740              :     /* Do the write */
     741      7346772 :     ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata);
     742              : 
     743              :     /* If we failed, and we're in a flush, better close the files.  NOTE(review): on fsync failure SlruPhysicalWritePage has already closed its fd, which may be one of these -- confirm the second close is harmless */
     744      7346772 :     if (!ok && fdata)
     745              :     {
     746            0 :         for (int i = 0; i < fdata->num_files; i++)
     747            0 :             CloseTransientFile(fdata->fd[i]);
     748              :     }
     749              : 
     750              :     /* Re-acquire bank lock and update page state */
     751      7346772 :     LWLockAcquire(&shared->bank_locks[bankno].lock, LW_EXCLUSIVE);
     752              : 
     753              :     Assert(shared->page_number[slotno] == pageno &&
     754              :            shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS);
     755              : 
     756              :     /* If we failed to write, mark the page dirty again */
     757      7346772 :     if (!ok)
     758            0 :         shared->page_dirty[slotno] = true;
     759              : 
     760      7346772 :     shared->page_status[slotno] = SLRU_PAGE_VALID;
     761              : 
     762      7346772 :     LWLockRelease(&shared->buffer_locks[slotno].lock);
     763              : 
     764              :     /* Page state is restored and the buffer lock released, so now it's okay to ereport if we failed */
     765      7346772 :     if (!ok)
     766            0 :         SlruReportIOError(ctl, pageno, NULL);
     767              : 
     768              :     /* If part of a checkpoint (fdata is set), count this as an SLRU buffer written. */
     769      7346772 :     if (fdata)
     770              :     {
     771         3160 :         CheckpointStats.ckpt_slru_written++;
     772         3160 :         PendingCheckpointerStats.slru_written++;
     773              :     }
     774              : }
     775              : 
     776              : /*
     777              :  * Wrapper around SlruInternalWritePage, for external callers.
     778              :  * fdata is always passed as NULL here.  The slot must not be empty.
     779              :  */
     780              : void
     781          340 : SimpleLruWritePage(SlruDesc *ctl, int slotno)
     782              : {
     783              :     Assert(ctl->shared->page_status[slotno] != SLRU_PAGE_EMPTY);
     784              : 
     785          340 :     SlruInternalWritePage(ctl, slotno, NULL);
     786          340 : }
     787              : 
     788              : /*
     789              :  * Return whether the given page exists on disk.
     790              :  *
     791              :  * A false return means that either the file does not exist, or that it's not
     792              :  * large enough to contain the given page.  NOTE(review): a failed close of the file also returns false, without reporting the error.
     793              :  */
     794              : bool
     795           66 : SimpleLruDoesPhysicalPageExist(SlruDesc *ctl, int64 pageno)
     796              : {
     797           66 :     int64       segno = pageno / SLRU_PAGES_PER_SEGMENT;
     798           66 :     int         rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
     799           66 :     int         offset = rpageno * BLCKSZ;
     800              :     char        path[MAXPGPATH];
     801              :     int         fd;
     802              :     bool        result;
     803              :     off_t       endpos;
     804              : 
     805              :     /* update the stats counter of checked pages */
     806           66 :     pgstat_count_slru_blocks_exists(ctl->shared->slru_stats_idx);
     807              : 
     808           66 :     SlruFileName(ctl, path, segno);
     809              : 
     810           66 :     fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
     811           66 :     if (fd < 0)
     812              :     {
     813              :         /* expected: file doesn't exist */
     814           26 :         if (errno == ENOENT)
     815           26 :             return false;
     816              : 
     817              :         /* report error normally; SlruReportIOError raises ERROR for this cause */
     818            0 :         slru_errcause = SLRU_OPEN_FAILED;
     819            0 :         slru_errno = errno;
     820            0 :         SlruReportIOError(ctl, pageno, NULL);
     821              :     }
     822              : 
     823           40 :     if ((endpos = lseek(fd, 0, SEEK_END)) < 0)
     824              :     {
     825            0 :         slru_errcause = SLRU_SEEK_FAILED;
     826            0 :         slru_errno = errno;
     827            0 :         SlruReportIOError(ctl, pageno, NULL);
     828              :     }
     829              : 
     830           40 :     result = endpos >= (off_t) (offset + BLCKSZ); /* true if the file extends through the end of this page */
     831              : 
     832           40 :     if (CloseTransientFile(fd) != 0)
     833              :     {
     834            0 :         slru_errcause = SLRU_CLOSE_FAILED;
     835            0 :         slru_errno = errno;
     836            0 :         return false; /* NOTE(review): error cause is saved but never reported, unlike the open/seek failures above */
     837              :     }
     838              : 
     839           40 :     return result;
     840              : }
     841              : 
     842              : /*
     843              :  * Physical read of a (previously existing) page into a buffer slot
     844              :  *
     845              :  * On failure, we cannot just ereport(ERROR) since caller has put state in
     846              :  * shared memory that must be undone.  So, we return false and save enough
     847              :  * info in static variables (slru_errcause, slru_errno) to let SlruReportIOError make the report.
     848              :  *
     849              :  * For now, assume it's not worth keeping a file pointer open across
     850              :  * read/write operations.  We could cache one virtual file pointer ...
     851              :  */
     852              : static bool
     853        17630 : SlruPhysicalReadPage(SlruDesc *ctl, int64 pageno, int slotno)
     854              : {
     855        17630 :     SlruShared  shared = ctl->shared;
     856        17630 :     int64       segno = pageno / SLRU_PAGES_PER_SEGMENT;
     857        17630 :     int         rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
     858        17630 :     off_t       offset = rpageno * BLCKSZ;
     859              :     char        path[MAXPGPATH];
     860              :     int         fd;
     861              : 
     862        17630 :     SlruFileName(ctl, path, segno);
     863              : 
     864              :     /*
     865              :      * In a crash-and-restart situation, it's possible for us to receive
     866              :      * commands to set the commit status of transactions whose bits are in
     867              :      * already-truncated segments of the commit log (see notes in
     868              :      * SlruPhysicalWritePage).  Hence, if we are InRecovery, allow the case
     869              :      * where the file doesn't exist, and return zeroes instead.
     870              :      */
     871        17630 :     fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
     872        17630 :     if (fd < 0)
     873              :     {
     874            1 :         if (errno != ENOENT || !InRecovery)
     875              :         {
     876            1 :             slru_errcause = SLRU_OPEN_FAILED;
     877            1 :             slru_errno = errno;
     878            1 :             return false;
     879              :         }
     880              : 
     881            0 :         ereport(LOG,
     882              :                 (errmsg("file \"%s\" doesn't exist, reading as zeroes",
     883              :                         path)));
     884            0 :         MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
     885            0 :         return true;
     886              :     }
     887              : 
     888        17629 :     errno = 0; /* so that a short read, which sets no errno, is saved below as slru_errno == 0 */
     889        17629 :     pgstat_report_wait_start(WAIT_EVENT_SLRU_READ);
     890        17629 :     if (pg_pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
     891              :     {
     892            0 :         pgstat_report_wait_end();
     893            0 :         slru_errcause = SLRU_READ_FAILED;
     894            0 :         slru_errno = errno;
     895            0 :         CloseTransientFile(fd);
     896            0 :         return false;
     897              :     }
     898        17629 :     pgstat_report_wait_end();
     899              : 
     900        17629 :     if (CloseTransientFile(fd) != 0)
     901              :     {
     902            0 :         slru_errcause = SLRU_CLOSE_FAILED;
     903            0 :         slru_errno = errno;
     904            0 :         return false;
     905              :     }
     906              : 
     907        17629 :     return true;
     908              : }
     909              : 
     910              : /*
     911              :  * Physical write of a page from a buffer slot
     912              :  *
     913              :  * On failure, we cannot just ereport(ERROR) since caller has put state in
     914              :  * shared memory that must be undone.  So, we return false and save enough
     915              :  * info in static variables (slru_errcause, slru_errno) to let SlruReportIOError make the report.
     916              :  *
     917              :  * For now, assume it's not worth keeping a file pointer open across
     918              :  * independent read/write operations.  We do batch operations during
     919              :  * SimpleLruWriteAll, though.
     920              :  *
     921              :  * fdata is NULL for a standalone write, pointer to open-file info during
     922              :  * SimpleLruWriteAll.
     923              :  */
     924              : static bool
     925      7346772 : SlruPhysicalWritePage(SlruDesc *ctl, int64 pageno, int slotno, SlruWriteAll fdata)
     926              : {
     927      7346772 :     SlruShared  shared = ctl->shared;
     928      7346772 :     int64       segno = pageno / SLRU_PAGES_PER_SEGMENT;
     929      7346772 :     int         rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
     930      7346772 :     off_t       offset = rpageno * BLCKSZ;
     931              :     char        path[MAXPGPATH];
     932      7346772 :     int         fd = -1;
     933              : 
     934              :     /* update the stats counter of written pages */
     935      7346772 :     pgstat_count_slru_blocks_written(shared->slru_stats_idx);
     936              : 
     937              :     /*
     938              :      * Honor the write-WAL-before-data rule, if appropriate, so that we do not
     939              :      * write out data before associated WAL records.  This is the same action
     940              :      * performed during FlushBuffer() in the main buffer manager.
     941              :      */
     942      7346772 :     if (shared->group_lsn != NULL)
     943              :     {
     944              :         /*
     945              :          * We must determine the largest async-commit LSN for the page. This
     946              :          * is a bit tedious, but since this entire function is a slow path
     947              :          * anyway, it seems better to do this here than to maintain a per-page
     948              :          * LSN variable (which'd need an extra comparison in the
     949              :          * transaction-commit path).
     950              :          */
     951              :         XLogRecPtr  max_lsn;
     952              :         int         lsnindex;
     953              : 
     954       433456 :         lsnindex = slotno * shared->lsn_groups_per_page;
     955       433456 :         max_lsn = shared->group_lsn[lsnindex++];
     956    443858944 :         for (int lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
     957              :         {
     958    443425488 :             XLogRecPtr  this_lsn = shared->group_lsn[lsnindex++];
     959              : 
     960    443425488 :             if (max_lsn < this_lsn)
     961        55676 :                 max_lsn = this_lsn;
     962              :         }
     963              : 
     964       433456 :         if (XLogRecPtrIsValid(max_lsn))
     965              :         {
     966              :             /*
     967              :              * As noted above, elog(ERROR) is not acceptable here, so if
     968              :              * XLogFlush were to fail, we must PANIC.  This isn't much of a
     969              :              * restriction because XLogFlush is just about all critical
     970              :              * section anyway, but let's make sure.
     971              :              */
     972          564 :             START_CRIT_SECTION();
     973          564 :             XLogFlush(max_lsn);
     974          564 :             END_CRIT_SECTION();
     975              :         }
     976              :     }
     977              : 
     978              :     /*
     979              :      * During a SimpleLruWriteAll, we may already have the desired file open.
     980              :      */
     981      7346772 :     if (fdata)
     982              :     {
     983         3244 :         for (int i = 0; i < fdata->num_files; i++)
     984              :         {
     985          323 :             if (fdata->segno[i] == segno)
     986              :             {
     987          239 :                 fd = fdata->fd[i];
     988          239 :                 break;
     989              :             }
     990              :         }
     991              :     }
     992              : 
     993      7346772 :     if (fd < 0)
     994              :     {
     995              :         /*
     996              :          * If the file doesn't already exist, we should create it.  It is
     997              :          * possible for this to need to happen when writing a page that's not
     998              :          * first in its segment; we assume the OS can cope with that. (Note:
     999              :          * it might seem that it'd be okay to create files only when
    1000              :          * SimpleLruZeroPage is called for the first page of a segment.
    1001              :          * However, if after a crash and restart the REDO logic elects to
    1002              :          * replay the log from a checkpoint before the latest one, then it's
    1003              :          * possible that we will get commands to set transaction status of
    1004              :          * transactions that have already been truncated from the commit log.
    1005              :          * Easiest way to deal with that is to accept references to
    1006              :          * nonexistent files here and in SlruPhysicalReadPage.)
    1007              :          *
    1008              :          * Note: it is possible for more than one backend to be executing this
    1009              :          * code simultaneously for different pages of the same file. Hence,
    1010              :          * don't use O_EXCL or O_TRUNC or anything like that.
    1011              :          */
    1012      7346533 :         SlruFileName(ctl, path, segno);
    1013      7346533 :         fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY);
    1014      7346533 :         if (fd < 0)
    1015              :         {
    1016            0 :             slru_errcause = SLRU_OPEN_FAILED;
    1017            0 :             slru_errno = errno;
    1018            0 :             return false;
    1019              :         }
    1020              : 
    1021      7346533 :         if (fdata)
    1022              :         {
    1023         2921 :             if (fdata->num_files < MAX_WRITEALL_BUFFERS)
    1024              :             {
    1025         2921 :                 fdata->fd[fdata->num_files] = fd;
    1026         2921 :                 fdata->segno[fdata->num_files] = segno;
    1027         2921 :                 fdata->num_files++;
    1028              :             }
    1029              :             else
    1030              :             {
    1031              :                 /*
    1032              :                  * In the unlikely event that we exceed MAX_WRITEALL_BUFFERS,
    1033              :                  * fall back to treating it as a standalone write (with the local fdata cleared, fd is closed before we return).
    1034              :                  */
    1035            0 :                 fdata = NULL;
    1036              :             }
    1037              :         }
    1038              :     }
    1039              : 
    1040      7346772 :     errno = 0;
    1041      7346772 :     pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
    1042      7346772 :     if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
    1043              :     {
    1044            0 :         pgstat_report_wait_end();
    1045              :         /* if write didn't set errno, assume problem is no disk space */
    1046            0 :         if (errno == 0)
    1047            0 :             errno = ENOSPC;
    1048            0 :         slru_errcause = SLRU_WRITE_FAILED;
    1049            0 :         slru_errno = errno;
    1050            0 :         if (!fdata)
    1051            0 :             CloseTransientFile(fd);
    1052            0 :         return false;
    1053              :     }
    1054      7346772 :     pgstat_report_wait_end();
    1055              : 
    1056              :     /* Queue up a sync request for the checkpointer, if this SLRU has a sync handler. */
    1057      7346772 :     if (ctl->options.sync_handler != SYNC_HANDLER_NONE)
    1058              :     {
    1059              :         FileTag     tag;
    1060              : 
    1061       434410 :         INIT_SLRUFILETAG(tag, ctl->options.sync_handler, segno);
    1062       434410 :         if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
    1063              :         {
    1064              :             /* No space to enqueue sync request.  Do it synchronously. */
    1065            2 :             pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC);
    1066            2 :             if (pg_fsync(fd) != 0)
    1067              :             {
    1068            0 :                 pgstat_report_wait_end();
    1069            0 :                 slru_errcause = SLRU_FSYNC_FAILED;
    1070            0 :                 slru_errno = errno;
    1071            0 :                 CloseTransientFile(fd); /* NOTE(review): closed even when fdata is set, unlike the write-failure path above; the caller then closes every fdata->fd[i] again -- confirm */
    1072            0 :                 return false;
    1073              :             }
    1074            2 :             pgstat_report_wait_end();
    1075              :         }
    1076              :     }
    1077              : 
    1078              :     /* Close file, unless part of flush request. */
    1079      7346772 :     if (!fdata)
    1080              :     {
    1081      7343612 :         if (CloseTransientFile(fd) != 0)
    1082              :         {
    1083            0 :             slru_errcause = SLRU_CLOSE_FAILED;
    1084            0 :             slru_errno = errno;
    1085            0 :             return false;
    1086              :         }
    1087              :     }
    1088              : 
    1089      7346772 :     return true;
    1090              : }
    1091              : 
    1092              : /*
    1093              :  * Issue the error message after failure of SlruPhysicalReadPage or
    1094              :  * SlruPhysicalWritePage.  Call this after cleaning up shared-memory state.
                      :  *
                      :  * ctl: descriptor of the SLRU the failed I/O belonged to; used to build the
                      :  *      segment file name and to reach the errdetail callback.
                      :  * pageno: page whose I/O failed; the segment number and the byte offset
                      :  *      within the segment that appear in the message are derived from it.
                      :  * opaque_data: if not NULL, passed to ctl->options.errdetail_for_io_error
                      :  *      to attach caller-specific detail to the report.
                      :  *
                      :  * The failure is described by slru_errcause and slru_errno, which the
                      :  * failing routine must have set.  errno is restored from slru_errno before
                      :  * reporting so that %m expands to the original failure.  Every cause is
                      :  * reported at ERROR, except an fsync failure, whose level is chosen by
                      :  * data_sync_elevel(ERROR).
                      :  *
                      :  * All primary messages here start in lower case and carry no trailing
                      :  * period; keep the read and write messages parallel.
    1095              :  */
    1096              : static void
    1097            1 : SlruReportIOError(SlruDesc *ctl, int64 pageno, const void *opaque_data)
    1098              : {
    1099            1 :     int64       segno = pageno / SLRU_PAGES_PER_SEGMENT;
    1100            1 :     int         rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
    1101            1 :     int         offset = rpageno * BLCKSZ;
    1102              :     char        path[MAXPGPATH];
    1103              : 
    1104            1 :     SlruFileName(ctl, path, segno);
    1105            1 :     errno = slru_errno;
    1106            1 :     switch (slru_errcause)
    1107              :     {
    1108            1 :         case SLRU_OPEN_FAILED:
    1109            1 :             ereport(ERROR,
    1110              :                     (errcode_for_file_access(),
    1111              :                      errmsg("could not open file \"%s\": %m", path),
    1112              :                      opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
    1113              :             break;
    1114            0 :         case SLRU_SEEK_FAILED:
    1115            0 :             ereport(ERROR,
    1116              :                     (errcode_for_file_access(),
    1117              :                      errmsg("could not seek in file \"%s\" to offset %d: %m",
    1118              :                             path, offset),
    1119              :                      opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
    1120              :             break;
    1121            0 :         case SLRU_READ_FAILED:
    1122            0 :             if (errno)
    1123            0 :                 ereport(ERROR,
    1124              :                         (errcode_for_file_access(),
    1125              :                          errmsg("could not read from file \"%s\" at offset %d: %m",
    1126              :                                 path, offset),
    1127              :                          opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
    1128              :             else
    1129            0 :                 ereport(ERROR,
    1130              :                         (errmsg("could not read from file \"%s\" at offset %d: read too few bytes",
    1131              :                                 path, offset),
    1132              :                          opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
    1133              :             break;
    1134            0 :         case SLRU_WRITE_FAILED:
    1135            0 :             if (errno)
    1136            0 :                 ereport(ERROR,
    1137              :                         (errcode_for_file_access(),
    1138              :                          errmsg("could not write to file \"%s\" at offset %d: %m",
    1139              :                                 path, offset),
    1140              :                          opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
    1141              :             else
    1142            0 :                 ereport(ERROR,
    1143              :                         (errmsg("could not write to file \"%s\" at offset %d: wrote too few bytes",
    1144              :                                 path, offset),
    1145              :                          opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
    1146              :             break;
    1147            0 :         case SLRU_FSYNC_FAILED:
    1148            0 :             ereport(data_sync_elevel(ERROR),
    1149              :                     (errcode_for_file_access(),
    1150              :                      errmsg("could not fsync file \"%s\": %m",
    1151              :                             path),
    1152              :                      opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
    1153            0 :             break;
    1154            0 :         case SLRU_CLOSE_FAILED:
    1155            0 :             ereport(ERROR,
    1156              :                     (errcode_for_file_access(),
    1157              :                      errmsg("could not close file \"%s\": %m",
    1158              :                             path),
    1159              :                      opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
    1160              :             break;
    1161            0 :         default:
    1162              :             /* can't get here, we trust */
    1163            0 :             elog(ERROR, "unrecognized SimpleLru error cause: %d",
    1164              :                  (int) slru_errcause);
    1165              :             break;
    1166              :     }
    1167            0 : }
    1168              : 
    1169              : /*
    1170              :  * Mark a buffer slot "most recently used".
                      :  *
                      :  * shared: control state holding the per-bank and per-slot LRU counters.
                      :  * slotno: slot being touched; it must not be in state EMPTY (asserted).
                      :  *
                      :  * If the slot's count differs from its bank's current count, the bank
                      :  * count is incremented and the slot is stamped with the new value;
                      :  * otherwise nothing is written.
    1171              :  */
    1172              : static inline void
    1173      8640810 : SlruRecentlyUsed(SlruShared shared, int slotno)
    1174              : {
    1175      8640810 :     int         bankno = SlotGetBankNumber(slotno);
    1176      8640810 :     int         new_lru_count = shared->bank_cur_lru_count[bankno];
    1177              : 
    1178              :     Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
    1179              : 
    1180              :     /*
    1181              :      * The reason for the if-test is that there are often many consecutive
    1182              :      * accesses to the same page (particularly the latest page).  By
    1183              :      * suppressing useless increments of bank_cur_lru_count, we reduce the
    1184              :      * probability that old pages' counts will "wrap around" and make them
    1185              :      * appear recently used.
    1186              :      *
    1187              :      * We allow this code to be executed concurrently by multiple processes
    1188              :      * within SimpleLruReadPage_ReadOnly().  As long as int reads and writes
    1189              :      * are atomic, this should not cause any completely-bogus values to enter
    1190              :      * the computation.  However, it is possible for either bank_cur_lru_count
    1191              :      * or individual page_lru_count entries to be "reset" to lower values than
    1192              :      * they should have, in case a process is delayed while it executes this
    1193              :      * function.  With care in SlruSelectLRUPage(), this does little harm, and
    1194              :      * in any case the absolute worst possible consequence is a nonoptimal
    1195              :      * choice of page to evict.  The gain from allowing concurrent reads of
    1196              :      * SLRU pages seems worth it.
    1197              :      */
    1198      8640810 :     if (new_lru_count != shared->page_lru_count[slotno])
    1199              :     {
    1200      7484186 :         shared->bank_cur_lru_count[bankno] = ++new_lru_count;
    1201      7484186 :         shared->page_lru_count[slotno] = new_lru_count;
    1202              :     }
    1203      8640810 : }
    1204              : 
    1205              : /*
    1206              :  * Select the slot to re-use when we need a free slot for the given page.
    1207              :  *
    1208              :  * The target page number is passed not only because we need to know the
    1209              :  * correct bank to use, but also because we need to consider the possibility
    1210              :  * that some other process reads in the target page while we are doing I/O to
    1211              :  * free a slot.  Hence, check or recheck to see if any slot already holds the
    1212              :  * target page, and return that slot if so.  Thus, the returned slot is
    1213              :  * *either* a slot already holding the pageno (could be any state except
    1214              :  * EMPTY), *or* a freeable slot (state EMPTY or CLEAN).
    1215              :  *
    1216              :  * The correct bank lock must be held at entry, and will be held at exit.
                      :  *
                      :  * ctl: descriptor of the SLRU; its shared state holds the slot arrays.
                      :  * pageno: page that needs a slot; only the slots of bank
                      :  *      (pageno % ctl->nbanks) are examined.
                      :  *
                      :  * Returns the chosen slot number.  Before returning, this may write out a
                      :  * dirty victim page or wait for an in-progress I/O on a slot; after either,
                      :  * the whole search is redone from the top rather than trusting earlier
                      :  * results.
    1217              :  */
    1218              : static int
    1219      7742649 : SlruSelectLRUPage(SlruDesc *ctl, int64 pageno)
    1220              : {
    1221      7742649 :     SlruShared  shared = ctl->shared;
    1222              : 
    1223              :     /* Outer loop handles restart after I/O */
    1224              :     for (;;)
    1225      7343190 :     {
    1226              :         int         cur_count;
    1227     15085839 :         int         bestvalidslot = 0;  /* keep compiler quiet */
    1228     15085839 :         int         best_valid_delta = -1;
    1229     15085839 :         int64       best_valid_page_number = 0; /* keep compiler quiet */
    1230     15085839 :         int         bestinvalidslot = 0;    /* keep compiler quiet */
    1231     15085839 :         int         best_invalid_delta = -1;
    1232     15085839 :         int64       best_invalid_page_number = 0;   /* keep compiler quiet */
    1233     15085839 :         int         bankno = pageno % ctl->nbanks;
    1234     15085839 :         int         bankstart = bankno * SLRU_BANK_SIZE;
    1235     15085839 :         int         bankend = bankstart + SLRU_BANK_SIZE;
    1236              : 
    1237              :         Assert(LWLockHeldByMe(SimpleLruGetBankLock(ctl, pageno)));
    1238              : 
    1239              :         /* See if page already has a buffer assigned */
    1240    251030207 :         for (int slotno = bankstart; slotno < bankend; slotno++)
    1241              :         {
    1242    236323598 :             if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
    1243    236266969 :                 shared->page_number[slotno] == pageno)
    1244       379230 :                 return slotno;
    1245              :         }
    1246              : 
    1247              :         /*
    1248              :          * If we find any EMPTY slot, just select that one. Else choose a
    1249              :          * victim page to replace.  We normally take the least recently used
    1250              :          * valid page, but we will never take the slot containing
    1251              :          * latest_page_number, even if it appears least recently used.  We
    1252              :          * will select a slot that is already I/O busy only if there is no
    1253              :          * other choice: a read-busy slot will not be least recently used once
    1254              :          * the read finishes, and waiting for an I/O on a write-busy slot is
    1255              :          * inferior to just picking some other slot.  Testing shows the slot
    1256              :          * we pick instead will often be clean, allowing us to begin a read at
    1257              :          * once.
    1258              :          *
    1259              :          * Normally the page_lru_count values will all be different and so
    1260              :          * there will be a well-defined LRU page.  But since we allow
    1261              :          * concurrent execution of SlruRecentlyUsed() within
    1262              :          * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages
    1263              :          * acquire the same lru_count values.  In that case we break ties by
    1264              :          * choosing the furthest-back page.
    1265              :          *
    1266              :          * Notice that this next line forcibly advances cur_lru_count to a
    1267              :          * value that is certainly beyond any value that will be in the
    1268              :          * page_lru_count array after the loop finishes.  This ensures that
    1269              :          * the next execution of SlruRecentlyUsed will mark the page newly
    1270              :          * used, even if it's for a page that has the current counter value.
    1271              :          * That gets us back on the path to having good data when there are
    1272              :          * multiple pages with the same lru_count.
    1273              :          */
    1274     14706609 :         cur_count = (shared->bank_cur_lru_count[bankno])++;
    1275    249955772 :         for (int slotno = bankstart; slotno < bankend; slotno++)
    1276              :         {
    1277              :             int         this_delta;
    1278              :             int64       this_page_number;
    1279              : 
    1280    235252891 :             if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
    1281         3728 :                 return slotno;
    1282              : 
    1283    235249163 :             this_delta = cur_count - shared->page_lru_count[slotno];
    1284    235249163 :             if (this_delta < 0)
    1285              :             {
    1286              :                 /*
    1287              :                  * Clean up in case shared updates have caused cur_count
    1288              :                  * increments to get "lost".  We back off the page counts,
    1289              :                  * rather than trying to increase cur_count, to avoid any
    1290              :                  * question of infinite loops or failure in the presence of
    1291              :                  * wrapped-around counts.
    1292              :                  */
    1293            0 :                 shared->page_lru_count[slotno] = cur_count;
    1294            0 :                 this_delta = 0;
    1295              :             }
    1296              : 
    1297              :             /*
    1298              :              * If this page is the one most recently zeroed, don't consider it
    1299              :              * an eviction candidate. See comments in SimpleLruZeroPage for an
    1300              :              * explanation about the lack of a memory barrier here.
    1301              :              */
    1302    235249163 :             this_page_number = shared->page_number[slotno];
    1303    235249163 :             if (this_page_number ==
    1304    235249163 :                 pg_atomic_read_u64(&shared->latest_page_number))
    1305         8914 :                 continue;
    1306              : 
    1307    235240249 :             if (shared->page_status[slotno] == SLRU_PAGE_VALID)
    1308              :             {
    1309    235240127 :                 if (this_delta > best_valid_delta ||
    1310            0 :                     (this_delta == best_valid_delta &&
    1311            0 :                      ctl->options.PagePrecedes(this_page_number,
    1312              :                                                best_valid_page_number)))
    1313              :                 {
    1314     32388279 :                     bestvalidslot = slotno;
    1315     32388279 :                     best_valid_delta = this_delta;
    1316     32388279 :                     best_valid_page_number = this_page_number;
    1317              :                 }
    1318              :             }
    1319              :             else
    1320              :             {
    1321          122 :                 if (this_delta > best_invalid_delta ||
    1322            0 :                     (this_delta == best_invalid_delta &&
    1323            0 :                      ctl->options.PagePrecedes(this_page_number,
    1324              :                                                best_invalid_page_number)))
    1325              :                 {
    1326          122 :                     bestinvalidslot = slotno;
    1327          122 :                     best_invalid_delta = this_delta;
    1328          122 :                     best_invalid_page_number = this_page_number;
    1329              :                 }
    1330              :             }
    1331              :         }
    1332              : 
    1333              :         /*
    1334              :          * If all pages (except possibly the latest one) are I/O busy, we'll
    1335              :          * have to wait for an I/O to complete and then retry.  In that
    1336              :          * unhappy case, we choose to wait for the I/O on the least recently
    1337              :          * used slot, on the assumption that it was likely initiated first of
    1338              :          * all the I/Os in progress and may therefore finish first.
    1339              :          */
    1340     14702881 :         if (best_valid_delta < 0)
    1341              :         {
    1342            0 :             SimpleLruWaitIO(ctl, bestinvalidslot);
    1343            0 :             continue;
    1344              :         }
    1345              : 
    1346              :         /*
    1347              :          * If the selected page is clean, we're set.
    1348              :          */
    1349     14702881 :         if (!shared->page_dirty[bestvalidslot])
    1350      7359691 :             return bestvalidslot;
    1351              : 
    1352              :         /*
    1353              :          * Write the page.
    1354              :          */
    1355      7343190 :         SlruInternalWritePage(ctl, bestvalidslot, NULL);
    1356              : 
    1357              :         /*
    1358              :          * Now loop back and try again.  This is the easiest way of dealing
    1359              :          * with corner cases such as the victim page being re-dirtied while we
    1360              :          * wrote it.
    1361              :          */
    1362              :     }
    1363              : }
    1364              : 
    1365              : /*
    1366              :  * Write dirty pages to disk during checkpoint or database shutdown.  Flushing
    1367              :  * is deferred until the next call to ProcessSyncRequests(), though we do fsync
    1368              :  * the containing directory here to make sure that newly created directory
    1369              :  * entries are on disk.
                      :  *
                      :  * ctl: descriptor of the SLRU to write out.
                      :  * allow_redirtied: if false, assert after each write that the slot is
                      :  *      either EMPTY or VALID and not dirty; pass true where another process
                      :  *      may already have re-dirtied the page.
                      :  *
                      :  * Slots are visited in index order while holding, exclusively, the lock of
                      :  * the bank each slot belongs to; the lock is swapped whenever the bank
                      :  * number changes.  File descriptors recorded in fdata during the writes are
                      :  * closed after the scan.  If more than one close fails, only the last
                      :  * failure is reported, because each failure overwrites the saved cause,
                      :  * errno and page number.
    1370              :  */
    1371              : void
    1372         9737 : SimpleLruWriteAll(SlruDesc *ctl, bool allow_redirtied)
    1373              : {
    1374         9737 :     SlruShared  shared = ctl->shared;
    1375              :     SlruWriteAllData fdata;
    1376         9737 :     int64       pageno = 0;
    1377         9737 :     int         prevbank = SlotGetBankNumber(0);
    1378              :     bool        ok;
    1379              : 
    1380              :     /* update the stats counter of flushes */
    1381         9737 :     pgstat_count_slru_flush(shared->slru_stats_idx);
    1382              : 
    1383              :     /*
    1384              :      * Find and write dirty pages
    1385              :      */
    1386         9737 :     fdata.num_files = 0;
    1387              : 
    1388         9737 :     LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
    1389              : 
    1390       237913 :     for (int slotno = 0; slotno < shared->num_slots; slotno++)
    1391              :     {
    1392       228176 :         int         curbank = SlotGetBankNumber(slotno);
    1393              : 
    1394              :         /*
    1395              :          * If the current bank lock is not same as the previous bank lock then
    1396              :          * release the previous lock and acquire the new lock.
    1397              :          */
    1398       228176 :         if (curbank != prevbank)
    1399              :         {
    1400         4524 :             LWLockRelease(&shared->bank_locks[prevbank].lock);
    1401         4524 :             LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
    1402         4524 :             prevbank = curbank;
    1403              :         }
    1404              : 
    1405              :         /* Do nothing if slot is unused */
    1406       228176 :         if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
    1407       221514 :             continue;
    1408              : 
    1409         6662 :         SlruInternalWritePage(ctl, slotno, &fdata);
    1410              : 
    1411              :         /*
    1412              :          * In some places (e.g. checkpoints), we cannot assert that the slot
    1413              :          * is clean now, since another process might have re-dirtied it
    1414              :          * already.  That's okay.
    1415              :          */
    1416              :         Assert(allow_redirtied ||
    1417              :                shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
    1418              :                (shared->page_status[slotno] == SLRU_PAGE_VALID &&
    1419              :                 !shared->page_dirty[slotno]));
    1420              :     }
    1421              : 
    1422         9737 :     LWLockRelease(&shared->bank_locks[prevbank].lock);
    1423              : 
    1424              :     /*
    1425              :      * Now close any files that were open
    1426              :      */
    1427         9737 :     ok = true;
    1428        12658 :     for (int i = 0; i < fdata.num_files; i++)
    1429              :     {
    1430         2921 :         if (CloseTransientFile(fdata.fd[i]) != 0)
    1431              :         {
    1432            0 :             slru_errcause = SLRU_CLOSE_FAILED;
    1433            0 :             slru_errno = errno;
    1434            0 :             pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
    1435            0 :             ok = false;
    1436              :         }
    1437              :     }
    1438         9737 :     if (!ok)
    1439            0 :         SlruReportIOError(ctl, pageno, NULL);
    1440              : 
    1441              :     /* Ensure that directory entries for new files are on disk. */
    1442         9737 :     if (ctl->options.sync_handler != SYNC_HANDLER_NONE)
    1443         7796 :         fsync_fname(ctl->options.Dir, true);
    1444         9737 : }
    1445              : 
    1446              : /*
    1447              :  * Remove all segments before the one holding the passed page number
    1448              :  *
    1449              :  * All SLRUs prevent concurrent calls to this function, either with an LWLock
    1450              :  * or by calling it only as part of a checkpoint.  Mutual exclusion must begin
    1451              :  * before computing cutoffPage.  Mutual exclusion must end after any limit
    1452              :  * update that would permit other backends to write fresh data into the
    1453              :  * segment immediately preceding the one containing cutoffPage.  Otherwise,
    1454              :  * when the SLRU is quite full, SimpleLruTruncate() might delete that segment
    1455              :  * after it has accrued freshly-written data.
                      :  *
                      :  * ctl: descriptor of the SLRU to truncate.
                      :  * cutoffPage: buffered pages that ctl->options.PagePrecedes() orders before
                      :  *      this page are dropped from the buffers; the value is also handed to
                      :  *      the directory scan that removes old segment files.
                      :  *
                      :  * If the latest page itself precedes cutoffPage, a LOG message is emitted
                      :  * and the function returns without removing anything.  Whenever a buffered
                      :  * page had to be written or waited for, the bank lock is released and the
                      :  * whole scan, including that safety check, starts over.
    1456              :  */
    1457              : void
    1458         2015 : SimpleLruTruncate(SlruDesc *ctl, int64 cutoffPage)
    1459              : {
    1460         2015 :     SlruShared  shared = ctl->shared;
    1461              :     int         prevbank;
    1462              : 
    1463              :     /* update the stats counter of truncates */
    1464         2015 :     pgstat_count_slru_truncate(shared->slru_stats_idx);
    1465              : 
    1466              :     /*
    1467              :      * Scan shared memory and remove any pages preceding the cutoff page, to
    1468              :      * ensure we won't rewrite them later.  (Since this is normally called in
    1469              :      * or just after a checkpoint, any dirty pages should have been flushed
    1470              :      * already ... we're just being extra careful here.)
    1471              :      */
    1472         2097 : restart:
    1473              : 
    1474              :     /*
    1475              :      * An important safety check: the current endpoint page must not be
    1476              :      * eligible for removal.  This check is just a backstop against wraparound
    1477              :      * bugs elsewhere in SLRU handling, so we don't care if we read a slightly
    1478              :      * outdated value; therefore we don't add a memory barrier.
    1479              :      */
    1480         2097 :     if (ctl->options.PagePrecedes(pg_atomic_read_u64(&shared->latest_page_number),
    1481              :                                   cutoffPage))
    1482              :     {
    1483            0 :         ereport(LOG,
    1484              :                 (errmsg("could not truncate directory \"%s\": apparent wraparound",
    1485              :                         ctl->options.Dir)));
    1486            0 :         return;
    1487              :     }
    1488              : 
    1489         2097 :     prevbank = SlotGetBankNumber(0);
    1490         2097 :     LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
    1491        50757 :     for (int slotno = 0; slotno < shared->num_slots; slotno++)
    1492              :     {
    1493        48742 :         int         curbank = SlotGetBankNumber(slotno);
    1494              : 
    1495              :         /*
    1496              :          * If the current bank lock is not same as the previous bank lock then
    1497              :          * release the previous lock and acquire the new lock.
    1498              :          */
    1499        48742 :         if (curbank != prevbank)
    1500              :         {
    1501          989 :             LWLockRelease(&shared->bank_locks[prevbank].lock);
    1502          989 :             LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
    1503          989 :             prevbank = curbank;
    1504              :         }
    1505              : 
    1506        48742 :         if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
    1507        43136 :             continue;
    1508         5606 :         if (!ctl->options.PagePrecedes(shared->page_number[slotno], cutoffPage))
    1509         5300 :             continue;
    1510              : 
    1511              :         /*
    1512              :          * If page is clean, just change state to EMPTY (expected case).
    1513              :          */
    1514          306 :         if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
    1515          306 :             !shared->page_dirty[slotno])
    1516              :         {
    1517          224 :             shared->page_status[slotno] = SLRU_PAGE_EMPTY;
    1518          224 :             continue;
    1519              :         }
    1520              : 
    1521              :         /*
    1522              :          * Hmm, we have (or may have) I/O operations acting on the page, so
    1523              :          * we've got to wait for them to finish and then start again. This is
    1524              :          * the same logic as in SlruSelectLRUPage.  (XXX if page is dirty,
    1525              :          * wouldn't it be OK to just discard it without writing it?
    1526              :          * SlruMayDeleteSegment() uses a stricter qualification, so we might
    1527              :          * not delete this page in the end; even if we don't delete it, we
    1528              :          * won't have cause to read its data again.  For now, keep the logic
    1529              :          * the same as it was.)
    1530              :          */
    1531           82 :         if (shared->page_status[slotno] == SLRU_PAGE_VALID)
    1532           82 :             SlruInternalWritePage(ctl, slotno, NULL);
    1533              :         else
    1534            0 :             SimpleLruWaitIO(ctl, slotno);
    1535              : 
    1536           82 :         LWLockRelease(&shared->bank_locks[prevbank].lock);
    1537           82 :         goto restart;
    1538              :     }
    1539              : 
    1540         2015 :     LWLockRelease(&shared->bank_locks[prevbank].lock);
    1541              : 
    1542              :     /* Now we can remove the old segment(s) */
    1543         2015 :     (void) SlruScanDirectory(ctl, SlruScanDirCbDeleteCutoff, &cutoffPage);
    1544              : }
    1545              : 
    1546              : /*
    1547              :  * Delete an individual SLRU segment.
    1548              :  *
    1549              :  * NB: This does not touch the SLRU buffers themselves, callers have to ensure
    1550              :  * they either can't yet contain anything, or have already been cleaned out.
                      :  *
                      :  * ctl: descriptor of the SLRU that owns the segment.
                      :  * segno: number of the segment whose file is to be removed.
                      :  *
                      :  * When the SLRU has a sync handler, a SYNC_FORGET_REQUEST is registered for
                      :  * the segment before the file is unlinked.  The result of unlink() is not
                      :  * checked, so a failure to remove the file is silent apart from the DEBUG2
                      :  * message emitted beforehand.
    1551              :  */
    1552              : static void
    1553       142003 : SlruInternalDeleteSegment(SlruDesc *ctl, int64 segno)
    1554              : {
    1555              :     char        path[MAXPGPATH];
    1556              : 
    1557              :     /* Forget any fsync requests queued for this segment. */
    1558       142003 :     if (ctl->options.sync_handler != SYNC_HANDLER_NONE)
    1559              :     {
    1560              :         FileTag     tag;
    1561              : 
    1562        13224 :         INIT_SLRUFILETAG(tag, ctl->options.sync_handler, segno);
    1563        13224 :         RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true);
    1564              :     }
    1565              : 
    1566              :     /* Unlink the file. */
    1567       142003 :     SlruFileName(ctl, path, segno);
    1568       142003 :     ereport(DEBUG2, (errmsg_internal("removing file \"%s\"", path)));
    1569       142003 :     unlink(path);
    1570       142003 : }
    1571              : 
    1572              : /*
    1573              :  * Delete an individual SLRU segment, identified by the segment number.
    1574              :  */
    1575              : void
    1576            2 : SlruDeleteSegment(SlruDesc *ctl, int64 segno)
    1577              : {
    1578            2 :     SlruShared  shared = ctl->shared;
    1579            2 :     int         prevbank = SlotGetBankNumber(0);
    1580              :     bool        did_write;
    1581              : 
    1582              :     /* Clean out any possibly existing references to the segment. */
    1583            2 :     LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
    1584            2 : restart:
    1585            2 :     did_write = false;
    1586           34 :     for (int slotno = 0; slotno < shared->num_slots; slotno++)
    1587              :     {
    1588              :         int64       pagesegno;
    1589           32 :         int         curbank = SlotGetBankNumber(slotno);
    1590              : 
    1591              :         /*
    1592              :          * If the current bank lock is not same as the previous bank lock then
    1593              :          * release the previous lock and acquire the new lock.
    1594              :          */
    1595           32 :         if (curbank != prevbank)
    1596              :         {
    1597            0 :             LWLockRelease(&shared->bank_locks[prevbank].lock);
    1598            0 :             LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
    1599            0 :             prevbank = curbank;
    1600              :         }
    1601              : 
    1602           32 :         if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
    1603            0 :             continue;
    1604              : 
    1605           32 :         pagesegno = shared->page_number[slotno] / SLRU_PAGES_PER_SEGMENT;
    1606              :         /* not the segment we're looking for */
    1607           32 :         if (pagesegno != segno)
    1608            7 :             continue;
    1609              : 
    1610              :         /* If page is clean, just change state to EMPTY (expected case). */
    1611           25 :         if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
    1612           25 :             !shared->page_dirty[slotno])
    1613              :         {
    1614           25 :             shared->page_status[slotno] = SLRU_PAGE_EMPTY;
    1615           25 :             continue;
    1616              :         }
    1617              : 
    1618              :         /* Same logic as SimpleLruTruncate() */
    1619            0 :         if (shared->page_status[slotno] == SLRU_PAGE_VALID)
    1620            0 :             SlruInternalWritePage(ctl, slotno, NULL);
    1621              :         else
    1622            0 :             SimpleLruWaitIO(ctl, slotno);
    1623              : 
    1624            0 :         did_write = true;
    1625              :     }
    1626              : 
    1627              :     /*
    1628              :      * Be extra careful and re-check. The IO functions release the bank
    1629              :      * lock, so new pages could have been read in.
    1630              :      */
    1631            2 :     if (did_write)
    1632            0 :         goto restart;
    1633              : 
    1634            2 :     SlruInternalDeleteSegment(ctl, segno);
    1635              : 
    1636            2 :     LWLockRelease(&shared->bank_locks[prevbank].lock);
    1637            2 : }
    1638              : 
    1639              : /*
    1640              :  * Determine whether a segment is okay to delete.
    1641              :  *
    1642              :  * segpage is the first page of the segment, and cutoffPage is the oldest (in
    1643              :  * PagePrecedes order) page in the SLRU containing still-useful data.  Since
    1644              :  * every core PagePrecedes callback implements "wrap around", check the
    1645              :  * segment's first and last pages:
    1646              :  *
    1647              :  * first<cutoff  && last<cutoff:  yes
    1648              :  * first<cutoff  && last>=cutoff: no; cutoff falls inside this segment
    1649              :  * first>=cutoff && last<cutoff:  no; wrap point falls inside this segment
    1650              :  * first>=cutoff && last>=cutoff: no; every page of this segment is too young
    1651              :  */
    1652              : static bool
    1653      1038072 : SlruMayDeleteSegment(SlruDesc *ctl, int64 segpage, int64 cutoffPage)
    1654              : {
    1655      1038072 :     int64       seg_last_page = segpage + SLRU_PAGES_PER_SEGMENT - 1;
    1656              : 
    1657              :     Assert(segpage % SLRU_PAGES_PER_SEGMENT == 0);
    1658              : 
    1659      1180821 :     return (ctl->options.PagePrecedes(segpage, cutoffPage) &&
    1660       142749 :             ctl->options.PagePrecedes(seg_last_page, cutoffPage));
    1661              : }
    1662              : 
    1663              : #ifdef USE_ASSERT_CHECKING
    1664              : static void
    1665              : SlruPagePrecedesTestOffset(SlruDesc *ctl, int per_page, uint32 offset)
    1666              : {
    1667              :     TransactionId lhs,
    1668              :                 rhs;
    1669              :     int64       newestPage,
    1670              :                 oldestPage;
    1671              :     TransactionId newestXact,
    1672              :                 oldestXact;
    1673              : 
    1674              :     /* This must be called after the Slru has been initialized */
    1675              :     Assert(ctl->options.PagePrecedes);
    1676              : 
    1677              :     /*
    1678              :      * Compare an XID pair having undefined order (see RFC 1982), a pair at
    1679              :      * "opposite ends" of the XID space.  TransactionIdPrecedes() treats each
    1680              :      * as preceding the other.  If RHS is oldestXact, LHS is the first XID we
    1681              :      * must not assign.
    1682              :      */
    1683              :     lhs = per_page + offset;    /* skip first page to avoid non-normal XIDs */
    1684              :     rhs = lhs + (1U << 31);
    1685              :     Assert(TransactionIdPrecedes(lhs, rhs));
    1686              :     Assert(TransactionIdPrecedes(rhs, lhs));
    1687              :     Assert(!TransactionIdPrecedes(lhs - 1, rhs));
    1688              :     Assert(TransactionIdPrecedes(rhs, lhs - 1));
    1689              :     Assert(TransactionIdPrecedes(lhs + 1, rhs));
    1690              :     Assert(!TransactionIdPrecedes(rhs, lhs + 1));
    1691              :     Assert(!TransactionIdFollowsOrEquals(lhs, rhs));
    1692              :     Assert(!TransactionIdFollowsOrEquals(rhs, lhs));
    1693              :     Assert(!ctl->options.PagePrecedes(lhs / per_page, lhs / per_page));
    1694              :     Assert(!ctl->options.PagePrecedes(lhs / per_page, rhs / per_page));
    1695              :     Assert(!ctl->options.PagePrecedes(rhs / per_page, lhs / per_page));
    1696              :     Assert(!ctl->options.PagePrecedes((lhs - per_page) / per_page, rhs / per_page));
    1697              :     Assert(ctl->options.PagePrecedes(rhs / per_page, (lhs - 3 * per_page) / per_page));
    1698              :     Assert(ctl->options.PagePrecedes(rhs / per_page, (lhs - 2 * per_page) / per_page));
    1699              :     Assert(ctl->options.PagePrecedes(rhs / per_page, (lhs - 1 * per_page) / per_page)
    1700              :            || (1U << 31) % per_page != 0);    /* See CommitTsPagePrecedes() */
    1701              :     Assert(ctl->options.PagePrecedes((lhs + 1 * per_page) / per_page, rhs / per_page)
    1702              :            || (1U << 31) % per_page != 0);
    1703              :     Assert(ctl->options.PagePrecedes((lhs + 2 * per_page) / per_page, rhs / per_page));
    1704              :     Assert(ctl->options.PagePrecedes((lhs + 3 * per_page) / per_page, rhs / per_page));
    1705              :     Assert(!ctl->options.PagePrecedes(rhs / per_page, (lhs + per_page) / per_page));
    1706              : 
    1707              :     /*
    1708              :      * GetNewTransactionId() has assigned the last XID it can safely use, and
    1709              :      * that XID is in the *LAST* page of the second segment.  We must not
    1710              :      * delete that segment.
    1711              :      */
    1712              :     newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1;
    1713              :     newestXact = newestPage * per_page + offset;
    1714              :     Assert(newestXact / per_page == newestPage);
    1715              :     oldestXact = newestXact + 1;
    1716              :     oldestXact -= 1U << 31;
    1717              :     oldestPage = oldestXact / per_page;
    1718              :     Assert(!SlruMayDeleteSegment(ctl,
    1719              :                                  (newestPage -
    1720              :                                   newestPage % SLRU_PAGES_PER_SEGMENT),
    1721              :                                  oldestPage));
    1722              : 
    1723              :     /*
    1724              :      * GetNewTransactionId() has assigned the last XID it can safely use, and
    1725              :      * that XID is in the *FIRST* page of the second segment.  We must not
    1726              :      * delete that segment.
    1727              :      */
    1728              :     newestPage = SLRU_PAGES_PER_SEGMENT;
    1729              :     newestXact = newestPage * per_page + offset;
    1730              :     Assert(newestXact / per_page == newestPage);
    1731              :     oldestXact = newestXact + 1;
    1732              :     oldestXact -= 1U << 31;
    1733              :     oldestPage = oldestXact / per_page;
    1734              :     Assert(!SlruMayDeleteSegment(ctl,
    1735              :                                  (newestPage -
    1736              :                                   newestPage % SLRU_PAGES_PER_SEGMENT),
    1737              :                                  oldestPage));
    1738              : }
    1739              : 
    1740              : /*
    1741              :  * Unit-test a PagePrecedes function.
    1742              :  *
    1743              :  * This assumes every uint32 >= FirstNormalTransactionId is a valid key.  It
    1744              :  * assumes each value occupies a contiguous, fixed-size region of SLRU bytes.
    1745              :  * (MultiXactMemberCtl separates flags from XIDs.  NotifyCtl has
    1746              :  * variable-length entries, no keys, and no random access.  These unit tests
    1747              :  * do not apply to them.)
    1748              :  */
    1749              : void
    1750              : SlruPagePrecedesUnitTests(SlruDesc *ctl, int per_page)
    1751              : {
    1752              :     /* Test first, middle and last entries of a page. */
    1753              :     SlruPagePrecedesTestOffset(ctl, per_page, 0);
    1754              :     SlruPagePrecedesTestOffset(ctl, per_page, per_page / 2);
    1755              :     SlruPagePrecedesTestOffset(ctl, per_page, per_page - 1);
    1756              : }
    1757              : #endif
    1758              : 
    1759              : /*
    1760              :  * SlruScanDirectory callback
    1761              :  *      This callback reports true if there's any segment wholly prior to the
    1762              :  *      one containing the page passed as "data".
    1763              :  */
    1764              : bool
    1765       826533 : SlruScanDirCbReportPresence(SlruDesc *ctl, char *filename, int64 segpage,
    1766              :                             void *data)
    1767              : {
    1768       826533 :     int64       cutoffPage = *(int64 *) data;
    1769              : 
    1770       826533 :     if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
    1771          103 :         return true;            /* found one; don't iterate any more */
    1772              : 
    1773       826430 :     return false;               /* keep going */
    1774              : }
    1775              : 
    1776              : /*
    1777              :  * SlruScanDirectory callback.
    1778              :  *      This callback deletes segments wholly prior to the page passed as "data".
    1779              :  */
    1780              : static bool
    1781       211539 : SlruScanDirCbDeleteCutoff(SlruDesc *ctl, char *filename, int64 segpage,
    1782              :                           void *data)
    1783              : {
    1784       211539 :     int64       cutoffPage = *(int64 *) data;
    1785              : 
    1786       211539 :     if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
    1787       141993 :         SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT);
    1788              : 
    1789       211539 :     return false;               /* keep going */
    1790              : }
    1791              : 
    1792              : /*
    1793              :  * SlruScanDirectory callback.
    1794              :  *      This callback deletes all segments.
    1795              :  */
    1796              : bool
    1797            8 : SlruScanDirCbDeleteAll(SlruDesc *ctl, char *filename, int64 segpage, void *data)
    1798              : {
    1799            8 :     SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT);
    1800              : 
    1801            8 :     return false;               /* keep going */
    1802              : }
    1803              : 
    1804              : /*
    1805              :  * An internal function used by SlruScanDirectory().
    1806              :  *
    1807              :  * Returns true if a file with a name of a given length may be a correct
    1808              :  * SLRU segment.
    1809              :  */
    1810              : static inline bool
    1811      1050810 : SlruCorrectSegmentFilenameLength(SlruDesc *ctl, size_t len)
    1812              : {
    1813      1050810 :     if (ctl->options.long_segment_names)
    1814         2485 :         return (len == 15);     /* see SlruFileName() */
    1815              :     else
    1816              : 
    1817              :         /*
    1818              :          * Commit 638cf09e76d allowed 5-character lengths. Later commit
    1819              :          * 73c986adde5 allowed 6-character length.
    1820              :          *
    1821              :          * Note: There is an ongoing plan to migrate all SLRUs to 64-bit page
    1822              :          * numbers, and the corresponding 15-character file names, which may
    1823              :          * eventually deprecate the support for 4, 5, and 6-character names.
    1824              :          */
    1825      1048325 :         return (len == 4 || len == 5 || len == 6);
    1826              : }
    1827              : 
    1828              : /*
    1829              :  * Scan the SimpleLru directory and apply a callback to each file found in it.
    1830              :  *
    1831              :  * If the callback returns true, the scan is stopped.  The last return value
    1832              :  * from the callback is returned.
    1833              :  *
    1834              :  * The callback receives the following arguments: 1. the SlruDesc for the
    1835              :  * SLRU being scanned; 2. the filename being considered; 3. the page number
    1836              :  * for the first page of that file; 4. a pointer to the opaque data given to us
    1837              :  * by the caller.
    1838              :  *
    1839              :  * Note that the ordering in which the directory is scanned is not guaranteed.
    1840              :  *
    1841              :  * Note that no locking is applied.
    1842              :  */
    1843              : bool
    1844         6434 : SlruScanDirectory(SlruDesc *ctl, SlruScanCallback callback, void *data)
    1845              : {
    1846         6434 :     bool        retval = false;
    1847              :     DIR        *cldir;
    1848              :     struct dirent *clde;
    1849              :     int64       segno;
    1850              :     int64       segpage;
    1851              : 
    1852         6434 :     cldir = AllocateDir(ctl->options.Dir);
    1853      1057141 :     while ((clde = ReadDir(cldir, ctl->options.Dir)) != NULL)
    1854              :     {
    1855              :         size_t      len;
    1856              : 
    1857      1050810 :         len = strlen(clde->d_name);
    1858              : 
    1859      1050810 :         if (SlruCorrectSegmentFilenameLength(ctl, len) &&
    1860      1038080 :             strspn(clde->d_name, "0123456789ABCDEF") == len)
    1861              :         {
    1862      1038080 :             segno = strtoi64(clde->d_name, NULL, 16);
    1863      1038080 :             segpage = segno * SLRU_PAGES_PER_SEGMENT;
    1864              : 
    1865      1038080 :             elog(DEBUG2, "SlruScanDirectory invoking callback on %s/%s",
    1866              :                  ctl->options.Dir, clde->d_name);
    1867      1038080 :             retval = callback(ctl, clde->d_name, segpage, data);
    1868      1038080 :             if (retval)
    1869          103 :                 break;
    1870              :         }
    1871              :     }
    1872         6434 :     FreeDir(cldir);
    1873              : 
    1874         6434 :     return retval;
    1875              : }
    1876              : 
    1877              : /*
    1878              :  * Individual SLRUs (clog, ...) have to provide a sync.c handler function so
    1879              :  * that they can provide the correct "SlruDesc" (otherwise we don't know how to
    1880              :  * build the path), but they just forward to this common implementation that
    1881              :  * performs the fsync.
    1882              :  */
    1883              : int
    1884            2 : SlruSyncFileTag(SlruDesc *ctl, const FileTag *ftag, char *path)
    1885              : {
    1886              :     int         fd;
    1887              :     int         save_errno;
    1888              :     int         result;
    1889              : 
    1890            2 :     SlruFileName(ctl, path, ftag->segno);
    1891              : 
    1892            2 :     fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
    1893            2 :     if (fd < 0)
    1894            0 :         return -1;
    1895              : 
    1896            2 :     pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC);
    1897            2 :     result = pg_fsync(fd);
    1898            2 :     pgstat_report_wait_end();
    1899            2 :     save_errno = errno;
    1900              : 
    1901            2 :     CloseTransientFile(fd);
    1902              : 
    1903            2 :     errno = save_errno;
    1904            2 :     return result;
    1905              : }
        

Generated by: LCOV version 2.0-1