LCOV - code coverage report
Current view: top level - src/backend/access/transam - slru.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 79.4 % 534 424
Test Date: 2026-03-01 17:14:43 Functions: 96.7 % 30 29
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * slru.c
       4              :  *      Simple LRU buffering for wrap-around-able permanent metadata
       5              :  *
       6              :  * This module is used to maintain various pieces of transaction status
       7              :  * indexed by TransactionId (such as commit status, parent transaction ID,
       8              :  * commit timestamp), as well as storage for multixacts, serializable
       9              :  * isolation locks and NOTIFY traffic.  Extensions can define their own
      10              :  * SLRUs, too.
      11              :  *
      12              :  * Under ordinary circumstances we expect that write traffic will occur
      13              :  * mostly to the latest page (and to the just-prior page, soon after a
      14              :  * page transition).  Read traffic will probably touch a larger span of
      15              :  * pages, but a relatively small number of buffers should be sufficient.
      16              :  *
      17              :  * We use a simple least-recently-used scheme to manage a pool of shared
      18              :  * page buffers, split in banks by the lowest bits of the page number, and
      19              :  * the management algorithm only processes the bank to which the desired
      20              :  * page belongs, so a linear search is sufficient; there's no need for a
      21              :  * hashtable or anything fancy.  The algorithm is straight LRU except that
      22              :  * we will never swap out the latest page (since we know it's going to be
      23              :  * hit again eventually).
      24              :  *
      25              :  * We use per-bank control LWLocks to protect the shared data structures,
      26              :  * plus per-buffer LWLocks that synchronize I/O for each buffer.  The
      27              :  * bank's control lock must be held to examine or modify any of the bank's
      28              :  * shared state.  A process that is reading in or writing out a page
      29              :  * buffer does not hold the control lock, only the per-buffer lock for the
      30              :  * buffer it is working on.  One exception is latest_page_number, which is
      31              :  * read and written using atomic ops.
      32              :  *
      33              :  * "Holding the bank control lock" means exclusive lock in all cases
      34              :  * except for SimpleLruReadPage_ReadOnly(); see comments for
      35              :  * SlruRecentlyUsed() for the implications of that.
      36              :  *
      37              :  * When initiating I/O on a buffer, we acquire the per-buffer lock exclusively
      38              :  * before releasing the control lock.  The per-buffer lock is released after
      39              :  * completing the I/O, re-acquiring the control lock, and updating the shared
      40              :  * state.  (Deadlock is not possible here, because we never try to initiate
      41              :  * I/O when someone else is already doing I/O on the same buffer.)
      42              :  * To wait for I/O to complete, release the control lock, acquire the
      43              :  * per-buffer lock in shared mode, immediately release the per-buffer lock,
      44              :  * reacquire the control lock, and then recheck state (since arbitrary things
      45              :  * could have happened while we didn't have the lock).
      46              :  *
      47              :  * As with the regular buffer manager, it is possible for another process
      48              :  * to re-dirty a page that is currently being written out.  This is handled
      49              :  * by re-setting the page's page_dirty flag.
      50              :  *
      51              :  *
      52              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      53              :  * Portions Copyright (c) 1994, Regents of the University of California
      54              :  *
      55              :  * src/backend/access/transam/slru.c
      56              :  *
      57              :  *-------------------------------------------------------------------------
      58              :  */
      59              : #include "postgres.h"
      60              : 
      61              : #include <fcntl.h>
      62              : #include <sys/stat.h>
      63              : #include <unistd.h>
      64              : 
      65              : #include "access/slru.h"
      66              : #include "access/transam.h"
      67              : #include "access/xlog.h"
      68              : #include "access/xlogutils.h"
      69              : #include "miscadmin.h"
      70              : #include "pgstat.h"
      71              : #include "storage/fd.h"
      72              : #include "storage/shmem.h"
      73              : #include "utils/guc.h"
      74              : 
      75              : /*
      76              :  * Converts segment number to the filename of the segment.
      77              :  *
      78              :  * "path" should point to a buffer at least MAXPGPATH characters long.
      79              :  *
      80              :  * If ctl->long_segment_names is true, segno can be in the range [0, 2^60-1].
      81              :  * The resulting file name is made of 15 characters, e.g. dir/123456789ABCDEF.
      82              :  *
      83              :  * If ctl->long_segment_names is false, segno can be in the range [0, 2^24-1].
      84              :  * The resulting file name is made of 4 to 6 characters, as follows:
      85              :  *
      86              :  *  dir/1234   for [0, 2^16-1]
      87              :  *  dir/12345  for [2^16, 2^20-1]
      88              :  *  dir/123456 for [2^20, 2^24-1]
      89              :  */
      90              : static inline int
      91      7505932 : SlruFileName(SlruCtl ctl, char *path, int64 segno)
      92              : {
      93      7505932 :     if (ctl->long_segment_names)
      94              :     {
      95              :         /*
      96              :          * We could use 16 characters here but the disadvantage would be that
      97              :          * the SLRU segments will be hard to distinguish from WAL segments.
      98              :          *
      99              :          * For this reason we use 15 characters. It is enough but also means
     100              :          * that in the future we can't decrease SLRU_PAGES_PER_SEGMENT easily.
     101              :          */
     102              :         Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFFFFFFFFFFF));
     103        16610 :         return snprintf(path, MAXPGPATH, "%s/%015" PRIX64, ctl->Dir, segno);
     104              :     }
     105              :     else
     106              :     {
     107              :         /*
     108              :          * Although the %04X format string is used, segment numbers of up
     109              :          * to 24 bits are allowed.  See SlruCorrectSegmentFilenameLength().
     110              :          */
     111              :         Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFF));
     112      7489322 :         return snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir,
     113              :                         (unsigned int) segno);
     114              :     }
     115              : }
     116              : 
     117              : /*
     118              :  * During SimpleLruWriteAll(), we will usually not need to write more than one
     119              :  * or two physical files, but we may need to write several pages per file.  We
     120              :  * can consolidate the I/O requests by leaving files open until control returns
     121              :  * to SimpleLruWriteAll().  This data structure remembers which files are open.
     122              :  */
     123              : #define MAX_WRITEALL_BUFFERS    16
     124              : 
     125              : typedef struct SlruWriteAllData
     126              : {
     127              :     int         num_files;      /* # files actually open */
     128              :     int         fd[MAX_WRITEALL_BUFFERS];   /* their FD's */
     129              :     int64       segno[MAX_WRITEALL_BUFFERS];    /* their log seg#s */
     130              : } SlruWriteAllData;
     131              : 
     132              : typedef struct SlruWriteAllData *SlruWriteAll;
     133              : 
     134              : 
     135              : /*
     136              :  * Bank size for the slot array.  Pages are assigned a bank according to their
     137              :  * page number, with each bank being this size.  We want a power of 2 so that
     138              :  * we can determine the bank number for a page with just bit shifting; we also
     139              :  * want to keep the bank size small so that LRU victim search is fast.  16
     140              :  * buffers per bank seems a good number.
     141              :  */
     142              : #define SLRU_BANK_BITSHIFT      4
     143              : #define SLRU_BANK_SIZE          (1 << SLRU_BANK_BITSHIFT)
     144              : 
     145              : /*
     146              :  * Macro to get the bank number to which the slot belongs.
     147              :  */
     148              : #define SlotGetBankNumber(slotno)   ((slotno) >> SLRU_BANK_BITSHIFT)
     149              : 
     150              : 
     151              : /*
     152              :  * Populate a file tag describing a segment file.  We only use the segment
     153              :  * number, since we can derive everything else we need by having separate
     154              :  * sync handler functions for clog, multixact etc.
     155              :  */
     156              : #define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \
     157              : ( \
     158              :     memset(&(a), 0, sizeof(FileTag)), \
     159              :     (a).handler = (xx_handler), \
     160              :     (a).segno = (xx_segno) \
     161              : )
     162              : 
     163              : /* Saved info for SlruReportIOError */
     164              : typedef enum
     165              : {
     166              :     SLRU_OPEN_FAILED,
     167              :     SLRU_SEEK_FAILED,
     168              :     SLRU_READ_FAILED,
     169              :     SLRU_WRITE_FAILED,
     170              :     SLRU_FSYNC_FAILED,
     171              :     SLRU_CLOSE_FAILED,
     172              : } SlruErrorCause;
     173              : 
     174              : static SlruErrorCause slru_errcause;
     175              : static int  slru_errno;
     176              : 
     177              : 
     178              : static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
     179              : static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
     180              : static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata);
     181              : static bool SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno);
     182              : static bool SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno,
     183              :                                   SlruWriteAll fdata);
     184              : static void SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid);
     185              : static int  SlruSelectLRUPage(SlruCtl ctl, int64 pageno);
     186              : 
     187              : static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename,
     188              :                                       int64 segpage, void *data);
     189              : static void SlruInternalDeleteSegment(SlruCtl ctl, int64 segno);
     190              : static inline void SlruRecentlyUsed(SlruShared shared, int slotno);
     191              : 
     192              : 
     193              : /*
     194              :  * Initialization of shared memory
     195              :  */
     196              : 
     197              : Size
     198        23087 : SimpleLruShmemSize(int nslots, int nlsns)
     199              : {
     200        23087 :     int         nbanks = nslots / SLRU_BANK_SIZE;
     201              :     Size        sz;
     202              : 
     203              :     Assert(nslots <= SLRU_MAX_ALLOWED_BUFFERS);
     204              :     Assert(nslots % SLRU_BANK_SIZE == 0);
     205              : 
     206              :     /* we assume nslots isn't so large as to risk overflow */
     207        23087 :     sz = MAXALIGN(sizeof(SlruSharedData));
     208        23087 :     sz += MAXALIGN(nslots * sizeof(char *));    /* page_buffer[] */
     209        23087 :     sz += MAXALIGN(nslots * sizeof(SlruPageStatus));    /* page_status[] */
     210        23087 :     sz += MAXALIGN(nslots * sizeof(bool));  /* page_dirty[] */
     211        23087 :     sz += MAXALIGN(nslots * sizeof(int64)); /* page_number[] */
     212        23087 :     sz += MAXALIGN(nslots * sizeof(int));   /* page_lru_count[] */
     213        23087 :     sz += MAXALIGN(nslots * sizeof(LWLockPadded));  /* buffer_locks[] */
     214        23087 :     sz += MAXALIGN(nbanks * sizeof(LWLockPadded));  /* bank_locks[] */
     215        23087 :     sz += MAXALIGN(nbanks * sizeof(int));   /* bank_cur_lru_count[] */
     216              : 
     217        23087 :     if (nlsns > 0)
     218         3297 :         sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));    /* group_lsn[] */
     219              : 
     220        23087 :     return BUFFERALIGN(sz) + BLCKSZ * nslots;
     221              : }
     222              : 
     223              : /*
     224              :  * Determine the number of SLRU buffers to use.
     225              :  *
     226              :  * We simply divide shared_buffers by the given divisor, round the result
     227              :  * down to a multiple of SLRU_BANK_SIZE, and clamp it to at least
     228              :  * SLRU_BANK_SIZE and at most the given maximum (itself rounded down).
     229              :  */
     230              : int
     231         9840 : SimpleLruAutotuneBuffers(int divisor, int max)
     232              : {
     233         9840 :     return Min(max - (max % SLRU_BANK_SIZE),
     234              :                Max(SLRU_BANK_SIZE,
     235              :                    NBuffers / divisor - (NBuffers / divisor) % SLRU_BANK_SIZE));
     236              : }
     237              : 
     238              : /*
     239              :  * Initialize, or attach to, a simple LRU cache in shared memory.
     240              :  *
     241              :  * ctl: address of local (unshared) control structure.
     242              :  * name: name of SLRU.  (This is user-visible, pick with care!)
     243              :  * nslots: number of page slots to use.
     244              :  * nlsns: number of LSN groups per page (set to zero if not relevant).
     245              :  * subdir: PGDATA-relative subdirectory that will contain the files.
     246              :  * buffer_tranche_id: tranche ID to use for the SLRU's per-buffer LWLocks.
     247              :  * bank_tranche_id: tranche ID to use for the bank LWLocks.
     248              :  * sync_handler: which set of functions to use to handle sync requests
     249              :  * long_segment_names: true to use long segment file names, false for short
     250              :  */
     251              : void
     252         8054 : SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
     253              :               const char *subdir, int buffer_tranche_id, int bank_tranche_id,
     254              :               SyncRequestHandler sync_handler, bool long_segment_names)
     255              : {
     256              :     SlruShared  shared;
     257              :     bool        found;
     258         8054 :     int         nbanks = nslots / SLRU_BANK_SIZE;
     259              : 
     260              :     Assert(nslots <= SLRU_MAX_ALLOWED_BUFFERS);
     261              : 
     262         8054 :     shared = (SlruShared) ShmemInitStruct(name,
     263              :                                           SimpleLruShmemSize(nslots, nlsns),
     264              :                                           &found);
     265              : 
     266         8054 :     if (!IsUnderPostmaster)
     267              :     {
     268              :         /* Initialize locks and shared memory area */
     269              :         char       *ptr;
     270              :         Size        offset;
     271              : 
     272              :         Assert(!found);
     273              : 
     274         8054 :         memset(shared, 0, sizeof(SlruSharedData));
     275              : 
     276         8054 :         shared->num_slots = nslots;
     277         8054 :         shared->lsn_groups_per_page = nlsns;
     278              : 
     279         8054 :         pg_atomic_init_u64(&shared->latest_page_number, 0);
     280              : 
     281         8054 :         shared->slru_stats_idx = pgstat_get_slru_index(name);
     282              : 
     283         8054 :         ptr = (char *) shared;
     284         8054 :         offset = MAXALIGN(sizeof(SlruSharedData));
     285         8054 :         shared->page_buffer = (char **) (ptr + offset);
     286         8054 :         offset += MAXALIGN(nslots * sizeof(char *));
     287         8054 :         shared->page_status = (SlruPageStatus *) (ptr + offset);
     288         8054 :         offset += MAXALIGN(nslots * sizeof(SlruPageStatus));
     289         8054 :         shared->page_dirty = (bool *) (ptr + offset);
     290         8054 :         offset += MAXALIGN(nslots * sizeof(bool));
     291         8054 :         shared->page_number = (int64 *) (ptr + offset);
     292         8054 :         offset += MAXALIGN(nslots * sizeof(int64));
     293         8054 :         shared->page_lru_count = (int *) (ptr + offset);
     294         8054 :         offset += MAXALIGN(nslots * sizeof(int));
     295              : 
     296              :         /* Initialize LWLocks */
     297         8054 :         shared->buffer_locks = (LWLockPadded *) (ptr + offset);
     298         8054 :         offset += MAXALIGN(nslots * sizeof(LWLockPadded));
     299         8054 :         shared->bank_locks = (LWLockPadded *) (ptr + offset);
     300         8054 :         offset += MAXALIGN(nbanks * sizeof(LWLockPadded));
     301         8054 :         shared->bank_cur_lru_count = (int *) (ptr + offset);
     302         8054 :         offset += MAXALIGN(nbanks * sizeof(int));
     303              : 
     304         8054 :         if (nlsns > 0)
     305              :         {
     306         1150 :             shared->group_lsn = (XLogRecPtr *) (ptr + offset);
     307         1150 :             offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
     308              :         }
     309              : 
     310         8054 :         ptr += BUFFERALIGN(offset);
     311       205030 :         for (int slotno = 0; slotno < nslots; slotno++)
     312              :         {
     313       196976 :             LWLockInitialize(&shared->buffer_locks[slotno].lock,
     314              :                              buffer_tranche_id);
     315              : 
     316       196976 :             shared->page_buffer[slotno] = ptr;
     317       196976 :             shared->page_status[slotno] = SLRU_PAGE_EMPTY;
     318       196976 :             shared->page_dirty[slotno] = false;
     319       196976 :             shared->page_lru_count[slotno] = 0;
     320       196976 :             ptr += BLCKSZ;
     321              :         }
     322              : 
     323              :         /* Initialize the slot banks. */
     324        20365 :         for (int bankno = 0; bankno < nbanks; bankno++)
     325              :         {
     326        12311 :             LWLockInitialize(&shared->bank_locks[bankno].lock, bank_tranche_id);
     327        12311 :             shared->bank_cur_lru_count[bankno] = 0;
     328              :         }
     329              : 
     330              :         /* Should fit within the estimated shmem size */
     331              :         Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
     332              :     }
     333              :     else
     334              :     {
     335              :         Assert(found);
     336              :         Assert(shared->num_slots == nslots);
     337              :     }
     338              : 
     339              :     /*
     340              :      * Initialize the unshared control struct, including directory path. We
     341              :      * assume caller set PagePrecedes.
     342              :      */
     343         8054 :     ctl->shared = shared;
     344         8054 :     ctl->sync_handler = sync_handler;
     345         8054 :     ctl->long_segment_names = long_segment_names;
     346         8054 :     ctl->nbanks = nbanks;
     347         8054 :     strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir));
     348         8054 : }
     349              : 
     350              : /*
     351              :  * Helper function for GUC check_hook to check whether slru buffers are in
     352              :  * multiples of SLRU_BANK_SIZE.
     353              :  */
     354              : bool
     355        11769 : check_slru_buffers(const char *name, int *newval)
     356              : {
     357              :     /* Valid values are multiples of SLRU_BANK_SIZE */
     358        11769 :     if (*newval % SLRU_BANK_SIZE == 0)
     359        11769 :         return true;
     360              : 
     361            0 :     GUC_check_errdetail("\"%s\" must be a multiple of %d.", name,
     362              :                         SLRU_BANK_SIZE);
     363            0 :     return false;
     364              : }
     365              : 
     366              : /*
     367              :  * Initialize (or reinitialize) a page to zeroes.
     368              :  *
     369              :  * The page is not actually written, just set up in shared memory.
     370              :  * The slot number of the new page is returned.
     371              :  *
     372              :  * Bank lock must be held at entry, and will be held at exit.
     373              :  */
     374              : int
     375      7345822 : SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
     376              : {
     377      7345822 :     SlruShared  shared = ctl->shared;
     378              :     int         slotno;
     379              : 
     380              :     Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(ctl, pageno), LW_EXCLUSIVE));
     381              : 
     382              :     /* Find a suitable buffer slot for the page */
     383      7345822 :     slotno = SlruSelectLRUPage(ctl, pageno);
     384              :     Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
     385              :            (shared->page_status[slotno] == SLRU_PAGE_VALID &&
     386              :             !shared->page_dirty[slotno]) ||
     387              :            shared->page_number[slotno] == pageno);
     388              : 
     389              :     /* Mark the slot as containing this page */
     390      7345822 :     shared->page_number[slotno] = pageno;
     391      7345822 :     shared->page_status[slotno] = SLRU_PAGE_VALID;
     392      7345822 :     shared->page_dirty[slotno] = true;
     393      7345822 :     SlruRecentlyUsed(shared, slotno);
     394              : 
     395              :     /* Set the buffer to zeroes */
     396      7345822 :     MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
     397              : 
     398              :     /* Set the LSNs for this new page to zero */
     399      7345822 :     SimpleLruZeroLSNs(ctl, slotno);
     400              : 
     401              :     /*
     402              :      * Assume this page is now the latest active page.
     403              :      *
     404              :      * Note that because both this routine and SlruSelectLRUPage run with a
     405              :      * SLRU bank lock held, it is not possible for this to be zeroing a page
     406              :      * that SlruSelectLRUPage is going to evict simultaneously.  Therefore,
     407              :      * there's no memory barrier here.
     408              :      */
     409      7345822 :     pg_atomic_write_u64(&shared->latest_page_number, pageno);
     410              : 
     411              :     /* update the stats counter of zeroed pages */
     412      7345822 :     pgstat_count_slru_blocks_zeroed(shared->slru_stats_idx);
     413              : 
     414      7345822 :     return slotno;
     415              : }
     416              : 
     417              : /*
     418              :  * Zero all the LSNs we store for this slru page.
     419              :  *
     420              :  * This should be called each time we create a new page, and each time we read
     421              :  * in a page from disk into an existing buffer.  (Such an old page cannot
     422              :  * have any interesting LSNs, since we'd have flushed them before writing
     423              :  * the page in the first place.)
     424              :  *
     425              :  * This assumes that InvalidXLogRecPtr is bitwise-all-0.
     426              :  */
     427              : static void
     428      7363368 : SimpleLruZeroLSNs(SlruCtl ctl, int slotno)
     429              : {
     430      7363368 :     SlruShared  shared = ctl->shared;
     431              : 
     432      7363368 :     if (shared->lsn_groups_per_page > 0)
     433       433220 :         MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0,
     434              :                shared->lsn_groups_per_page * sizeof(XLogRecPtr));
     435      7363368 : }
     436              : 
     437              : /*
     438              :  * This is a convenience wrapper for the common case of zeroing a page and
     439              :  * immediately flushing it to disk.
     440              :  *
     441              :  * SLRU bank lock is acquired and released here.
     442              :  */
     443              : void
     444          218 : SimpleLruZeroAndWritePage(SlruCtl ctl, int64 pageno)
     445              : {
     446              :     int         slotno;
     447              :     LWLock     *lock;
     448              : 
     449          218 :     lock = SimpleLruGetBankLock(ctl, pageno);
     450          218 :     LWLockAcquire(lock, LW_EXCLUSIVE);
     451              : 
     452              :     /* Create and zero the page */
     453          218 :     slotno = SimpleLruZeroPage(ctl, pageno);
     454              : 
     455              :     /* Make sure it's written out */
     456          218 :     SimpleLruWritePage(ctl, slotno);
     457              :     Assert(!ctl->shared->page_dirty[slotno]);
     458              : 
     459          218 :     LWLockRelease(lock);
     460          218 : }
     461              : 
     462              : /*
     463              :  * Wait for any active I/O on a page slot to finish.  (This does not
     464              :  * guarantee that new I/O hasn't been started before we return, though.
     465              :  * In fact the slot might not even contain the same page anymore.)
     466              :  *
     467              :  * Bank lock must be held at entry, and will be held at exit.
     468              :  */
     469              : static void
     470           10 : SimpleLruWaitIO(SlruCtl ctl, int slotno)
     471              : {
     472           10 :     SlruShared  shared = ctl->shared;
     473           10 :     int         bankno = SlotGetBankNumber(slotno);
     474              : 
     475              :     Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
     476              : 
     477              :     /* See notes at top of file */
     478           10 :     LWLockRelease(&shared->bank_locks[bankno].lock);
     479           10 :     LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED);
     480           10 :     LWLockRelease(&shared->buffer_locks[slotno].lock);
     481           10 :     LWLockAcquire(&shared->bank_locks[bankno].lock, LW_EXCLUSIVE);
     482              : 
     483              :     /*
     484              :      * If the slot is still in an io-in-progress state, then either someone
     485              :      * already started a new I/O on the slot, or a previous I/O failed and
     486              :      * neglected to reset the page state.  That shouldn't happen, really, but
     487              :      * it seems worth a few extra cycles to check and recover from it. We can
     488              :      * cheaply test for failure by seeing if the buffer lock is still held (we
     489              :      * assume that transaction abort would release the lock).
     490              :      */
     491           10 :     if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
     492           10 :         shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS)
     493              :     {
     494            0 :         if (LWLockConditionalAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED))
     495              :         {
     496              :             /* indeed, the I/O must have failed */
     497            0 :             if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS)
     498            0 :                 shared->page_status[slotno] = SLRU_PAGE_EMPTY;
     499              :             else                /* write_in_progress */
     500              :             {
     501            0 :                 shared->page_status[slotno] = SLRU_PAGE_VALID;
     502            0 :                 shared->page_dirty[slotno] = true;
     503              :             }
     504            0 :             LWLockRelease(&shared->buffer_locks[slotno].lock);
     505              :         }
     506              :     }
     507           10 : }
     508              : 
     509              : /*
     510              :  * Find a page in a shared buffer, reading it in if necessary.
     511              :  * The page number must correspond to an already-initialized page.
     512              :  *
     513              :  * If write_ok is true then it is OK to return a page that is in
     514              :  * WRITE_IN_PROGRESS state; it is the caller's responsibility to be sure
     515              :  * that modification of the page is safe.  If write_ok is false then we
     516              :  * will not return the page until it is not undergoing active I/O.
     517              :  *
     518              :  * The passed-in xid is used only for error reporting, and may be
     519              :  * InvalidTransactionId if no specific xid is associated with the action.
     520              :  *
     521              :  * Return value is the shared-buffer slot number now holding the page.
     522              :  * The buffer's LRU access info is updated.
     523              :  *
     524              :  * The correct bank lock must be held in exclusive mode at entry, and will
                      :  * be held at exit.  It is released for the duration of the physical read;
                      :  * only the slot's per-buffer lock is held across the I/O itself.
                      :  *
                      :  * If the read fails, the slot is set back to EMPTY and the error is raised
                      :  * through SlruReportIOError once the per-buffer lock has been released.
     525              :  */
     526              : int
     527       365148 : SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok,
     528              :                   TransactionId xid)
     529              : {
     530       365148 :     SlruShared  shared = ctl->shared;
     531       365148 :     LWLock     *banklock = SimpleLruGetBankLock(ctl, pageno);
     532              : 
     533              :     Assert(LWLockHeldByMeInMode(banklock, LW_EXCLUSIVE));
     534              : 
     535              :     /* Outer loop handles restart if we must wait for someone else's I/O */
     536              :     for (;;)
     537            2 :     {
     538              :         int         slotno;
     539              :         bool        ok;
     540              : 
     541              :         /* See if page already is in memory; if not, pick victim slot */
     542       365150 :         slotno = SlruSelectLRUPage(ctl, pageno);
     543              : 
     544              :         /* Did we find the page in memory? */
     545       365150 :         if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
     546       364090 :             shared->page_number[slotno] == pageno)
     547              :         {
     548              :             /*
     549              :              * If page is still being read in, we must wait for I/O.  Likewise
     550              :              * if the page is being written and the caller said that's not OK.
     551              :              */
     552       347604 :             if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
     553       347604 :                 (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
     554            3 :                  !write_ok))
     555              :             {
     556            2 :                 SimpleLruWaitIO(ctl, slotno);
     557              :                 /* Now we must recheck state from the top */
     558            2 :                 continue;
     559              :             }
     560              :             /* Otherwise, it's ready to use */
     561       347602 :             SlruRecentlyUsed(shared, slotno);
     562              : 
     563              :             /* update the stats counter of pages found in the SLRU */
     564       347602 :             pgstat_count_slru_blocks_hit(shared->slru_stats_idx);
     565              : 
     566       347602 :             return slotno;
     567              :         }
     568              : 
     569              :         /* We found no match; assert we selected a freeable slot */
     570              :         Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
     571              :                (shared->page_status[slotno] == SLRU_PAGE_VALID &&
     572              :                 !shared->page_dirty[slotno]));
     573              : 
     574              :         /* Mark the slot read-busy; lookups of this page will now wait for us */
     575        17546 :         shared->page_number[slotno] = pageno;
     576        17546 :         shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS;
     577        17546 :         shared->page_dirty[slotno] = false;
     578              : 
     579              :         /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
     580        17546 :         LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
     581              : 
     582              :         /* Release bank lock while doing I/O */
     583        17546 :         LWLockRelease(banklock);
     584              : 
     585              :         /* Do the read; a failure is only recorded here, not yet reported */
     586        17546 :         ok = SlruPhysicalReadPage(ctl, pageno, slotno);
     587              : 
     588              :         /* Set the LSNs for this newly read-in page to zero */
     589        17546 :         SimpleLruZeroLSNs(ctl, slotno);
     590              : 
     591              :         /* Re-acquire bank control lock and update page state */
     592        17546 :         LWLockAcquire(banklock, LW_EXCLUSIVE);
     593              : 
     594              :         Assert(shared->page_number[slotno] == pageno &&
     595              :                shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS &&
     596              :                !shared->page_dirty[slotno]);
     597              : 
     598        17546 :         shared->page_status[slotno] = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY;
     599              : 
     600        17546 :         LWLockRelease(&shared->buffer_locks[slotno].lock);
     601              : 
     602              :         /* Now it's okay to ereport if we failed: shared state is cleaned up */
     603        17546 :         if (!ok)
     604            0 :             SlruReportIOError(ctl, pageno, xid);
     605              : 
     606        17546 :         SlruRecentlyUsed(shared, slotno);
     607              : 
     608              :         /* update the stats counter of pages not found in SLRU */
     609        17546 :         pgstat_count_slru_blocks_read(shared->slru_stats_idx);
     610              : 
     611        17546 :         return slotno;
     612              :     }
     613              : }
     614              : 
     615              : /*
     616              :  * Find a page in a shared buffer, reading it in if necessary.
     617              :  * The page number must correspond to an already-initialized page.
     618              :  * The caller must intend only read-only access to the page.
     619              :  *
                      :  * The fast path scans just the slots of the page's bank while holding the
                      :  * bank lock in shared mode.  A page that is being written out is accepted
                      :  * there; only a page whose read is still in progress is not.  If the fast
                      :  * path finds nothing, we retake the bank lock in exclusive mode and let
                      :  * SimpleLruReadPage (with write_ok = true) do the work.
                      :  *
     620              :  * The passed-in xid is used only for error reporting, and may be
     621              :  * InvalidTransactionId if no specific xid is associated with the action.
     622              :  *
     623              :  * Return value is the shared-buffer slot number now holding the page.
     624              :  * The buffer's LRU access info is updated.
     625              :  *
     626              :  * Bank control lock must NOT be held at entry, but will be held at exit.
     627              :  * It is unspecified whether the lock will be shared or exclusive.
     628              :  */
     629              : int
     630       813264 : SimpleLruReadPage_ReadOnly(SlruCtl ctl, int64 pageno, TransactionId xid)
     631              : {
     632       813264 :     SlruShared  shared = ctl->shared;
     633       813264 :     LWLock     *banklock = SimpleLruGetBankLock(ctl, pageno);
     634       813264 :     int         bankno = pageno % ctl->nbanks;
     635       813264 :     int         bankstart = bankno * SLRU_BANK_SIZE;
     636       813264 :     int         bankend = bankstart + SLRU_BANK_SIZE;
     637              : 
     638              :     /* Try to find the page while holding only shared lock */
     639       813264 :     LWLockAcquire(banklock, LW_SHARED);
     640              : 
     641              :     /* See if page is already in a buffer of its bank */
     642       818886 :     for (int slotno = bankstart; slotno < bankend; slotno++)
     643              :     {
     644       818690 :         if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
     645       817747 :             shared->page_number[slotno] == pageno &&
     646       813068 :             shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
     647              :         {
     648              :             /* See comments for SlruRecentlyUsed() */
     649       813068 :             SlruRecentlyUsed(shared, slotno);
     650              : 
     651              :             /* update the stats counter of pages found in the SLRU */
     652       813068 :             pgstat_count_slru_blocks_hit(shared->slru_stats_idx);
     653              : 
     654       813068 :             return slotno;
     655              :         }
     656              :     }
     657              : 
     658              :     /* No luck, so switch to normal exclusive lock and do regular read */
     659          196 :     LWLockRelease(banklock);
     660          196 :     LWLockAcquire(banklock, LW_EXCLUSIVE);
     661              : 
     662          196 :     return SimpleLruReadPage(ctl, pageno, true, xid);
     663              : }
     664              : 
     665              : /*
     666              :  * Write a page from a shared buffer, if necessary.
     667              :  * Does nothing if the specified slot is not dirty.
     668              :  *
     669              :  * NOTE: only one write attempt is made here.  Hence, it is possible that
     670              :  * the page is still dirty at exit (if someone else re-dirtied it during
     671              :  * the write).  However, we *do* attempt a fresh write even if the page
     672              :  * is already being written; this is for checkpoints.
     673              :  *
                      :  * fdata is handed on to SlruPhysicalWritePage; when it is not NULL the
                      :  * write is also counted in the checkpoint statistics.
                      :  *
                      :  * If the write fails, the page is marked dirty again, any files recorded
                      :  * in fdata are closed, and the error is raised through SlruReportIOError
                      :  * once the per-buffer lock has been released.
                      :  *
     674              :  * Bank lock must be held (in exclusive mode) at entry, and will be held at
                      :  * exit.  It is released while the physical write is in progress.
     675              :  */
     676              : static void
     677      7349499 : SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata)
     678              : {
     679      7349499 :     SlruShared  shared = ctl->shared;
     680      7349499 :     int64       pageno = shared->page_number[slotno];
     681      7349499 :     int         bankno = SlotGetBankNumber(slotno);
     682              :     bool        ok;
     683              : 
     684              :     Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
     685              :     Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(ctl, pageno), LW_EXCLUSIVE));
     686              : 
     687              :     /* If a write of this same page is in progress, wait for it to finish */
     688      7349507 :     while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
     689            8 :            shared->page_number[slotno] == pageno)
     690              :     {
     691            8 :         SimpleLruWaitIO(ctl, slotno);
     692              :     }
     693              : 
     694              :     /*
     695              :      * Do nothing if page is not dirty, or if buffer no longer contains the
     696              :      * same page we were called for.
     697              :      */
     698      7349499 :     if (!shared->page_dirty[slotno] ||
     699      7346491 :         shared->page_status[slotno] != SLRU_PAGE_VALID ||
     700      7346491 :         shared->page_number[slotno] != pageno)
     701         3009 :         return;
     702              : 
     703              :     /*
     704              :      * Mark the slot write-busy, and clear the dirtybit.  After this point, a
     705              :      * transaction status update on this page will mark it dirty again.
     706              :      */
     707      7346490 :     shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
     708      7346490 :     shared->page_dirty[slotno] = false;
     709              : 
     710              :     /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
     711      7346490 :     LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
     712              : 
     713              :     /* Release bank lock while doing I/O */
     714      7346490 :     LWLockRelease(&shared->bank_locks[bankno].lock);
     715              : 
     716              :     /* Do the write; a failure is only recorded here, not yet reported */
     717      7346490 :     ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata);
     718              : 
     719              :     /* If we failed, and we're in a flush, better close the files */
     720      7346490 :     if (!ok && fdata)
     721              :     {
     722            0 :         for (int i = 0; i < fdata->num_files; i++)
     723            0 :             CloseTransientFile(fdata->fd[i]);
     724              :     }
     725              : 
     726              :     /* Re-acquire bank lock and update page state */
     727      7346490 :     LWLockAcquire(&shared->bank_locks[bankno].lock, LW_EXCLUSIVE);
     728              : 
     729              :     Assert(shared->page_number[slotno] == pageno &&
     730              :            shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS);
     731              : 
     732              :     /* If we failed to write, mark the page dirty again */
     733      7346490 :     if (!ok)
     734            0 :         shared->page_dirty[slotno] = true;
     735              : 
     736      7346490 :     shared->page_status[slotno] = SLRU_PAGE_VALID;
     737              : 
     738      7346490 :     LWLockRelease(&shared->buffer_locks[slotno].lock);
     739              : 
     740              :     /* Now it's okay to ereport if we failed: shared state is cleaned up */
     741      7346490 :     if (!ok)
     742            0 :         SlruReportIOError(ctl, pageno, InvalidTransactionId);
     743              : 
     744              :     /* If part of a checkpoint, count this as a SLRU buffer written. */
     745      7346490 :     if (fdata)
     746              :     {
     747         2887 :         CheckpointStats.ckpt_slru_written++;
     748         2887 :         PendingCheckpointerStats.slru_written++;
     749              :     }
     750              : }
     751              : 
     752              : /*
     753              :  * Wrapper of SlruInternalWritePage, for external callers.
     754              :  * fdata is always passed as NULL here, i.e. this is a standalone write.
                      :  *
                      :  * The slot must not be empty.  SlruInternalWritePage's requirements apply
                      :  * unchanged; in particular it asserts that the bank lock of the slot's
                      :  * page is held in exclusive mode.
     755              :  */
     756              : void
     757          316 : SimpleLruWritePage(SlruCtl ctl, int slotno)
     758              : {
     759              :     Assert(ctl->shared->page_status[slotno] != SLRU_PAGE_EMPTY);
     760              : 
     761          316 :     SlruInternalWritePage(ctl, slotno, NULL);
     762          316 : }
     763              : 
     764              : /*
     765              :  * Return whether the given page exists on disk.
     766              :  *
     767              :  * A false return means that either the file does not exist, or that it's not
     768              :  * large enough to contain the given page.
                      :  *
                      :  * Any other problem (failure to open, seek in, or close the segment file)
                      :  * is raised through SlruReportIOError instead of being turned into a false
                      :  * result, so that callers cannot mistake an I/O error for a missing page.
     769              :  */
     770              : bool
     771           66 : SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int64 pageno)
     772              : {
     773           66 :     int64       segno = pageno / SLRU_PAGES_PER_SEGMENT;
     774           66 :     int         rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
     775           66 :     int         offset = rpageno * BLCKSZ;
     776              :     char        path[MAXPGPATH];
     777              :     int         fd;
     778              :     bool        result;
     779              :     off_t       endpos;
     780              : 
     781              :     /* update the stats counter of checked pages */
     782           66 :     pgstat_count_slru_blocks_exists(ctl->shared->slru_stats_idx);
     783              : 
     784           66 :     SlruFileName(ctl, path, segno);
     785              : 
     786           66 :     fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
     787           66 :     if (fd < 0)
     788              :     {
     789              :         /* expected: file doesn't exist */
     790           26 :         if (errno == ENOENT)
     791           26 :             return false;
     792              : 
     793              :         /* report error normally */
     794            0 :         slru_errcause = SLRU_OPEN_FAILED;
     795            0 :         slru_errno = errno;
     796            0 :         SlruReportIOError(ctl, pageno, 0);
     797              :     }
     798              : 
     799           40 :     if ((endpos = lseek(fd, 0, SEEK_END)) < 0)
     800              :     {
     801            0 :         slru_errcause = SLRU_SEEK_FAILED;
     802            0 :         slru_errno = errno;
     803            0 :         SlruReportIOError(ctl, pageno, 0);
     804              :     }
     805              : 
     806              :     /* the page exists only if the file extends through its last byte */
     806           40 :     result = endpos >= (off_t) (offset + BLCKSZ);
     807              : 
     808           40 :     if (CloseTransientFile(fd) != 0)
     809              :     {
                      :         /*
                      :          * Report the error like the open and seek failures above.  Merely
                      :          * returning false here would tell the caller that the page is
                      :          * missing, which is not what happened.
                      :          */
     810            0 :         slru_errcause = SLRU_CLOSE_FAILED;
     811            0 :         slru_errno = errno;
     812            0 :         SlruReportIOError(ctl, pageno, 0);
     813              :     }
     814              : 
     815           40 :     return result;
     816              : }
     817              : 
     818              : /*
     819              :  * Physical read of a (previously existing) page into a buffer slot
     820              :  *
                      :  * Returns true if the slot's buffer has been filled (with the page image,
                      :  * or with zeroes in the missing-file case described below), false if the
                      :  * read failed.
                      :  *
     821              :  * On failure, we cannot just ereport(ERROR) since caller has put state in
     822              :  * shared memory that must be undone.  So, we return false and save enough
     823              :  * info in static variables to let SlruReportIOError make the report.
     824              :  *
                      :  * errno is cleared before the read, so that a short read that does not
                      :  * set errno is saved as zero; SlruReportIOError then reports it as
                      :  * "read too few bytes" rather than with a stale error code.
                      :  *
     825              :  * For now, assume it's not worth keeping a file pointer open across
     826              :  * read/write operations.  We could cache one virtual file pointer ...
     827              :  */
     828              : static bool
     829        17546 : SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
     830              : {
     831        17546 :     SlruShared  shared = ctl->shared;
     832        17546 :     int64       segno = pageno / SLRU_PAGES_PER_SEGMENT;
     833        17546 :     int         rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
     834        17546 :     off_t       offset = rpageno * BLCKSZ;
     835              :     char        path[MAXPGPATH];
     836              :     int         fd;
     837              : 
     838        17546 :     SlruFileName(ctl, path, segno);
     839              : 
     840              :     /*
     841              :      * In a crash-and-restart situation, it's possible for us to receive
     842              :      * commands to set the commit status of transactions whose bits are in
     843              :      * already-truncated segments of the commit log (see notes in
     844              :      * SlruPhysicalWritePage).  Hence, if we are InRecovery, allow the case
     845              :      * where the file doesn't exist, and return zeroes instead.
     846              :      */
     847        17546 :     fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
     848        17546 :     if (fd < 0)
     849              :     {
     850            0 :         if (errno != ENOENT || !InRecovery)
     851              :         {
     852            0 :             slru_errcause = SLRU_OPEN_FAILED;
     853            0 :             slru_errno = errno;
     854            0 :             return false;
     855              :         }
     856              : 
     857            0 :         ereport(LOG,
     858              :                 (errmsg("file \"%s\" doesn't exist, reading as zeroes",
     859              :                         path)));
     860            0 :         MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
     861            0 :         return true;
     862              :     }
     863              : 
     864        17546 :     errno = 0;
     865        17546 :     pgstat_report_wait_start(WAIT_EVENT_SLRU_READ);
     866        17546 :     if (pg_pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
     867              :     {
     868            0 :         pgstat_report_wait_end();
     869            0 :         slru_errcause = SLRU_READ_FAILED;
     870            0 :         slru_errno = errno;
     871            0 :         CloseTransientFile(fd);
     872            0 :         return false;
     873              :     }
     874        17546 :     pgstat_report_wait_end();
     875              : 
     876        17546 :     if (CloseTransientFile(fd) != 0)
     877              :     {
     878            0 :         slru_errcause = SLRU_CLOSE_FAILED;
     879            0 :         slru_errno = errno;
     880            0 :         return false;
     881              :     }
     882              : 
     883        17546 :     return true;
     884              : }
     885              : 
     886              : /*
     887              :  * Physical write of a page from a buffer slot
     888              :  *
                      :  * Returns true if the page was written (and, where required, synced),
                      :  * false on failure.
                      :  *
     889              :  * On failure, we cannot just ereport(ERROR) since caller has put state in
     890              :  * shared memory that must be undone.  So, we return false and save enough
     891              :  * info in static variables to let SlruReportIOError make the report.
     892              :  *
     893              :  * For now, assume it's not worth keeping a file pointer open across
     894              :  * independent read/write operations.  We do batch operations during
     895              :  * SimpleLruWriteAll, though.
     896              :  *
     897              :  * fdata is NULL for a standalone write, pointer to open-file info during
     898              :  * SimpleLruWriteAll.
                      :  *
                      :  * A file descriptor that is recorded in fdata is never closed here, neither
                      :  * on success nor on failure: on failure our caller closes every descriptor
                      :  * in fdata, so closing one here as well would close it twice.  Only a
                      :  * descriptor used for a standalone write is closed by this function.
     899              :  */
     900              : static bool
     901      7346490 : SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
     902              : {
     903      7346490 :     SlruShared  shared = ctl->shared;
     904      7346490 :     int64       segno = pageno / SLRU_PAGES_PER_SEGMENT;
     905      7346490 :     int         rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
     906      7346490 :     off_t       offset = rpageno * BLCKSZ;
     907              :     char        path[MAXPGPATH];
     908      7346490 :     int         fd = -1;
     909              : 
     910              :     /* update the stats counter of written pages */
     911      7346490 :     pgstat_count_slru_blocks_written(shared->slru_stats_idx);
     912              : 
     913              :     /*
     914              :      * Honor the write-WAL-before-data rule, if appropriate, so that we do not
     915              :      * write out data before associated WAL records.  This is the same action
     916              :      * performed during FlushBuffer() in the main buffer manager.
     917              :      */
     918      7346490 :     if (shared->group_lsn != NULL)
     919              :     {
     920              :         /*
     921              :          * We must determine the largest async-commit LSN for the page. This
     922              :          * is a bit tedious, but since this entire function is a slow path
     923              :          * anyway, it seems better to do this here than to maintain a per-page
     924              :          * LSN variable (which'd need an extra comparison in the
     925              :          * transaction-commit path).
     926              :          */
     927              :         XLogRecPtr  max_lsn;
     928              :         int         lsnindex;
     929              : 
     930       433334 :         lsnindex = slotno * shared->lsn_groups_per_page;
     931       433334 :         max_lsn = shared->group_lsn[lsnindex++];
     932    443734016 :         for (int lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
     933              :         {
     934    443300682 :             XLogRecPtr  this_lsn = shared->group_lsn[lsnindex++];
     935              : 
     936    443300682 :             if (max_lsn < this_lsn)
     937        44610 :                 max_lsn = this_lsn;
     938              :         }
     939              : 
     940       433334 :         if (XLogRecPtrIsValid(max_lsn))
     941              :         {
     942              :             /*
     943              :              * As noted above, elog(ERROR) is not acceptable here, so if
     944              :              * XLogFlush were to fail, we must PANIC.  This isn't much of a
     945              :              * restriction because XLogFlush is just about all critical
     946              :              * section anyway, but let's make sure.
     947              :              */
     948          523 :             START_CRIT_SECTION();
     949          523 :             XLogFlush(max_lsn);
     950          523 :             END_CRIT_SECTION();
     951              :         }
     952              :     }
     953              : 
     954              :     /*
     955              :      * During a SimpleLruWriteAll, we may already have the desired file open.
     956              :      */
     957      7346490 :     if (fdata)
     958              :     {
     959         2959 :         for (int i = 0; i < fdata->num_files; i++)
     960              :         {
     961          304 :             if (fdata->segno[i] == segno)
     962              :             {
     963          232 :                 fd = fdata->fd[i];
     964          232 :                 break;
     965              :             }
     966              :         }
     967              :     }
     968              : 
     969      7346490 :     if (fd < 0)
     970              :     {
     971              :         /*
     972              :          * If the file doesn't already exist, we should create it.  It is
     973              :          * possible for this to need to happen when writing a page that's not
     974              :          * first in its segment; we assume the OS can cope with that. (Note:
     975              :          * it might seem that it'd be okay to create files only when
     976              :          * SimpleLruZeroPage is called for the first page of a segment.
     977              :          * However, if after a crash and restart the REDO logic elects to
     978              :          * replay the log from a checkpoint before the latest one, then it's
     979              :          * possible that we will get commands to set transaction status of
     980              :          * transactions that have already been truncated from the commit log.
     981              :          * Easiest way to deal with that is to accept references to
     982              :          * nonexistent files here and in SlruPhysicalReadPage.)
     983              :          *
     984              :          * Note: it is possible for more than one backend to be executing this
     985              :          * code simultaneously for different pages of the same file. Hence,
     986              :          * don't use O_EXCL or O_TRUNC or anything like that.
     987              :          */
     988      7346258 :         SlruFileName(ctl, path, segno);
     989      7346258 :         fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY);
     990      7346258 :         if (fd < 0)
     991              :         {
     992            0 :             slru_errcause = SLRU_OPEN_FAILED;
     993            0 :             slru_errno = errno;
     994            0 :             return false;
     995              :         }
     996              : 
     997      7346258 :         if (fdata)
     998              :         {
     999         2655 :             if (fdata->num_files < MAX_WRITEALL_BUFFERS)
     1000              :             {
     1001         2655 :                 fdata->fd[fdata->num_files] = fd;
     1002         2655 :                 fdata->segno[fdata->num_files] = segno;
     1003         2655 :                 fdata->num_files++;
     1004              :             }
     1005              :             else
     1006              :             {
     1007              :                 /*
     1008              :                  * In the unlikely event that we exceed MAX_WRITEALL_BUFFERS,
     1009              :                  * fall back to treating it as a standalone write.
     1010              :                  */
     1011            0 :                 fdata = NULL;
     1012              :             }
     1013              :         }
     1014              :     }
     1015              : 
     1016      7346490 :     errno = 0;
     1017      7346490 :     pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
     1018      7346490 :     if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
     1019              :     {
     1020            0 :         pgstat_report_wait_end();
     1021              :         /* if write didn't set errno, assume problem is no disk space */
     1022            0 :         if (errno == 0)
     1023            0 :             errno = ENOSPC;
     1024            0 :         slru_errcause = SLRU_WRITE_FAILED;
     1025            0 :         slru_errno = errno;
     1026            0 :         if (!fdata)
     1027            0 :             CloseTransientFile(fd);
     1028            0 :         return false;
     1029              :     }
     1030      7346490 :     pgstat_report_wait_end();
     1031              : 
     1032              :     /* Queue up a sync request for the checkpointer. */
     1033      7346490 :     if (ctl->sync_handler != SYNC_HANDLER_NONE)
     1034              :     {
     1035              :         FileTag     tag;
     1036              : 
     1037       434204 :         INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
     1038       434204 :         if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
     1039              :         {
     1040              :             /* No space to enqueue sync request.  Do it synchronously. */
     1041            6 :             pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC);
     1042            6 :             if (pg_fsync(fd) != 0)
     1043              :             {
     1044            0 :                 pgstat_report_wait_end();
     1045            0 :                 slru_errcause = SLRU_FSYNC_FAILED;
     1046            0 :                 slru_errno = errno;
                       :                 /* files recorded in fdata are closed by our caller */
                       :                 if (!fdata)
     1047            0 :                     CloseTransientFile(fd);
     1048            0 :                 return false;
     1049              :             }
     1050            6 :             pgstat_report_wait_end();
     1051              :         }
     1052              :     }
     1053              : 
     1054              :     /* Close file, unless part of flush request. */
     1055      7346490 :     if (!fdata)
     1056              :     {
     1057      7343603 :         if (CloseTransientFile(fd) != 0)
     1058              :         {
     1059            0 :             slru_errcause = SLRU_CLOSE_FAILED;
     1060            0 :             slru_errno = errno;
     1061            0 :             return false;
     1062              :         }
     1063              :     }
     1064              : 
     1065      7346490 :     return true;
     1066              : }
    1067              : 
    1068              : /*
    1069              :  * Issue the error message after failure of SlruPhysicalReadPage or
    1070              :  * SlruPhysicalWritePage.  Call this after cleaning up shared-memory state.
    1071              :  */
    1072              : static void
    1073            0 : SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid)
    1074              : {
    1075            0 :     int64       segno = pageno / SLRU_PAGES_PER_SEGMENT;
    1076            0 :     int         rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
    1077            0 :     int         offset = rpageno * BLCKSZ;
    1078              :     char        path[MAXPGPATH];
    1079              : 
    1080            0 :     SlruFileName(ctl, path, segno);
    1081            0 :     errno = slru_errno;
    1082            0 :     switch (slru_errcause)
    1083              :     {
    1084            0 :         case SLRU_OPEN_FAILED:
    1085            0 :             ereport(ERROR,
    1086              :                     (errcode_for_file_access(),
    1087              :                      errmsg("could not access status of transaction %u", xid),
    1088              :                      errdetail("Could not open file \"%s\": %m.", path)));
    1089              :             break;
    1090            0 :         case SLRU_SEEK_FAILED:
    1091            0 :             ereport(ERROR,
    1092              :                     (errcode_for_file_access(),
    1093              :                      errmsg("could not access status of transaction %u", xid),
    1094              :                      errdetail("Could not seek in file \"%s\" to offset %d: %m.",
    1095              :                                path, offset)));
    1096              :             break;
    1097            0 :         case SLRU_READ_FAILED:
    1098            0 :             if (errno)
    1099            0 :                 ereport(ERROR,
    1100              :                         (errcode_for_file_access(),
    1101              :                          errmsg("could not access status of transaction %u", xid),
    1102              :                          errdetail("Could not read from file \"%s\" at offset %d: %m.",
    1103              :                                    path, offset)));
    1104              :             else
    1105            0 :                 ereport(ERROR,
    1106              :                         (errmsg("could not access status of transaction %u", xid),
    1107              :                          errdetail("Could not read from file \"%s\" at offset %d: read too few bytes.", path, offset)));
    1108              :             break;
    1109            0 :         case SLRU_WRITE_FAILED:
    1110            0 :             if (errno)
    1111            0 :                 ereport(ERROR,
    1112              :                         (errcode_for_file_access(),
    1113              :                          errmsg("could not access status of transaction %u", xid),
    1114              :                          errdetail("Could not write to file \"%s\" at offset %d: %m.",
    1115              :                                    path, offset)));
    1116              :             else
    1117            0 :                 ereport(ERROR,
    1118              :                         (errmsg("could not access status of transaction %u", xid),
    1119              :                          errdetail("Could not write to file \"%s\" at offset %d: wrote too few bytes.",
    1120              :                                    path, offset)));
    1121              :             break;
    1122            0 :         case SLRU_FSYNC_FAILED:
    1123            0 :             ereport(data_sync_elevel(ERROR),
    1124              :                     (errcode_for_file_access(),
    1125              :                      errmsg("could not access status of transaction %u", xid),
    1126              :                      errdetail("Could not fsync file \"%s\": %m.",
    1127              :                                path)));
    1128            0 :             break;
    1129            0 :         case SLRU_CLOSE_FAILED:
    1130            0 :             ereport(ERROR,
    1131              :                     (errcode_for_file_access(),
    1132              :                      errmsg("could not access status of transaction %u", xid),
    1133              :                      errdetail("Could not close file \"%s\": %m.",
    1134              :                                path)));
    1135              :             break;
    1136            0 :         default:
    1137              :             /* can't get here, we trust */
    1138            0 :             elog(ERROR, "unrecognized SimpleLru error cause: %d",
    1139              :                  (int) slru_errcause);
    1140              :             break;
    1141              :     }
    1142            0 : }
    1143              : 
    1144              : /*
    1145              :  * Mark a buffer slot "most recently used".
    1146              :  */
    1147              : static inline void
    1148      8524038 : SlruRecentlyUsed(SlruShared shared, int slotno)
    1149              : {
    1150      8524038 :     int         bankno = SlotGetBankNumber(slotno);
    1151      8524038 :     int         new_lru_count = shared->bank_cur_lru_count[bankno];
    1152              : 
    1153              :     Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
    1154              : 
    1155              :     /*
    1156              :      * The reason for the if-test is that there are often many consecutive
    1157              :      * accesses to the same page (particularly the latest page).  By
    1158              :      * suppressing useless increments of bank_cur_lru_count, we reduce the
    1159              :      * probability that old pages' counts will "wrap around" and make them
    1160              :      * appear recently used.
    1161              :      *
    1162              :      * We allow this code to be executed concurrently by multiple processes
    1163              :      * within SimpleLruReadPage_ReadOnly().  As long as int reads and writes
    1164              :      * are atomic, this should not cause any completely-bogus values to enter
    1165              :      * the computation.  However, it is possible for either bank_cur_lru_count
    1166              :      * or individual page_lru_count entries to be "reset" to lower values than
    1167              :      * they should have, in case a process is delayed while it executes this
    1168              :      * function.  With care in SlruSelectLRUPage(), this does little harm, and
    1169              :      * in any case the absolute worst possible consequence is a nonoptimal
    1170              :      * choice of page to evict.  The gain from allowing concurrent reads of
    1171              :      * SLRU pages seems worth it.
    1172              :      */
    1173      8524038 :     if (new_lru_count != shared->page_lru_count[slotno])
    1174              :     {
    1175      7483894 :         shared->bank_cur_lru_count[bankno] = ++new_lru_count;
    1176      7483894 :         shared->page_lru_count[slotno] = new_lru_count;
    1177              :     }
    1178      8524038 : }
    1179              : 
    1180              : /*
    1181              :  * Select the slot to re-use when we need a free slot for the given page.
    1182              :  *
    1183              :  * The target page number is passed not only because we need to know the
    1184              :  * correct bank to use, but also because we need to consider the possibility
    1185              :  * that some other process reads in the target page while we are doing I/O to
    1186              :  * free a slot.  Hence, check or recheck to see if any slot already holds the
    1187              :  * target page, and return that slot if so.  Thus, the returned slot is
    1188              :  * *either* a slot already holding the pageno (could be any state except
    1189              :  * EMPTY), *or* a freeable slot (state EMPTY or CLEAN).
    1190              :  *
    1191              :  * The correct bank lock must be held at entry, and will be held at exit.
    1192              :  */
    1193              : static int
    1194      7710972 : SlruSelectLRUPage(SlruCtl ctl, int64 pageno)
    1195              : {
    1196      7710972 :     SlruShared  shared = ctl->shared;
    1197              : 
    1198              :     /* Outer loop handles restart after I/O */
    1199              :     for (;;)
    1200      7343205 :     {
    1201              :         int         cur_count;
    1202     15054177 :         int         bestvalidslot = 0;  /* keep compiler quiet */
    1203     15054177 :         int         best_valid_delta = -1;
    1204     15054177 :         int64       best_valid_page_number = 0; /* keep compiler quiet */
    1205     15054177 :         int         bestinvalidslot = 0;    /* keep compiler quiet */
    1206     15054177 :         int         best_invalid_delta = -1;
    1207     15054177 :         int64       best_invalid_page_number = 0;   /* keep compiler quiet */
    1208     15054177 :         int         bankno = pageno % ctl->nbanks;
    1209     15054177 :         int         bankstart = bankno * SLRU_BANK_SIZE;
    1210     15054177 :         int         bankend = bankstart + SLRU_BANK_SIZE;
    1211              : 
    1212              :         Assert(LWLockHeldByMe(SimpleLruGetBankLock(ctl, pageno)));
    1213              : 
    1214              :         /* See if page already has a buffer assigned */
    1215    250994778 :         for (int slotno = bankstart; slotno < bankend; slotno++)
    1216              :         {
    1217    236288406 :             if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
    1218    236235597 :                 shared->page_number[slotno] == pageno)
    1219       347805 :                 return slotno;
    1220              :         }
    1221              : 
    1222              :         /*
    1223              :          * If we find any EMPTY slot, just select that one. Else choose a
    1224              :          * victim page to replace.  We normally take the least recently used
    1225              :          * valid page, but we will never take the slot containing
    1226              :          * latest_page_number, even if it appears least recently used.  We
    1227              :          * will select a slot that is already I/O busy only if there is no
    1228              :          * other choice: a read-busy slot will not be least recently used once
    1229              :          * the read finishes, and waiting for an I/O on a write-busy slot is
    1230              :          * inferior to just picking some other slot.  Testing shows the slot
    1231              :          * we pick instead will often be clean, allowing us to begin a read at
    1232              :          * once.
    1233              :          *
    1234              :          * Normally the page_lru_count values will all be different and so
    1235              :          * there will be a well-defined LRU page.  But since we allow
    1236              :          * concurrent execution of SlruRecentlyUsed() within
    1237              :          * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages
    1238              :          * acquire the same lru_count values.  In that case we break ties by
    1239              :          * choosing the furthest-back page.
    1240              :          *
    1241              :          * Notice that this next line forcibly advances cur_lru_count to a
    1242              :          * value that is certainly beyond any value that will be in the
    1243              :          * page_lru_count array after the loop finishes.  This ensures that
    1244              :          * the next execution of SlruRecentlyUsed will mark the page newly
    1245              :          * used, even if it's for a page that has the current counter value.
    1246              :          * That gets us back on the path to having good data when there are
    1247              :          * multiple pages with the same lru_count.
    1248              :          */
    1249     14706372 :         cur_count = (shared->bank_cur_lru_count[bankno])++;
    1250    249955572 :         for (int slotno = bankstart; slotno < bankend; slotno++)
    1251              :         {
    1252              :             int         this_delta;
    1253              :             int64       this_page_number;
    1254              : 
    1255    235252688 :             if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
    1256         3488 :                 return slotno;
    1257              : 
    1258    235249200 :             this_delta = cur_count - shared->page_lru_count[slotno];
    1259    235249200 :             if (this_delta < 0)
    1260              :             {
    1261              :                 /*
    1262              :                  * Clean up in case shared updates have caused cur_count
    1263              :                  * increments to get "lost".  We back off the page counts,
    1264              :                  * rather than trying to increase cur_count, to avoid any
    1265              :                  * question of infinite loops or failure in the presence of
    1266              :                  * wrapped-around counts.
    1267              :                  */
    1268            0 :                 shared->page_lru_count[slotno] = cur_count;
    1269            0 :                 this_delta = 0;
    1270              :             }
    1271              : 
    1272              :             /*
    1273              :              * If this page is the one most recently zeroed, don't consider it
    1274              :              * an eviction candidate. See comments in SimpleLruZeroPage for an
    1275              :              * explanation about the lack of a memory barrier here.
    1276              :              */
    1277    235249200 :             this_page_number = shared->page_number[slotno];
    1278    235249200 :             if (this_page_number ==
    1279    235249200 :                 pg_atomic_read_u64(&shared->latest_page_number))
    1280         8885 :                 continue;
    1281              : 
    1282    235240315 :             if (shared->page_status[slotno] == SLRU_PAGE_VALID)
    1283              :             {
    1284    235240286 :                 if (this_delta > best_valid_delta ||
    1285            0 :                     (this_delta == best_valid_delta &&
    1286            0 :                      ctl->PagePrecedes(this_page_number,
    1287              :                                        best_valid_page_number)))
    1288              :                 {
    1289     30554017 :                     bestvalidslot = slotno;
    1290     30554017 :                     best_valid_delta = this_delta;
    1291     30554017 :                     best_valid_page_number = this_page_number;
    1292              :                 }
    1293              :             }
    1294              :             else
    1295              :             {
    1296           29 :                 if (this_delta > best_invalid_delta ||
    1297            0 :                     (this_delta == best_invalid_delta &&
    1298            0 :                      ctl->PagePrecedes(this_page_number,
    1299              :                                        best_invalid_page_number)))
    1300              :                 {
    1301           29 :                     bestinvalidslot = slotno;
    1302           29 :                     best_invalid_delta = this_delta;
    1303           29 :                     best_invalid_page_number = this_page_number;
    1304              :                 }
    1305              :             }
    1306              :         }
    1307              : 
    1308              :         /*
    1309              :          * If all pages (except possibly the latest one) are I/O busy, we'll
    1310              :          * have to wait for an I/O to complete and then retry.  In that
    1311              :          * unhappy case, we choose to wait for the I/O on the least recently
    1312              :          * used slot, on the assumption that it was likely initiated first of
    1313              :          * all the I/Os in progress and may therefore finish first.
    1314              :          */
    1315     14702884 :         if (best_valid_delta < 0)
    1316              :         {
    1317            0 :             SimpleLruWaitIO(ctl, bestinvalidslot);
    1318            0 :             continue;
    1319              :         }
    1320              : 
    1321              :         /*
    1322              :          * If the selected page is clean, we're set.
    1323              :          */
    1324     14702884 :         if (!shared->page_dirty[bestvalidslot])
    1325      7359679 :             return bestvalidslot;
    1326              : 
    1327              :         /*
    1328              :          * Write the page.
    1329              :          */
    1330      7343205 :         SlruInternalWritePage(ctl, bestvalidslot, NULL);
    1331              : 
    1332              :         /*
    1333              :          * Now loop back and try again.  This is the easiest way of dealing
    1334              :          * with corner cases such as the victim page being re-dirtied while we
    1335              :          * wrote it.
    1336              :          */
    1337              :     }
    1338              : }
    1339              : 
    1340              : /*
    1341              :  * Write dirty pages to disk during checkpoint or database shutdown.  Flushing
    1342              :  * is deferred until the next call to ProcessSyncRequests(), though we do fsync
    1343              :  * the containing directory here to make sure that newly created directory
    1344              :  * entries are on disk.
    1345              :  */
    1346              : void
    1347         9017 : SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied)
    1348              : {
    1349         9017 :     SlruShared  shared = ctl->shared;
    1350              :     SlruWriteAllData fdata;
    1351         9017 :     int64       pageno = 0;
    1352         9017 :     int         prevbank = SlotGetBankNumber(0);
    1353              :     bool        ok;
    1354              : 
    1355              :     /* update the stats counter of flushes */
    1356         9017 :     pgstat_count_slru_flush(shared->slru_stats_idx);
    1357              : 
    1358              :     /*
    1359              :      * Find and write dirty pages
    1360              :      */
    1361         9017 :     fdata.num_files = 0;
    1362              : 
    1363         9017 :     LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
    1364              : 
    1365       218809 :     for (int slotno = 0; slotno < shared->num_slots; slotno++)
    1366              :     {
    1367       209792 :         int         curbank = SlotGetBankNumber(slotno);
    1368              : 
    1369              :         /*
    1370              :          * If the current bank lock is not same as the previous bank lock then
    1371              :          * release the previous lock and acquire the new lock.
    1372              :          */
    1373       209792 :         if (curbank != prevbank)
    1374              :         {
    1375         4095 :             LWLockRelease(&shared->bank_locks[prevbank].lock);
    1376         4095 :             LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
    1377         4095 :             prevbank = curbank;
    1378              :         }
    1379              : 
    1380              :         /* Do nothing if slot is unused */
    1381       209792 :         if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
    1382       203896 :             continue;
    1383              : 
    1384         5896 :         SlruInternalWritePage(ctl, slotno, &fdata);
    1385              : 
    1386              :         /*
    1387              :          * In some places (e.g. checkpoints), we cannot assert that the slot
    1388              :          * is clean now, since another process might have re-dirtied it
    1389              :          * already.  That's okay.
    1390              :          */
    1391              :         Assert(allow_redirtied ||
    1392              :                shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
    1393              :                (shared->page_status[slotno] == SLRU_PAGE_VALID &&
    1394              :                 !shared->page_dirty[slotno]));
    1395              :     }
    1396              : 
    1397         9017 :     LWLockRelease(&shared->bank_locks[prevbank].lock);
    1398              : 
    1399              :     /*
    1400              :      * Now close any files that were open
    1401              :      */
    1402         9017 :     ok = true;
    1403        11672 :     for (int i = 0; i < fdata.num_files; i++)
    1404              :     {
    1405         2655 :         if (CloseTransientFile(fdata.fd[i]) != 0)
    1406              :         {
    1407            0 :             slru_errcause = SLRU_CLOSE_FAILED;
    1408            0 :             slru_errno = errno;
    1409            0 :             pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
    1410            0 :             ok = false;
    1411              :         }
    1412              :     }
    1413         9017 :     if (!ok)
    1414            0 :         SlruReportIOError(ctl, pageno, InvalidTransactionId);
    1415              : 
    1416              :     /* Ensure that directory entries for new files are on disk. */
    1417         9017 :     if (ctl->sync_handler != SYNC_HANDLER_NONE)
    1418         7220 :         fsync_fname(ctl->Dir, true);
    1419         9017 : }
    1420              : 
    1421              : /*
    1422              :  * Remove all segments before the one holding the passed page number
    1423              :  *
    1424              :  * All SLRUs prevent concurrent calls to this function, either with an LWLock
    1425              :  * or by calling it only as part of a checkpoint.  Mutual exclusion must begin
    1426              :  * before computing cutoffPage.  Mutual exclusion must end after any limit
    1427              :  * update that would permit other backends to write fresh data into the
    1428              :  * segment immediately preceding the one containing cutoffPage.  Otherwise,
    1429              :  * when the SLRU is quite full, SimpleLruTruncate() might delete that segment
    1430              :  * after it has accrued freshly-written data.
    1431              :  */
    1432              : void
    1433         1871 : SimpleLruTruncate(SlruCtl ctl, int64 cutoffPage)
    1434              : {
    1435         1871 :     SlruShared  shared = ctl->shared;
    1436              :     int         prevbank;
    1437              : 
    1438              :     /* update the stats counter of truncates */
    1439         1871 :     pgstat_count_slru_truncate(shared->slru_stats_idx);
    1440              : 
    1441              :     /*
    1442              :      * Scan shared memory and remove any pages preceding the cutoff page, to
    1443              :      * ensure we won't rewrite them later.  (Since this is normally called in
    1444              :      * or just after a checkpoint, any dirty pages should have been flushed
    1445              :      * already ... we're just being extra careful here.)
    1446              :      */
    1447         1953 : restart:
    1448              : 
    1449              :     /*
    1450              :      * An important safety check: the current endpoint page must not be
    1451              :      * eligible for removal.  This check is just a backstop against wraparound
    1452              :      * bugs elsewhere in SLRU handling, so we don't care if we read a slightly
    1453              :      * outdated value; therefore we don't add a memory barrier.
    1454              :      */
    1455         1953 :     if (ctl->PagePrecedes(pg_atomic_read_u64(&shared->latest_page_number),
    1456              :                           cutoffPage))
    1457              :     {
    1458            0 :         ereport(LOG,
    1459              :                 (errmsg("could not truncate directory \"%s\": apparent wraparound",
    1460              :                         ctl->Dir)));
    1461            0 :         return;
    1462              :     }
    1463              : 
    1464         1953 :     prevbank = SlotGetBankNumber(0);
    1465         1953 :     LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
    1466        46733 :     for (int slotno = 0; slotno < shared->num_slots; slotno++)
    1467              :     {
    1468        44862 :         int         curbank = SlotGetBankNumber(slotno);
    1469              : 
    1470              :         /*
    1471              :          * If the current bank lock is not same as the previous bank lock then
    1472              :          * release the previous lock and acquire the new lock.
    1473              :          */
    1474        44862 :         if (curbank != prevbank)
    1475              :         {
    1476          892 :             LWLockRelease(&shared->bank_locks[prevbank].lock);
    1477          892 :             LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
    1478          892 :             prevbank = curbank;
    1479              :         }
    1480              : 
    1481        44862 :         if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
    1482        39455 :             continue;
    1483         5407 :         if (!ctl->PagePrecedes(shared->page_number[slotno], cutoffPage))
    1484         5101 :             continue;
    1485              : 
    1486              :         /*
    1487              :          * If page is clean, just change state to EMPTY (expected case).
    1488              :          */
    1489          306 :         if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
    1490          306 :             !shared->page_dirty[slotno])
    1491              :         {
    1492          224 :             shared->page_status[slotno] = SLRU_PAGE_EMPTY;
    1493          224 :             continue;
    1494              :         }
    1495              : 
    1496              :         /*
    1497              :          * Hmm, we have (or may have) I/O operations acting on the page, so
    1498              :          * we've got to wait for them to finish and then start again. This is
    1499              :          * the same logic as in SlruSelectLRUPage.  (XXX if page is dirty,
    1500              :          * wouldn't it be OK to just discard it without writing it?
    1501              :          * SlruMayDeleteSegment() uses a stricter qualification, so we might
    1502              :          * not delete this page in the end; even if we don't delete it, we
    1503              :          * won't have cause to read its data again.  For now, keep the logic
    1504              :          * the same as it was.)
    1505              :          */
    1506           82 :         if (shared->page_status[slotno] == SLRU_PAGE_VALID)
    1507           82 :             SlruInternalWritePage(ctl, slotno, NULL);
    1508              :         else
    1509            0 :             SimpleLruWaitIO(ctl, slotno);
    1510              : 
    1511           82 :         LWLockRelease(&shared->bank_locks[prevbank].lock);
    1512           82 :         goto restart;
    1513              :     }
    1514              : 
    1515         1871 :     LWLockRelease(&shared->bank_locks[prevbank].lock);
    1516              : 
    1517              :     /* Now we can remove the old segment(s) */
    1518         1871 :     (void) SlruScanDirectory(ctl, SlruScanDirCbDeleteCutoff, &cutoffPage);
    1519              : }
    1520              : 
    1521              : /*
    1522              :  * Delete an individual SLRU segment.
    1523              :  *
    1524              :  * NB: This does not touch the SLRU buffers themselves, callers have to ensure
    1525              :  * they either can't yet contain anything, or have already been cleaned out.
    1526              :  */
    1527              : static void
    1528       142060 : SlruInternalDeleteSegment(SlruCtl ctl, int64 segno)
    1529              : {
    1530              :     char        path[MAXPGPATH];
    1531              : 
    1532              :     /* Forget any fsync requests queued for this segment. */
    1533       142060 :     if (ctl->sync_handler != SYNC_HANDLER_NONE)
    1534              :     {
    1535              :         FileTag     tag;
    1536              : 
    1537        13281 :         INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
    1538        13281 :         RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true);
    1539              :     }
    1540              : 
    1541              :     /* Unlink the file. */
    1542       142060 :     SlruFileName(ctl, path, segno);
    1543       142060 :     ereport(DEBUG2, (errmsg_internal("removing file \"%s\"", path)));
    1544       142060 :     unlink(path);
    1545       142060 : }
    1546              : 
    1547              : /*
    1548              :  * Delete an individual SLRU segment, identified by the segment number.
    1549              :  */
    1550              : void
    1551            2 : SlruDeleteSegment(SlruCtl ctl, int64 segno)
    1552              : {
    1553            2 :     SlruShared  shared = ctl->shared;
    1554            2 :     int         prevbank = SlotGetBankNumber(0);
    1555              :     bool        did_write;
    1556              : 
    1557              :     /* Clean out any possibly existing references to the segment. */
    1558            2 :     LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
    1559            2 : restart:
    1560            2 :     did_write = false;
    1561           34 :     for (int slotno = 0; slotno < shared->num_slots; slotno++)
    1562              :     {
    1563              :         int64       pagesegno;
    1564           32 :         int         curbank = SlotGetBankNumber(slotno);
    1565              : 
    1566              :         /*
    1567              :          * If the current bank lock is not same as the previous bank lock then
    1568              :          * release the previous lock and acquire the new lock.
    1569              :          */
    1570           32 :         if (curbank != prevbank)
    1571              :         {
    1572            0 :             LWLockRelease(&shared->bank_locks[prevbank].lock);
    1573            0 :             LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
    1574            0 :             prevbank = curbank;
    1575              :         }
    1576              : 
    1577           32 :         if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
    1578            0 :             continue;
    1579              : 
    1580           32 :         pagesegno = shared->page_number[slotno] / SLRU_PAGES_PER_SEGMENT;
    1581              :         /* not the segment we're looking for */
    1582           32 :         if (pagesegno != segno)
    1583            7 :             continue;
    1584              : 
    1585              :         /* If page is clean, just change state to EMPTY (expected case). */
    1586           25 :         if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
    1587           25 :             !shared->page_dirty[slotno])
    1588              :         {
    1589           25 :             shared->page_status[slotno] = SLRU_PAGE_EMPTY;
    1590           25 :             continue;
    1591              :         }
    1592              : 
    1593              :         /* Same logic as SimpleLruTruncate() */
    1594            0 :         if (shared->page_status[slotno] == SLRU_PAGE_VALID)
    1595            0 :             SlruInternalWritePage(ctl, slotno, NULL);
    1596              :         else
    1597            0 :             SimpleLruWaitIO(ctl, slotno);
    1598              : 
    1599            0 :         did_write = true;
    1600              :     }
    1601              : 
    1602              :     /*
    1603              :      * Be extra careful and re-check. The IO functions release the control
    1604              :      * lock, so new pages could have been read in.
    1605              :      */
    1606            2 :     if (did_write)
    1607            0 :         goto restart;
    1608              : 
    1609            2 :     SlruInternalDeleteSegment(ctl, segno);
    1610              : 
    1611            2 :     LWLockRelease(&shared->bank_locks[prevbank].lock);
    1612            2 : }
    1613              : 
    1614              : /*
    1615              :  * Determine whether a segment is okay to delete.
    1616              :  *
    1617              :  * segpage is the first page of the segment, and cutoffPage is the oldest (in
    1618              :  * PagePrecedes order) page in the SLRU containing still-useful data.  Since
    1619              :  * every core PagePrecedes callback implements "wrap around", check the
    1620              :  * segment's first and last pages:
    1621              :  *
    1622              :  * first<cutoff  && last<cutoff:  yes
    1623              :  * first<cutoff  && last>=cutoff: no; cutoff falls inside this segment
    1624              :  * first>=cutoff && last<cutoff:  no; wrap point falls inside this segment
    1625              :  * first>=cutoff && last>=cutoff: no; every page of this segment is too young
                      :  *
                      :  * Returns true only in the first case, i.e. when ctl->PagePrecedes() reports
                      :  * both the first and the last page of the segment as preceding cutoffPage.
    1626              :  */
    1627              : static bool
    1628      1069142 : SlruMayDeleteSegment(SlruCtl ctl, int64 segpage, int64 cutoffPage)
    1629              : {
    1630      1069142 :     int64       seg_last_page = segpage + SLRU_PAGES_PER_SEGMENT - 1;    /* last page of the segment starting at segpage */
    1631              : 
    1632              :     Assert(segpage % SLRU_PAGES_PER_SEGMENT == 0);    /* segpage must be the first page of a segment */
    1633              : 
    1634      1211909 :     return (ctl->PagePrecedes(segpage, cutoffPage) &&
    1635       142767 :             ctl->PagePrecedes(seg_last_page, cutoffPage));    /* evaluated only if the first page precedes the cutoff */
    1636              : }
    1637              : 
    1638              : #ifdef USE_ASSERT_CHECKING
    1639              : static void
    1640              : SlruPagePrecedesTestOffset(SlruCtl ctl, int per_page, uint32 offset)
    1641              : {
    1642              :     TransactionId lhs,
    1643              :                 rhs;
    1644              :     int64       newestPage,
    1645              :                 oldestPage;
    1646              :     TransactionId newestXact,
    1647              :                 oldestXact;
    1648              : 
    1649              :     /*
                      :      * Assertion-only checks of ctl->PagePrecedes and SlruMayDeleteSegment()
                      :      * for one position within a page.  per_page is used as the number of
                      :      * entries per page (an entry's page is computed as value / per_page),
                      :      * and offset selects the position within a page of the values tested.
                      :      * Nothing is returned; a violated expectation trips an Assert.
                      :      *
    1650              :      * Compare an XID pair having undefined order (see RFC 1982), a pair at
    1651              :      * "opposite ends" of the XID space.  TransactionIdPrecedes() treats each
    1652              :      * as preceding the other.  If RHS is oldestXact, LHS is the first XID we
    1653              :      * must not assign.
    1654              :      */
    1655              :     lhs = per_page + offset;    /* skip first page to avoid non-normal XIDs */
    1656              :     rhs = lhs + (1U << 31);
    1657              :     Assert(TransactionIdPrecedes(lhs, rhs));
    1658              :     Assert(TransactionIdPrecedes(rhs, lhs));
    1659              :     Assert(!TransactionIdPrecedes(lhs - 1, rhs));
    1660              :     Assert(TransactionIdPrecedes(rhs, lhs - 1));
    1661              :     Assert(TransactionIdPrecedes(lhs + 1, rhs));
    1662              :     Assert(!TransactionIdPrecedes(rhs, lhs + 1));
    1663              :     Assert(!TransactionIdFollowsOrEquals(lhs, rhs));
    1664              :     Assert(!TransactionIdFollowsOrEquals(rhs, lhs));
    1665              :     Assert(!ctl->PagePrecedes(lhs / per_page, lhs / per_page));    /* a page never precedes itself */
    1666              :     Assert(!ctl->PagePrecedes(lhs / per_page, rhs / per_page));
    1667              :     Assert(!ctl->PagePrecedes(rhs / per_page, lhs / per_page));
    1668              :     Assert(!ctl->PagePrecedes((lhs - per_page) / per_page, rhs / per_page));
    1669              :     Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 3 * per_page) / per_page));
    1670              :     Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 2 * per_page) / per_page));
    1671              :     Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 1 * per_page) / per_page)
    1672              :            || (1U << 31) % per_page != 0);    /* See CommitTsPagePrecedes() */
    1673              :     Assert(ctl->PagePrecedes((lhs + 1 * per_page) / per_page, rhs / per_page)
    1674              :            || (1U << 31) % per_page != 0);    /* same exemption as the check above */
    1675              :     Assert(ctl->PagePrecedes((lhs + 2 * per_page) / per_page, rhs / per_page));
    1676              :     Assert(ctl->PagePrecedes((lhs + 3 * per_page) / per_page, rhs / per_page));
    1677              :     Assert(!ctl->PagePrecedes(rhs / per_page, (lhs + per_page) / per_page));
    1678              : 
    1679              :     /*
    1680              :      * GetNewTransactionId() has assigned the last XID it can safely use, and
    1681              :      * that XID is in the *LAST* page of the second segment.  We must not
    1682              :      * delete that segment.
    1683              :      */
    1684              :     newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1;
    1685              :     newestXact = newestPage * per_page + offset;
    1686              :     Assert(newestXact / per_page == newestPage);
    1687              :     oldestXact = newestXact + 1;
    1688              :     oldestXact -= 1U << 31;
    1689              :     oldestPage = oldestXact / per_page;
    1690              :     Assert(!SlruMayDeleteSegment(ctl,
    1691              :                                  (newestPage -
    1692              :                                   newestPage % SLRU_PAGES_PER_SEGMENT),    /* first page of newestPage's segment */
    1693              :                                  oldestPage));
    1694              : 
    1695              :     /*
    1696              :      * GetNewTransactionId() has assigned the last XID it can safely use, and
    1697              :      * that XID is in the *FIRST* page of the second segment.  We must not
    1698              :      * delete that segment.
    1699              :      */
    1700              :     newestPage = SLRU_PAGES_PER_SEGMENT;
    1701              :     newestXact = newestPage * per_page + offset;
    1702              :     Assert(newestXact / per_page == newestPage);
    1703              :     oldestXact = newestXact + 1;
    1704              :     oldestXact -= 1U << 31;
    1705              :     oldestPage = oldestXact / per_page;
    1706              :     Assert(!SlruMayDeleteSegment(ctl,
    1707              :                                  (newestPage -
    1708              :                                   newestPage % SLRU_PAGES_PER_SEGMENT),    /* first page of newestPage's segment */
    1709              :                                  oldestPage));
    1710              : }
    1711              : 
    1712              : /*
    1713              :  * Unit-test a PagePrecedes function.
    1714              :  *
    1715              :  * This assumes every uint32 >= FirstNormalTransactionId is a valid key.  It
    1716              :  * assumes each value occupies a contiguous, fixed-size region of SLRU bytes.
    1717              :  * (MultiXactMemberCtl separates flags from XIDs.  NotifyCtl has
    1718              :  * variable-length entries, no keys, and no random access.  These unit tests
    1719              :  * do not apply to them.)
                      :  *
                      :  * ctl supplies the PagePrecedes callback under test and per_page is the
                      :  * number of entries per page.  The checks are run by
                      :  * SlruPagePrecedesTestOffset() for offsets 0, per_page / 2 and per_page - 1.
                      :  * This function is compiled only when USE_ASSERT_CHECKING is defined.
    1720              :  */
    1721              : void
    1722              : SlruPagePrecedesUnitTests(SlruCtl ctl, int per_page)
    1723              : {
    1724              :     /* Test first, middle and last entries of a page. */
    1725              :     SlruPagePrecedesTestOffset(ctl, per_page, 0);
    1726              :     SlruPagePrecedesTestOffset(ctl, per_page, per_page / 2);
    1727              :     SlruPagePrecedesTestOffset(ctl, per_page, per_page - 1);
    1728              : }
    1729              : #endif
    1730              : 
    1731              : /*
    1732              :  * SlruScanDirectory callback
    1733              :  *      This callback reports true if there's any segment wholly prior to the
    1734              :  *      one containing the page passed as "data".
                      :  *
                      :  *      "data" must point to an int64 holding the cutoff page; segpage is the
                      :  *      first page of the segment being examined.  filename is not used.
                      :  *      The decision is delegated to SlruMayDeleteSegment().  Nothing is
                      :  *      deleted here.
    1735              :  */
    1736              : bool
    1737       846222 : SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, int64 segpage,
    1738              :                             void *data)
    1739              : {
    1740       846222 :     int64       cutoffPage = *(int64 *) data;
    1741              : 
    1742       846222 :     if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
    1743          102 :         return true;            /* found one; don't iterate any more */
    1744              : 
    1745       846120 :     return false;               /* keep going */
    1746              : }
    1747              : 
    1748              : /*
    1749              :  * SlruScanDirectory callback.
    1750              :  *      This callback deletes segments prior to the one passed in as "data".
                      :  *
                      :  *      More precisely, "data" must point to an int64 holding the cutoff page,
                      :  *      and a segment is deleted when SlruMayDeleteSegment() approves it for
                      :  *      that cutoff.  segpage is the first page of the segment being examined;
                      :  *      filename is not used.  Always returns false, so the directory scan is
                      :  *      never cut short by this callback.
    1751              :  */
    1752              : static bool
    1753       222920 : SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int64 segpage,
    1754              :                           void *data)
    1755              : {
    1756       222920 :     int64       cutoffPage = *(int64 *) data;
    1757              : 
    1758       222920 :     if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
    1759       142050 :         SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT);    /* convert first page to segment number */
    1760              : 
    1761       222920 :     return false;               /* keep going */
    1762              : }
    1763              : 
    1764              : /*
    1765              :  * SlruScanDirectory callback.
    1766              :  *      This callback deletes all segments.
                      :  *
                      :  *      Each invocation deletes the one segment whose first page is segpage.
                      :  *      filename and data are not used.  Always returns false, so the scan
                      :  *      continues over every file it finds.
    1767              :  */
    1768              : bool
    1769            8 : SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int64 segpage, void *data)
    1770              : {
    1771            8 :     SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT);    /* convert first page to segment number */
    1772              : 
    1773            8 :     return false;               /* keep going */
    1774              : }
    1775              : 
    1776              : /*
    1777              :  * An internal function used by SlruScanDirectory().
    1778              :  *
    1779              :  * Returns true if a file with a name of a given length may be a correct
    1780              :  * SLRU segment.
                      :  *
                      :  * len is the length of the file name in characters.  Only the length is
                      :  * judged here; SlruScanDirectory() separately checks that the name consists
                      :  * of hex digits.  Which lengths are accepted depends on
                      :  * ctl->long_segment_names.
    1781              :  */
    1782              : static inline bool
    1783      1081502 : SlruCorrectSegmentFilenameLength(SlruCtl ctl, size_t len)
    1784              : {
    1785      1081502 :     if (ctl->long_segment_names)
    1786         2315 :         return (len == 15);     /* see SlruFileName() */
    1787              :     else
    1788              : 
    1789              :         /*
    1790              :          * Commit 638cf09e76d allowed 5-character lengths. Later commit
    1791              :          * 73c986adde5 allowed 6-character length.
    1792              :          *
    1793              :          * Note: There is an ongoing plan to migrate all SLRUs to 64-bit page
    1794              :          * numbers, and the corresponding 15-character file names, which may
    1795              :          * eventually deprecate the support for 4, 5, and 6-character names.
    1796              :          */
    1797      1079187 :         return (len == 4 || len == 5 || len == 6);    /* short-name SLRUs: any of the three lengths */
    1798              : }
    1799              : 
    1800              : /*
    1801              :  * Scan the SimpleLru directory and apply a callback to each file found in it.
    1802              :  *
                      :  * Only directory entries whose name has a length accepted by
                      :  * SlruCorrectSegmentFilenameLength() and consists solely of the characters
                      :  * 0-9 and A-F are passed to the callback; all other entries are ignored.
                      :  *
    1803              :  * If the callback returns true, the scan is stopped.  The last return value
    1804              :  * from the callback is returned.
                      :  * If the callback is never invoked, false is returned.
    1805              :  *
    1806              :  * The callback receives the following arguments: 1. the SlruCtl struct for the
    1807              :  * slru being truncated; 2. the filename being considered; 3. the page number
    1808              :  * for the first page of that file; 4. a pointer to the opaque data given to us
    1809              :  * by the caller.
    1810              :  *
    1811              :  * Note that the ordering in which the directory is scanned is not guaranteed.
    1812              :  *
    1813              :  * Note that no locking is applied.
    1814              :  */
    1815              : bool
    1816         6226 : SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
    1817              : {
    1818         6226 :     bool        retval = false;    /* stays false if no file qualifies */
    1819              :     DIR        *cldir;
    1820              :     struct dirent *clde;
    1821              :     int64       segno;
    1822              :     int64       segpage;
    1823              : 
    1824         6226 :     cldir = AllocateDir(ctl->Dir);
    1825      1087626 :     while ((clde = ReadDir(cldir, ctl->Dir)) != NULL)
    1826              :     {
    1827              :         size_t      len;
    1828              : 
    1829      1081502 :         len = strlen(clde->d_name);
    1830              : 
    1831      1081502 :         if (SlruCorrectSegmentFilenameLength(ctl, len) &&
    1832      1069150 :             strspn(clde->d_name, "0123456789ABCDEF") == len)    /* whole name is upper-case hex digits */
    1833              :         {
    1834      1069150 :             segno = strtoi64(clde->d_name, NULL, 16);    /* file name parsed as a base-16 segment number */
    1835      1069150 :             segpage = segno * SLRU_PAGES_PER_SEGMENT;    /* first page of that segment */
    1836              : 
    1837      1069150 :             elog(DEBUG2, "SlruScanDirectory invoking callback on %s/%s",
    1838              :                  ctl->Dir, clde->d_name);
    1839      1069150 :             retval = callback(ctl, clde->d_name, segpage, data);
    1840      1069150 :             if (retval)
    1841          102 :                 break;    /* callback asked to stop the scan */
    1842              :         }
    1843              :     }
    1844         6226 :     FreeDir(cldir);
    1845              : 
    1846         6226 :     return retval;
    1847              : }
    1848              : 
    1849              : /*
    1850              :  * Individual SLRUs (clog, ...) have to provide a sync.c handler function so
    1851              :  * that they can provide the correct "SlruCtl" (otherwise we don't know how to
    1852              :  * build the path), but they just forward to this common implementation that
    1853              :  * performs the fsync.
                      :  *
                      :  * ftag->segno identifies the segment to sync.  path is handed to
                      :  * SlruFileName() to receive that segment's file name, which is then opened.
                      :  *
                      :  * Returns -1 if the file cannot be opened; otherwise returns the result of
                      :  * pg_fsync().  The errno value observed after the fsync is restored before
                      :  * returning, so closing the file does not disturb it.
    1854              :  */
    1855              : int
    1856            2 : SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path)
    1857              : {
    1858              :     int         fd;
    1859              :     int         save_errno;
    1860              :     int         result;
    1861              : 
    1862            2 :     SlruFileName(ctl, path, ftag->segno);
    1863              : 
    1864            2 :     fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
    1865            2 :     if (fd < 0)
    1866            0 :         return -1;    /* could not open the segment file */
    1867              : 
    1868            2 :     pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC);
    1869            2 :     result = pg_fsync(fd);
    1870            2 :     pgstat_report_wait_end();
    1871            2 :     save_errno = errno;    /* remember errno before the close below can change it */
    1872              : 
    1873            2 :     CloseTransientFile(fd);
    1874              : 
    1875            2 :     errno = save_errno;
    1876            2 :     return result;
    1877              : }
        

Generated by: LCOV version 2.0-1