LCOV - code coverage report
Current view: top level - src/backend/access/transam - slru.c (source / functions) Hit Total Coverage
Test: PostgreSQL 19devel Lines: 420 534 78.7 %
Date: 2025-09-10 22:18:18 Functions: 29 30 96.7 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * slru.c
       4             :  *      Simple LRU buffering for wrap-around-able permanent metadata
       5             :  *
       6             :  * This module is used to maintain various pieces of transaction status
       7             :  * indexed by TransactionId (such as commit status, parent transaction ID,
       8             :  * commit timestamp), as well as storage for multixacts, serializable
       9             :  * isolation locks and NOTIFY traffic.  Extensions can define their own
      10             :  * SLRUs, too.
      11             :  *
      12             :  * Under ordinary circumstances we expect that write traffic will occur
      13             :  * mostly to the latest page (and to the just-prior page, soon after a
      14             :  * page transition).  Read traffic will probably touch a larger span of
      15             :  * pages, but a relatively small number of buffers should be sufficient.
      16             :  *
      17             :  * We use a simple least-recently-used scheme to manage a pool of shared
      18             :  * page buffers, split into banks by the lowest bits of the page number, and
      19             :  * the management algorithm only processes the bank to which the desired
      20             :  * page belongs, so a linear search is sufficient; there's no need for a
      21             :  * hashtable or anything fancy.  The algorithm is straight LRU except that
      22             :  * we will never swap out the latest page (since we know it's going to be
      23             :  * hit again eventually).
      24             :  *
      25             :  * We use per-bank control LWLocks to protect the shared data structures,
      26             :  * plus per-buffer LWLocks that synchronize I/O for each buffer.  The
      27             :  * bank's control lock must be held to examine or modify any of the bank's
      28             :  * shared state.  A process that is reading in or writing out a page
      29             :  * buffer does not hold the control lock, only the per-buffer lock for the
      30             :  * buffer it is working on.  One exception is latest_page_number, which is
      31             :  * read and written using atomic ops.
      32             :  *
      33             :  * "Holding the bank control lock" means exclusive lock in all cases
      34             :  * except for SimpleLruReadPage_ReadOnly(); see comments for
      35             :  * SlruRecentlyUsed() for the implications of that.
      36             :  *
      37             :  * When initiating I/O on a buffer, we acquire the per-buffer lock exclusively
      38             :  * before releasing the control lock.  The per-buffer lock is released after
      39             :  * completing the I/O, re-acquiring the control lock, and updating the shared
      40             :  * state.  (Deadlock is not possible here, because we never try to initiate
      41             :  * I/O when someone else is already doing I/O on the same buffer.)
      42             :  * To wait for I/O to complete, release the control lock, acquire the
      43             :  * per-buffer lock in shared mode, immediately release the per-buffer lock,
      44             :  * reacquire the control lock, and then recheck state (since arbitrary things
      45             :  * could have happened while we didn't have the lock).
      46             :  *
      47             :  * As with the regular buffer manager, it is possible for another process
      48             :  * to re-dirty a page that is currently being written out.  This is handled
      49             :  * by re-setting the page's page_dirty flag.
      50             :  *
      51             :  *
      52             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
      53             :  * Portions Copyright (c) 1994, Regents of the University of California
      54             :  *
      55             :  * src/backend/access/transam/slru.c
      56             :  *
      57             :  *-------------------------------------------------------------------------
      58             :  */
      59             : #include "postgres.h"
      60             : 
      61             : #include <fcntl.h>
      62             : #include <sys/stat.h>
      63             : #include <unistd.h>
      64             : 
      65             : #include "access/slru.h"
      66             : #include "access/transam.h"
      67             : #include "access/xlog.h"
      68             : #include "access/xlogutils.h"
      69             : #include "miscadmin.h"
      70             : #include "pgstat.h"
      71             : #include "storage/fd.h"
      72             : #include "storage/shmem.h"
      73             : #include "utils/guc.h"
      74             : 
      75             : /*
      76             :  * Builds the file name of segment "segno" under ctl->Dir into "path" and
      77             :  * returns whatever snprintf() returned for it.
      78             :  *
      79             :  * "path" should point to a buffer at least MAXPGPATH characters long.
      80             :  *
      81             :  * If ctl->long_segment_names is true, segno must be in [0, 2^60-1] (asserted);
      82             :  * the file name is 15 zero-padded hex digits, e.g. dir/123456789ABCDEF.
      83             :  * If it is false, segno must be in [0, 2^24-1] (asserted); %04X is only a
      84             :  * minimum width, so the file name has 4 to 6 hex digits:
      85             :  *
      86             :  *  dir/1234   for [0, 2^16-1]
      87             :  *  dir/12345  for [2^16, 2^20-1]
      88             :  *  dir/123456 for [2^20, 2^24-1]
      89             :  */
      90             : static inline int
      91    14970026 : SlruFileName(SlruCtl ctl, char *path, int64 segno)
      92             : {
      93    14970026 :     if (ctl->long_segment_names)
      94             :     {
      95             :         /*
      96             :          * We could use 16 characters here, but then SLRU segments would be
      97             :          * hard to distinguish from WAL segments.
      98             :          *
      99             :          * For this reason we use 15 characters.  That is enough, but it means
     100             :          * that in the future we can't decrease SLRU_PAGES_PER_SEGMENT easily.
     101             :          */
     102             :         Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFFFFFFFFFFF));
     103         294 :         return snprintf(path, MAXPGPATH, "%s/%015" PRIX64, ctl->Dir, segno);
     104             :     }
     105             :     else
     106             :     {
     107             :         /*
     108             :          * %04X is only a minimum field width, so segment numbers of up to
     109             :          * 24 bits are allowed here.  See SlruCorrectSegmentFilenameLength().
     110             :          */
     111             :         Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFF));
     112    14969732 :         return snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir,
     113             :                         (unsigned int) segno);
     114             :     }
     115             : }
     116             : 
     117             : /*
     118             :  * During SimpleLruWriteAll(), we will usually not need to write more than one
     119             :  * or two physical files, but we may need to write several pages per file.  We
     120             :  * consolidate the I/O by leaving files open until control returns there; this
     121             :  * struct remembers which files (at most MAX_WRITEALL_BUFFERS) are open.
     122             :  */
     123             : #define MAX_WRITEALL_BUFFERS    16
     124             : 
     125             : typedef struct SlruWriteAllData
     126             : {
     127             :     int         num_files;      /* number of files actually open */
     128             :     int         fd[MAX_WRITEALL_BUFFERS];   /* file descriptors of those files */
     129             :     int64       segno[MAX_WRITEALL_BUFFERS];    /* segment numbers of those files */
     130             : } SlruWriteAllData;
     131             : 
     132             : typedef struct SlruWriteAllData *SlruWriteAll;  /* pointer form used by the prototypes in this file */
     133             : 
     134             : 
     135             : /*
     136             :  * Bank size for the slot array.  Pages are assigned a bank according to their
     137             :  * page number, with each bank being this size.  We want a power of 2 so that
     138             :  * the bank number of a slot can be found with just a bit shift (see
     139             :  * SlotGetBankNumber); we also want to keep banks small so that LRU victim
     140             :  * search is fast.  16 buffers per bank seems a good number.
     141             :  */
     142             : #define SLRU_BANK_BITSHIFT      4
     143             : #define SLRU_BANK_SIZE          (1 << SLRU_BANK_BITSHIFT)
     144             : 
     145             : /*
     146             :  * Bank number to which a slot belongs: slots 0-15 are bank 0, 16-31 bank 1, etc.
     147             :  */
     148             : #define SlotGetBankNumber(slotno)   ((slotno) >> SLRU_BANK_BITSHIFT)
     149             : 
     150             : 
     151             : /*
     152             :  * Populate a file tag describing a segment file: zero the whole FileTag, then
     153             :  * set its handler and segno.  Only the segment number is needed, because clog,
     154             :  * multixact etc. each have their own sync handler function.
     155             :  */
     156             : #define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \
     157             : ( \
     158             :     memset(&(a), 0, sizeof(FileTag)), \
     159             :     (a).handler = (xx_handler), \
     160             :     (a).segno = (xx_segno) \
     161             : )
     162             : 
     163             : /* Saved info for SlruReportIOError: the kind of file operation that failed */
     164             : typedef enum
     165             : {
     166             :     SLRU_OPEN_FAILED,
     167             :     SLRU_SEEK_FAILED,
     168             :     SLRU_READ_FAILED,
     169             :     SLRU_WRITE_FAILED,
     170             :     SLRU_FSYNC_FAILED,
     171             :     SLRU_CLOSE_FAILED,
     172             : } SlruErrorCause;
     173             : /* NOTE(review): presumably set by the physical I/O routines on failure; the setters are not in this excerpt */
     174             : static SlruErrorCause slru_errcause;
     175             : static int  slru_errno;
     176             : 
     177             : /* Forward declarations of the static functions of this file */
     178             : static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
     179             : static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
     180             : static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata);
     181             : static bool SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno);
     182             : static bool SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno,
     183             :                                   SlruWriteAll fdata);
     184             : static void SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid);
     185             : static int  SlruSelectLRUPage(SlruCtl ctl, int64 pageno);
     186             : 
     187             : static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename,
     188             :                                       int64 segpage, void *data);
     189             : static void SlruInternalDeleteSegment(SlruCtl ctl, int64 segno);
     190             : static inline void SlruRecentlyUsed(SlruShared shared, int slotno);
     191             : 
     192             : 
     193             : /*
     194             :  * Initialization of shared memory
     195             :  */
     196             : /* Returns the shared memory bytes needed for an SLRU with nslots page slots and nlsns LSN groups per page. */
     197             : Size
     198       43562 : SimpleLruShmemSize(int nslots, int nlsns)
     199             : {
     200       43562 :     int         nbanks = nslots / SLRU_BANK_SIZE;
     201             :     Size        sz;
     202             :     /* nslots must be a multiple of the bank size, so nbanks above is exact */
     203             :     Assert(nslots <= SLRU_MAX_ALLOWED_BUFFERS);
     204             :     Assert(nslots % SLRU_BANK_SIZE == 0);
     205             : 
     206             :     /* Sum the MAXALIGN'd header and arrays; we assume nslots isn't so large as to risk overflow */
     207       43562 :     sz = MAXALIGN(sizeof(SlruSharedData));
     208       43562 :     sz += MAXALIGN(nslots * sizeof(char *));    /* page_buffer[] */
     209       43562 :     sz += MAXALIGN(nslots * sizeof(SlruPageStatus));    /* page_status[] */
     210       43562 :     sz += MAXALIGN(nslots * sizeof(bool));  /* page_dirty[] */
     211       43562 :     sz += MAXALIGN(nslots * sizeof(int64)); /* page_number[] */
     212       43562 :     sz += MAXALIGN(nslots * sizeof(int));   /* page_lru_count[] */
     213       43562 :     sz += MAXALIGN(nslots * sizeof(LWLockPadded));  /* buffer_locks[] */
     214       43562 :     sz += MAXALIGN(nbanks * sizeof(LWLockPadded));  /* bank_locks[] */
     215       43562 :     sz += MAXALIGN(nbanks * sizeof(int));   /* bank_cur_lru_count[] */
     216             : 
     217       43562 :     if (nlsns > 0)
     218        6222 :         sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));    /* group_lsn[] */
     219             :     /* the nslots page buffers of BLCKSZ bytes follow at a buffer-aligned offset */
     220       43562 :     return BUFFERALIGN(sz) + BLCKSZ * nslots;
     221             : }
     222             : 
     223             : /*
     224             :  * Determine a number of SLRU buffers to use.
     225             :  *
     226             :  * Result is Min(max, Max(SLRU_BANK_SIZE, NBuffers / divisor)), with both max
     227             :  * and the quotient first rounded down to a multiple of SLRU_BANK_SIZE.  The
     228             :  * SLRU_BANK_SIZE floor therefore holds only when max >= SLRU_BANK_SIZE.
     229             :  */
     230             : int
     231       18564 : SimpleLruAutotuneBuffers(int divisor, int max)
     232             : {
     233       18564 :     return Min(max - (max % SLRU_BANK_SIZE),
     234             :                Max(SLRU_BANK_SIZE,
     235             :                    NBuffers / divisor - (NBuffers / divisor) % SLRU_BANK_SIZE));
     236             : }
     237             : 
     238             : /*
     239             :  * Initialize, or attach to, a simple LRU cache in shared memory.
     240             :  *
     241             :  * ctl: address of local (unshared) control structure.
     242             :  * name: name of SLRU.  (This is user-visible, pick with care!)
     243             :  * nslots: number of page slots to use (a multiple of SLRU_BANK_SIZE).
     244             :  * nlsns: number of LSN groups per page (set to zero if not relevant).
     245             :  * subdir: PGDATA-relative subdirectory that will contain the files.
     246             :  * buffer_tranche_id: tranche ID to use for the SLRU's per-buffer LWLocks.
     247             :  * bank_tranche_id: tranche ID to use for the bank LWLocks.
     248             :  * sync_handler: which set of functions to use to handle sync requests.
     249             :  * long_segment_names: true for long segment file names, false for short ones.
     250             :  */
     251             : void
     252       15222 : SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
     253             :               const char *subdir, int buffer_tranche_id, int bank_tranche_id,
     254             :               SyncRequestHandler sync_handler, bool long_segment_names)
     255             : {
     256             :     SlruShared  shared;
     257             :     bool        found;
     258       15222 :     int         nbanks = nslots / SLRU_BANK_SIZE;
     259             : 
     260             :     Assert(nslots <= SLRU_MAX_ALLOWED_BUFFERS);
     261             : 
     262       15222 :     shared = (SlruShared) ShmemInitStruct(name,
     263             :                                           SimpleLruShmemSize(nslots, nlsns),
     264             :                                           &found);
     265             : 
     266       15222 :     if (!IsUnderPostmaster)
     267             :     {
     268             :         /* Not under postmaster: lay out and initialize the shared memory area */
     269             :         char       *ptr;
     270             :         Size        offset;
     271             : 
     272             :         Assert(!found);
     273             : 
     274       15222 :         memset(shared, 0, sizeof(SlruSharedData));
     275             : 
     276       15222 :         shared->num_slots = nslots;
     277       15222 :         shared->lsn_groups_per_page = nlsns;
     278             : 
     279       15222 :         pg_atomic_init_u64(&shared->latest_page_number, 0);
     280             : 
     281       15222 :         shared->slru_stats_idx = pgstat_get_slru_index(name);
     282             :         /* Carve the arrays out of the area after the header, in SimpleLruShmemSize() order */
     283       15222 :         ptr = (char *) shared;
     284       15222 :         offset = MAXALIGN(sizeof(SlruSharedData));
     285       15222 :         shared->page_buffer = (char **) (ptr + offset);
     286       15222 :         offset += MAXALIGN(nslots * sizeof(char *));
     287       15222 :         shared->page_status = (SlruPageStatus *) (ptr + offset);
     288       15222 :         offset += MAXALIGN(nslots * sizeof(SlruPageStatus));
     289       15222 :         shared->page_dirty = (bool *) (ptr + offset);
     290       15222 :         offset += MAXALIGN(nslots * sizeof(bool));
     291       15222 :         shared->page_number = (int64 *) (ptr + offset);
     292       15222 :         offset += MAXALIGN(nslots * sizeof(int64));
     293       15222 :         shared->page_lru_count = (int *) (ptr + offset);
     294       15222 :         offset += MAXALIGN(nslots * sizeof(int));
     295             : 
     296             :         /* Lock arrays and per-bank LRU counters; the locks are initialized in the loops below */
     297       15222 :         shared->buffer_locks = (LWLockPadded *) (ptr + offset);
     298       15222 :         offset += MAXALIGN(nslots * sizeof(LWLockPadded));
     299       15222 :         shared->bank_locks = (LWLockPadded *) (ptr + offset);
     300       15222 :         offset += MAXALIGN(nbanks * sizeof(LWLockPadded));
     301       15222 :         shared->bank_cur_lru_count = (int *) (ptr + offset);
     302       15222 :         offset += MAXALIGN(nbanks * sizeof(int));
     303             : 
     304       15222 :         if (nlsns > 0)
     305             :         {
     306        2174 :             shared->group_lsn = (XLogRecPtr *) (ptr + offset);
     307        2174 :             offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
     308             :         }
     309             :         /* Page buffers start at the next buffer-aligned offset, BLCKSZ bytes per slot */
     310       15222 :         ptr += BUFFERALIGN(offset);
     311      387318 :         for (int slotno = 0; slotno < nslots; slotno++)
     312             :         {
     313      372096 :             LWLockInitialize(&shared->buffer_locks[slotno].lock,
     314             :                              buffer_tranche_id);
     315             : 
     316      372096 :             shared->page_buffer[slotno] = ptr;
     317      372096 :             shared->page_status[slotno] = SLRU_PAGE_EMPTY;
     318      372096 :             shared->page_dirty[slotno] = false;
     319      372096 :             shared->page_lru_count[slotno] = 0;
     320      372096 :             ptr += BLCKSZ;
     321             :         }
     322             : 
     323             :         /* Initialize the slot banks. */
     324       38478 :         for (int bankno = 0; bankno < nbanks; bankno++)
     325             :         {
     326       23256 :             LWLockInitialize(&shared->bank_locks[bankno].lock, bank_tranche_id);
     327       23256 :             shared->bank_cur_lru_count[bankno] = 0;
     328             :         }
     329             : 
     330             :         /* The layout must fit within the size SimpleLruShmemSize() reported */
     331             :         Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
     332             :     }
     333             :     else
     334             :     {
     335             :         Assert(found);
     336             :         Assert(shared->num_slots == nslots);
     337             :     }
     338             : 
     339             :     /*
     340             :      * Initialize the unshared control struct, including directory path, in
     341             :      * both the initialize and attach cases.  We assume caller set PagePrecedes.
     342             :      */
     343       15222 :     ctl->shared = shared;
     344       15222 :     ctl->sync_handler = sync_handler;
     345       15222 :     ctl->long_segment_names = long_segment_names;
     346       15222 :     ctl->nbanks = nbanks;
     347       15222 :     strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir));
     348       15222 : }
     349             : 
     350             : /*
     351             :  * Helper function for GUC check_hook: returns true if *newval is a multiple of
     352             :  * SLRU_BANK_SIZE, else sets an errdetail naming "name" and returns false.
     353             :  */
     354             : bool
     355       22266 : check_slru_buffers(const char *name, int *newval)
     356             : {
     357             :     /* Valid values are multiples of SLRU_BANK_SIZE (zero included) */
     358       22266 :     if (*newval % SLRU_BANK_SIZE == 0)
     359       22266 :         return true;
     360             : 
     361           0 :     GUC_check_errdetail("\"%s\" must be a multiple of %d.", name,
     362             :                         SLRU_BANK_SIZE);
     363           0 :     return false;
     364             : }
     365             : 
     366             : /*
     367             :  * Initialize (or reinitialize) a page to zeroes.
     368             :  *
     369             :  * The page is only set up in shared memory (marked valid and dirty), not
     370             :  * written; it also becomes latest_page_number.  Returns the page's slot number.
     371             :  *
     372             :  * Bank lock must be held exclusively at entry, and will be held at exit.
     373             :  */
     374             : int
     375    14679210 : SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
     376             : {
     377    14679210 :     SlruShared  shared = ctl->shared;
     378             :     int         slotno;
     379             : 
     380             :     Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(ctl, pageno), LW_EXCLUSIVE));
     381             : 
     382             :     /* Find a suitable buffer slot: empty, valid and clean, or already holding this page */
     383    14679210 :     slotno = SlruSelectLRUPage(ctl, pageno);
     384             :     Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
     385             :            (shared->page_status[slotno] == SLRU_PAGE_VALID &&
     386             :             !shared->page_dirty[slotno]) ||
     387             :            shared->page_number[slotno] == pageno);
     388             : 
     389             :     /* Mark the slot as containing this page: valid, dirty, and recently used */
     390    14679210 :     shared->page_number[slotno] = pageno;
     391    14679210 :     shared->page_status[slotno] = SLRU_PAGE_VALID;
     392    14679210 :     shared->page_dirty[slotno] = true;
     393    14679210 :     SlruRecentlyUsed(shared, slotno);
     394             : 
     395             :     /* Set the buffer to zeroes */
     396    14679210 :     MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
     397             : 
     398             :     /* Set the LSNs for this new page to zero */
     399    14679210 :     SimpleLruZeroLSNs(ctl, slotno);
     400             : 
     401             :     /*
     402             :      * Assume this page is now the latest active page.
     403             :      *
     404             :      * Note that because both this routine and SlruSelectLRUPage run with a
     405             :      * SLRU bank lock held, it is not possible for this to be zeroing a page
     406             :      * that SlruSelectLRUPage is going to evict simultaneously.  Therefore,
     407             :      * there's no memory barrier here.
     408             :      */
     409    14679210 :     pg_atomic_write_u64(&shared->latest_page_number, pageno);
     410             : 
     411             :     /* update the stats counter of zeroed pages */
     412    14679210 :     pgstat_count_slru_blocks_zeroed(shared->slru_stats_idx);
     413             : 
     414    14679210 :     return slotno;
     415             : }
     416             : 
     417             : /*
     418             :  * Zero all the LSNs we store for this slru page.
     419             :  *
     420             :  * This should be called each time we create a new page, and each time we read
     421             :  * in a page from disk into an existing buffer.  (Such an old page cannot
     422             :  * have any interesting LSNs, since we'd have flushed them before writing
     423             :  * the page in the first place.)  Does nothing if lsn_groups_per_page is zero.
     424             :  *
     425             :  * This assumes that InvalidXLogRecPtr is bitwise-all-0.
     426             :  */
     427             : static void
     428    14683296 : SimpleLruZeroLSNs(SlruCtl ctl, int slotno)
     429             : {
     430    14683296 :     SlruShared  shared = ctl->shared;
     431             :     /* group_lsn[] holds lsn_groups_per_page consecutive entries per slot */
     432    14683296 :     if (shared->lsn_groups_per_page > 0)
     433      865794 :         MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0,
     434             :                shared->lsn_groups_per_page * sizeof(XLogRecPtr));
     435    14683296 : }
     436             : 
     437             : /*
     438             :  * This is a convenience wrapper for the common case of zeroing a page and
     439             :  * immediately flushing it to disk.
     440             :  *
     441             :  * The page's SLRU bank lock is acquired (exclusively) and released here.
     442             :  */
     443             : void
     444         432 : SimpleLruZeroAndWritePage(SlruCtl ctl, int64 pageno)
     445             : {
     446             :     int         slotno;
     447             :     LWLock     *lock;
     448             : 
     449         432 :     lock = SimpleLruGetBankLock(ctl, pageno);
     450         432 :     LWLockAcquire(lock, LW_EXCLUSIVE);
     451             : 
     452             :     /* Create and zero the page */
     453         432 :     slotno = SimpleLruZeroPage(ctl, pageno);
     454             : 
     455             :     /* Make sure it's written out; the slot must be clean afterwards */
     456         432 :     SimpleLruWritePage(ctl, slotno);
     457             :     Assert(!ctl->shared->page_dirty[slotno]);
     458             : 
     459         432 :     LWLockRelease(lock);
     460         432 : }
     461             : 
     462             : /*
     463             :  * Wait for any active I/O on a page slot to finish.  (New I/O may have been
     464             :  * started, or the slot may even hold another page, by the time we return.)
     465             :  *
     466             :  * Bank lock must be held at entry; it is released while we wait and is
     467             :  * re-acquired in exclusive mode before returning.
     468             :  */
     469             : static void
     470           4 : SimpleLruWaitIO(SlruCtl ctl, int slotno)
     471             : {
     472           4 :     SlruShared  shared = ctl->shared;
     473           4 :     int         bankno = SlotGetBankNumber(slotno);
     474             : 
     475             :     Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
     476             : 
     477             :     /* Wait by briefly taking the buffer lock in shared mode; see notes at top of file */
     478           4 :     LWLockRelease(&shared->bank_locks[bankno].lock);
     479           4 :     LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED);
     480           4 :     LWLockRelease(&shared->buffer_locks[slotno].lock);
     481           4 :     LWLockAcquire(&shared->bank_locks[bankno].lock, LW_EXCLUSIVE);
     482             : 
     483             :     /*
     484             :      * If the slot is still in an io-in-progress state, then either someone
     485             :      * already started a new I/O on the slot, or a previous I/O failed and
     486             :      * neglected to reset the page state.  That shouldn't happen, really, but
     487             :      * it seems worth a few extra cycles to check and recover from it. We can
     488             :      * cheaply test for failure by seeing whether the buffer lock is free (we
     489             :      * assume that transaction abort would release the lock).
     490             :      */
     491           4 :     if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
     492           4 :         shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS)
     493             :     {
     494           0 :         if (LWLockConditionalAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED))
     495             :         {
     496             :             /* nobody holds the buffer lock, so the I/O must have failed */
     497           0 :             if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS)
     498           0 :                 shared->page_status[slotno] = SLRU_PAGE_EMPTY;
     499             :             else                /* write_in_progress: page is still valid but unwritten */
     500             :             {
     501           0 :                 shared->page_status[slotno] = SLRU_PAGE_VALID;
     502           0 :                 shared->page_dirty[slotno] = true;
     503             :             }
     504           0 :             LWLockRelease(&shared->buffer_locks[slotno].lock);
     505             :         }
     506             :     }
     507           4 : }
     508             : 
     509             : /*
     510             :  * Find a page in a shared buffer, reading it in if necessary.
     511             :  * The page number must correspond to an already-initialized page.
     512             :  *
     513             :  * If write_ok is true, a page in WRITE_IN_PROGRESS state may be returned and
     514             :  * the caller must ensure modifying it is safe.  If write_ok is false, we do
     515             :  * not return the page until it is not undergoing active I/O.
     516             :  *
     517             :  * The passed-in xid is used only for error reporting, and may be
     518             :  * InvalidTransactionId if no specific xid is associated with the action.
     519             :  *
     520             :  * Return value is the shared-buffer slot number now holding the page.
     521             :  * The buffer's LRU access info and the SLRU hit/read counters are updated.
     522             :  *
     523             :  * The page's bank lock must be held exclusively at entry and is held at
     524             :  * exit, but it is released and re-acquired around a physical read.
     525             :  */
     526             : int
     527      328388 : SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok,
     528             :                   TransactionId xid)
     529             : {
     530      328388 :     SlruShared  shared = ctl->shared;
     531      328388 :     LWLock     *banklock = SimpleLruGetBankLock(ctl, pageno);
     532             : 
     533             :     Assert(LWLockHeldByMeInMode(banklock, LW_EXCLUSIVE));
     534             : 
     535             :     /* Outer loop handles restart if we must wait for someone else's I/O */
     536             :     for (;;)
     537           0 :     {
     538             :         int         slotno;
     539             :         bool        ok;
     540             : 
     541             :         /* See if page already is in memory; if not, pick victim slot */
     542      328388 :         slotno = SlruSelectLRUPage(ctl, pageno);
     543             : 
     544             :         /* Did we find the page in memory? */
     545      328388 :         if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
     546      324928 :             shared->page_number[slotno] == pageno)
     547             :         {
     548             :             /*
     549             :              * If page is still being read in, we must wait for I/O.  Likewise
     550             :              * if the page is being written and the caller said that's not OK.
     551             :              */
     552      324302 :             if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
     553      324302 :                 (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
     554           0 :                  !write_ok))
     555             :             {
     556           0 :                 SimpleLruWaitIO(ctl, slotno);
     557             :                 /* Now we must recheck state from the top */
     558           0 :                 continue;
     559             :             }
     560             :             /* Otherwise, it's ready to use */
     561      324302 :             SlruRecentlyUsed(shared, slotno);
     562             : 
     563             :             /* update the stats counter of pages found in the SLRU */
     564      324302 :             pgstat_count_slru_blocks_hit(shared->slru_stats_idx);
     565             : 
     566      324302 :             return slotno;
     567             :         }
     568             : 
     569             :         /* We found no match; assert we selected a freeable slot */
     570             :         Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
     571             :                (shared->page_status[slotno] == SLRU_PAGE_VALID &&
     572             :                 !shared->page_dirty[slotno]));
     573             : 
     574             :         /* Mark the slot read-busy */
     575        4086 :         shared->page_number[slotno] = pageno;
     576        4086 :         shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS;
     577        4086 :         shared->page_dirty[slotno] = false;
     578             : 
     579             :         /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
     580        4086 :         LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
     581             : 
     582             :         /* Release bank lock while doing I/O */
     583        4086 :         LWLockRelease(banklock);
     584             : 
     585             :         /* Do the read */
     586        4086 :         ok = SlruPhysicalReadPage(ctl, pageno, slotno);
     587             : 
     588             :         /* Set the LSNs for this newly read-in page to zero */
     589        4086 :         SimpleLruZeroLSNs(ctl, slotno);
     590             : 
     591             :         /* Re-acquire bank lock and update page state */
     592        4086 :         LWLockAcquire(banklock, LW_EXCLUSIVE);
     593             : 
     594             :         Assert(shared->page_number[slotno] == pageno &&
     595             :                shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS &&
     596             :                !shared->page_dirty[slotno]);
     597             : 
     598        4086 :         shared->page_status[slotno] = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY;
     599             : 
     600        4086 :         LWLockRelease(&shared->buffer_locks[slotno].lock);
     601             : 
     602             :         /* Slot state is reset and buffer lock released; okay to ereport now */
     603        4086 :         if (!ok)
     604           0 :             SlruReportIOError(ctl, pageno, xid);
     605             : 
     606        4086 :         SlruRecentlyUsed(shared, slotno);
     607             : 
     608             :         /* update the stats counter of pages not found in SLRU */
     609        4086 :         pgstat_count_slru_blocks_read(shared->slru_stats_idx);
     610             : 
     611        4086 :         return slotno;
     612             :     }
     613             : }
     614             : 
     615             : /*
     616             :  * Find a page in a shared buffer, reading it in if necessary.
     617             :  * The page number must correspond to an already-initialized page.
     618             :  * The caller must intend only read-only access to the page.
     619             :  *
     620             :  * The passed-in xid is used only for error reporting, and may be
     621             :  * InvalidTransactionId if no specific xid is associated with the action.
     622             :  *
     623             :  * Return value is the shared-buffer slot number now holding the page.
     624             :  * The buffer's LRU access info is updated.
     625             :  *
     626             :  * Bank lock must NOT be held at entry, but will be held at exit.  Callers
     627             :  * must not rely on its mode: shared after a fast-path hit, else exclusive.
     628             :  */
     629             : int
     630     1379716 : SimpleLruReadPage_ReadOnly(SlruCtl ctl, int64 pageno, TransactionId xid)
     631             : {
     632     1379716 :     SlruShared  shared = ctl->shared;
     633     1379716 :     LWLock     *banklock = SimpleLruGetBankLock(ctl, pageno);
     634     1379716 :     int         bankno = pageno % ctl->nbanks;
     635     1379716 :     int         bankstart = bankno * SLRU_BANK_SIZE;
     636     1379716 :     int         bankend = bankstart + SLRU_BANK_SIZE;
     637             : 
     638             :     /* Try to find the page while holding only shared lock */
     639     1379716 :     LWLockAcquire(banklock, LW_SHARED);
     640             : 
     641             :     /* Scan this page's bank for a usable copy (not still being read in) */
     642     1393450 :     for (int slotno = bankstart; slotno < bankend; slotno++)
     643             :     {
     644     1392954 :         if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
     645     1391280 :             shared->page_number[slotno] == pageno &&
     646     1379220 :             shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
     647             :         {
     648             :             /* See comments for SlruRecentlyUsed() */
     649     1379220 :             SlruRecentlyUsed(shared, slotno);
     650             : 
     651             :             /* update the stats counter of pages found in the SLRU */
     652     1379220 :             pgstat_count_slru_blocks_hit(shared->slru_stats_idx);
     653             : 
     654     1379220 :             return slotno;
     655             :         }
     656             :     }
     657             : 
     658             :     /* No luck; retake the lock exclusively and let SimpleLruReadPage recheck */
     659         496 :     LWLockRelease(banklock);
     660         496 :     LWLockAcquire(banklock, LW_EXCLUSIVE);
     661             : 
     662         496 :     return SimpleLruReadPage(ctl, pageno, true, xid);
     663             : }
     664             : 
     665             : /*
     666             :  * Write a page from a shared buffer; does nothing if the slot is not dirty.
     667             :  *
     668             :  * NOTE: only one write attempt is made here.  Hence, it is possible that
     669             :  * the page is still dirty at exit (if someone else re-dirtied it during
     670             :  * the write).  However, we *do* attempt a fresh write even if the page
     671             :  * is already being written; this is for checkpoints.
     672             :  *
     673             :  * Bank lock must be held exclusively at entry, and will be held at exit;
     674             :  * it is released during the physical write.
     675             :  */
     676             : static void
     677    14687684 : SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata)
     678             : {
     679    14687684 :     SlruShared  shared = ctl->shared;
     680    14687684 :     int64       pageno = shared->page_number[slotno];
     681    14687684 :     int         bankno = SlotGetBankNumber(slotno);
     682             :     bool        ok;
     683             : 
     684             :     Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
     685             :     Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(ctl, pageno), LW_EXCLUSIVE));
     686             : 
     687             :     /* If a write is in progress, wait for it to finish */
     688    14687688 :     while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
     689           4 :            shared->page_number[slotno] == pageno)
     690             :     {
     691           4 :         SimpleLruWaitIO(ctl, slotno);
     692             :     }
     693             : 
     694             :     /*
     695             :      * Do nothing if page is not dirty, or if buffer no longer contains the
     696             :      * same page we were called for.
     697             :      */
     698    14687684 :     if (!shared->page_dirty[slotno] ||
     699    14682176 :         shared->page_status[slotno] != SLRU_PAGE_VALID ||
     700    14682176 :         shared->page_number[slotno] != pageno)
     701        5508 :         return;
     702             : 
     703             :     /*
     704             :      * Mark the slot write-busy, and clear the dirtybit.  After this point, a
     705             :      * transaction status update on this page will mark it dirty again.
     706             :      */
     707    14682176 :     shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
     708    14682176 :     shared->page_dirty[slotno] = false;
     709             : 
     710             :     /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
     711    14682176 :     LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
     712             : 
     713             :     /* Release bank lock while doing I/O */
     714    14682176 :     LWLockRelease(&shared->bank_locks[bankno].lock);
     715             : 
     716             :     /* Do the write */
     717    14682176 :     ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata);
     718             : 
     719             :     /* If we failed, and we're in a flush, better close the files */
     720    14682176 :     if (!ok && fdata)
     721             :     {
     722           0 :         for (int i = 0; i < fdata->num_files; i++)
     723           0 :             CloseTransientFile(fdata->fd[i]);
     724             :     }
     725             : 
     726             :     /* Re-acquire bank lock and update page state */
     727    14682176 :     LWLockAcquire(&shared->bank_locks[bankno].lock, LW_EXCLUSIVE);
     728             : 
     729             :     Assert(shared->page_number[slotno] == pageno &&
     730             :            shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS);
     731             : 
     732             :     /* If we failed to write, mark the page dirty again */
     733    14682176 :     if (!ok)
     734           0 :         shared->page_dirty[slotno] = true;
     735             : 
     736    14682176 :     shared->page_status[slotno] = SLRU_PAGE_VALID;
     737             : 
     738    14682176 :     LWLockRelease(&shared->buffer_locks[slotno].lock);
     739             : 
     740             :     /* Page state is restored and buffer lock released; okay to ereport now */
     741    14682176 :     if (!ok)
     742           0 :         SlruReportIOError(ctl, pageno, InvalidTransactionId);
     743             : 
     744             :     /* If part of a checkpoint, count this as a SLRU buffer written. */
     745    14682176 :     if (fdata)
     746             :     {
     747        5550 :         CheckpointStats.ckpt_slru_written++;
     748        5550 :         PendingCheckpointerStats.slru_written++;
     749             :     }
     750             : }
     751             : 
     752             : /*
     753             :  * Wrapper of SlruInternalWritePage, for external callers.
     754             :  * fdata is always passed as NULL here, i.e. this is a standalone write.
     755             :  */
     756             : void
     757         628 : SimpleLruWritePage(SlruCtl ctl, int slotno)
     758             : {
     759             :     Assert(ctl->shared->page_status[slotno] != SLRU_PAGE_EMPTY);
     760             : 
     761         628 :     SlruInternalWritePage(ctl, slotno, NULL);
     762         628 : }
     763             : 
     764             : /*
     765             :  * Return whether the given page exists on disk.
     766             :  *
     767             :  * A false return means that the file does not exist, is not large enough to
     768             :  * contain the given page, or failed to close (that error is not reported).
     769             :  */
     770             : bool
     771         214 : SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int64 pageno)
     772             : {
     773         214 :     int64       segno = pageno / SLRU_PAGES_PER_SEGMENT;
     774         214 :     int         rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
     775         214 :     int         offset = rpageno * BLCKSZ;
     776             :     char        path[MAXPGPATH];
     777             :     int         fd;
     778             :     bool        result;
     779             :     off_t       endpos;
     780             : 
     781             :     /* update the stats counter of checked pages */
     782         214 :     pgstat_count_slru_blocks_exists(ctl->shared->slru_stats_idx);
     783             : 
     784         214 :     SlruFileName(ctl, path, segno);
     785             : 
     786         214 :     fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
     787         214 :     if (fd < 0)
     788             :     {
     789             :         /* expected: file doesn't exist */
     790          52 :         if (errno == ENOENT)
     791          52 :             return false;
     792             : 
     793             :         /* any other open failure is reported as an error */
     794           0 :         slru_errcause = SLRU_OPEN_FAILED;
     795           0 :         slru_errno = errno;
     796           0 :         SlruReportIOError(ctl, pageno, 0);
     797             :     }
     798             : 
     799         162 :     if ((endpos = lseek(fd, 0, SEEK_END)) < 0)
     800             :     {
     801           0 :         slru_errcause = SLRU_SEEK_FAILED;
     802           0 :         slru_errno = errno;
     803           0 :         SlruReportIOError(ctl, pageno, 0);
     804             :     }
     805             : 
     806         162 :     result = endpos >= (off_t) (offset + BLCKSZ); /* file must cover the whole page */
     807             : 
     808         162 :     if (CloseTransientFile(fd) != 0)
     809             :     {
     810           0 :         slru_errcause = SLRU_CLOSE_FAILED;
     811           0 :         slru_errno = errno;
     812           0 :         return false; /* NOTE(review): cause is recorded but SlruReportIOError is not called -- confirm intended */
     813             :     }
     814             : 
     815         162 :     return result;
     816             : }
     817             : 
     818             : /*
     819             :  * Physical read of a (previously existing) page into a buffer slot
     820             :  *
     821             :  * On failure, we cannot just ereport(ERROR) since caller has put state in
     822             :  * shared memory that must be undone.  So, we return false (true otherwise)
     823             :  * and save enough info in static variables for SlruReportIOError's report.
     824             :  *
     825             :  * For now, assume it's not worth keeping a file pointer open across
     826             :  * read/write operations.  We could cache one virtual file pointer ...
     827             :  */
     828             : static bool
     829        4086 : SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
     830             : {
     831        4086 :     SlruShared  shared = ctl->shared;
     832        4086 :     int64       segno = pageno / SLRU_PAGES_PER_SEGMENT;
     833        4086 :     int         rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
     834        4086 :     off_t       offset = rpageno * BLCKSZ;
     835             :     char        path[MAXPGPATH];
     836             :     int         fd;
     837             : 
     838        4086 :     SlruFileName(ctl, path, segno);
     839             : 
     840             :     /*
     841             :      * In a crash-and-restart situation, it's possible for us to receive
     842             :      * commands to set the commit status of transactions whose bits are in
     843             :      * already-truncated segments of the commit log (see notes in
     844             :      * SlruPhysicalWritePage).  Hence, if we are InRecovery, allow the case
     845             :      * where the file doesn't exist, and return zeroes instead.
     846             :      */
     847        4086 :     fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
     848        4086 :     if (fd < 0)
     849             :     {
     850           0 :         if (errno != ENOENT || !InRecovery)
     851             :         {
     852           0 :             slru_errcause = SLRU_OPEN_FAILED;
     853           0 :             slru_errno = errno;
     854           0 :             return false;
     855             :         }
     856             : 
     857           0 :         ereport(LOG,
     858             :                 (errmsg("file \"%s\" doesn't exist, reading as zeroes",
     859             :                         path)));
     860           0 :         MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
     861           0 :         return true;
     862             :     }
     863             : 
     864        4086 :     errno = 0; /* lets SlruReportIOError tell a short read (errno still 0) from an error */
     865        4086 :     pgstat_report_wait_start(WAIT_EVENT_SLRU_READ);
     866        4086 :     if (pg_pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
     867             :     {
     868           0 :         pgstat_report_wait_end();
     869           0 :         slru_errcause = SLRU_READ_FAILED;
     870           0 :         slru_errno = errno;
     871           0 :         CloseTransientFile(fd);
     872           0 :         return false;
     873             :     }
     874        4086 :     pgstat_report_wait_end();
     875             : 
     876        4086 :     if (CloseTransientFile(fd) != 0)
     877             :     {
     878           0 :         slru_errcause = SLRU_CLOSE_FAILED;
     879           0 :         slru_errno = errno;
     880           0 :         return false;
     881             :     }
     882             : 
     883        4086 :     return true;
     884             : }
     885             : 
     886             : /*
     887             :  * Physical write of a page from a buffer slot
     888             :  *
     889             :  * On failure, we cannot just ereport(ERROR) since caller has put state in
     890             :  * shared memory that must be undone.  So, we return false and save enough
     891             :  * info in static variables to let SlruReportIOError make the report.
     892             :  *
     893             :  * For now, assume it's not worth keeping a file pointer open across
     894             :  * independent read/write operations.  We do batch operations during
     895             :  * SimpleLruWriteAll, though.
     896             :  *
     897             :  * fdata is NULL for a standalone write, pointer to open-file info during
     898             :  * SimpleLruWriteAll (files recorded there are left open on success).
     899             :  */
     900             : static bool
     901    14682176 : SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
     902             : {
     903    14682176 :     SlruShared  shared = ctl->shared;
     904    14682176 :     int64       segno = pageno / SLRU_PAGES_PER_SEGMENT;
     905    14682176 :     int         rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
     906    14682176 :     off_t       offset = rpageno * BLCKSZ;
     907             :     char        path[MAXPGPATH];
     908    14682176 :     int         fd = -1;
     909             : 
     910             :     /* update the stats counter of written pages */
     911    14682176 :     pgstat_count_slru_blocks_written(shared->slru_stats_idx);
     912             : 
     913             :     /*
     914             :      * Honor the write-WAL-before-data rule, if appropriate, so that we do not
     915             :      * write out data before associated WAL records.  This is the same action
     916             :      * performed during FlushBuffer() in the main buffer manager.
     917             :      */
     918    14682176 :     if (shared->group_lsn != NULL)
     919             :     {
     920             :         /*
     921             :          * We must determine the largest async-commit LSN for the page. This
     922             :          * is a bit tedious, but since this entire function is a slow path
     923             :          * anyway, it seems better to do this here than to maintain a per-page
     924             :          * LSN variable (which'd need an extra comparison in the
     925             :          * transaction-commit path).
     926             :          */
     927             :         XLogRecPtr  max_lsn;
     928             :         int         lsnindex;
     929             : 
     930      865936 :         lsnindex = slotno * shared->lsn_groups_per_page;
     931      865936 :         max_lsn = shared->group_lsn[lsnindex++];
     932   886718464 :         for (int lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
     933             :         {
     934   885852528 :             XLogRecPtr  this_lsn = shared->group_lsn[lsnindex++];
     935             : 
     936   885852528 :             if (max_lsn < this_lsn)
     937       82254 :                 max_lsn = this_lsn;
     938             :         }
     939             : 
     940      865936 :         if (!XLogRecPtrIsInvalid(max_lsn))
     941             :         {
     942             :             /*
     943             :              * As noted above, elog(ERROR) is not acceptable here, so if
     944             :              * XLogFlush were to fail, we must PANIC.  This isn't much of a
     945             :              * restriction because XLogFlush is just about all critical
     946             :              * section anyway, but let's make sure.
     947             :              */
     948        1006 :             START_CRIT_SECTION();
     949        1006 :             XLogFlush(max_lsn);
     950        1006 :             END_CRIT_SECTION();
     951             :         }
     952             :     }
     953             : 
     954             :     /*
     955             :      * During a SimpleLruWriteAll, we may already have the desired file open.
     956             :      */
     957    14682176 :     if (fdata)
     958             :     {
     959        5774 :         for (int i = 0; i < fdata->num_files; i++)
     960             :         {
     961         778 :             if (fdata->segno[i] == segno)
     962             :             {
     963         554 :                 fd = fdata->fd[i];
     964         554 :                 break;
     965             :             }
     966             :         }
     967             :     }
     968             : 
     969    14682176 :     if (fd < 0)
     970             :     {
     971             :         /*
     972             :          * If the file doesn't already exist, we should create it.  It is
     973             :          * possible for this to need to happen when writing a page that's not
     974             :          * first in its segment; we assume the OS can cope with that. (Note:
     975             :          * it might seem that it'd be okay to create files only when
     976             :          * SimpleLruZeroPage is called for the first page of a segment.
     977             :          * However, if after a crash and restart the REDO logic elects to
     978             :          * replay the log from a checkpoint before the latest one, then it's
     979             :          * possible that we will get commands to set transaction status of
     980             :          * transactions that have already been truncated from the commit log.
     981             :          * Easiest way to deal with that is to accept references to
     982             :          * nonexistent files here and in SlruPhysicalReadPage.)
     983             :          *
     984             :          * Note: it is possible for more than one backend to be executing this
     985             :          * code simultaneously for different pages of the same file. Hence,
     986             :          * don't use O_EXCL or O_TRUNC or anything like that.
     987             :          */
     988    14681622 :         SlruFileName(ctl, path, segno);
     989    14681622 :         fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY);
     990    14681622 :         if (fd < 0)
     991             :         {
     992           0 :             slru_errcause = SLRU_OPEN_FAILED;
     993           0 :             slru_errno = errno;
     994           0 :             return false;
     995             :         }
     996             : 
     997    14681622 :         if (fdata)
     998             :         {
     999        4996 :             if (fdata->num_files < MAX_WRITEALL_BUFFERS)
    1000             :             {
    1001        4996 :                 fdata->fd[fdata->num_files] = fd;
    1002        4996 :                 fdata->segno[fdata->num_files] = segno;
    1003        4996 :                 fdata->num_files++;
    1004             :             }
    1005             :             else
    1006             :             {
    1007             :                 /*
    1008             :                  * In the unlikely event that we exceed MAX_WRITEALL_BUFFERS,
    1009             :                  * fall back to treating it as a standalone write.
    1010             :                  */
    1011           0 :                 fdata = NULL;
    1012             :             }
    1013             :         }
    1014             :     }
    1015             : 
    1016    14682176 :     errno = 0;
    1017    14682176 :     pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
    1018    14682176 :     if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
    1019             :     {
    1020           0 :         pgstat_report_wait_end();
    1021             :         /* if write didn't set errno, assume problem is no disk space */
    1022           0 :         if (errno == 0)
    1023           0 :             errno = ENOSPC;
    1024           0 :         slru_errcause = SLRU_WRITE_FAILED;
    1025           0 :         slru_errno = errno;
    1026           0 :         if (!fdata) /* files recorded in fdata are closed by the caller on failure */
    1027           0 :             CloseTransientFile(fd);
    1028           0 :         return false;
    1029             :     }
    1030    14682176 :     pgstat_report_wait_end();
    1031             : 
    1032             :     /* Queue up a sync request for the checkpointer. */
    1033    14682176 :     if (ctl->sync_handler != SYNC_HANDLER_NONE)
    1034             :     {
    1035             :         FileTag     tag;
    1036             : 
    1037      867448 :         INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
    1038      867448 :         if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
    1039             :         {
    1040             :             /* No space to enqueue sync request.  Do it synchronously. */
    1041          24 :             pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC);
    1042          24 :             if (pg_fsync(fd) != 0)
    1043             :             {
    1044           0 :                 pgstat_report_wait_end();
    1045           0 :                 slru_errcause = SLRU_FSYNC_FAILED;
    1046           0 :                 slru_errno = errno;
    1047           0 :                 CloseTransientFile(fd); /* NOTE(review): not guarded by !fdata, unlike the write-failure path above; fd may be closed again by the caller -- confirm */
    1048           0 :                 return false;
    1049             :             }
    1050          24 :             pgstat_report_wait_end();
    1051             :         }
    1052             :     }
    1053             : 
    1054             :     /* Close file, unless part of flush request. */
    1055    14682176 :     if (!fdata)
    1056             :     {
    1057    14676626 :         if (CloseTransientFile(fd) != 0)
    1058             :         {
    1059           0 :             slru_errcause = SLRU_CLOSE_FAILED;
    1060           0 :             slru_errno = errno;
    1061           0 :             return false;
    1062             :         }
    1063             :     }
    1064             : 
    1065    14682176 :     return true;
    1066             : }
    1067             : 
    1068             : /*
    1069             :  * Issue the error message after failure of SlruPhysicalReadPage or
    1070             :  * SlruPhysicalWritePage.  Call this after cleaning up shared-memory state.
    1071             :  */
    1072             : static void
    1073           0 : SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid)
    1074             : {
    1075           0 :     int64       segno = pageno / SLRU_PAGES_PER_SEGMENT;
    1076           0 :     int         rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
    1077           0 :     int         offset = rpageno * BLCKSZ;
    1078             :     char        path[MAXPGPATH];
    1079             : 
    1080           0 :     SlruFileName(ctl, path, segno);
    1081           0 :     errno = slru_errno;
    1082           0 :     switch (slru_errcause)
    1083             :     {
    1084           0 :         case SLRU_OPEN_FAILED:
    1085           0 :             ereport(ERROR,
    1086             :                     (errcode_for_file_access(),
    1087             :                      errmsg("could not access status of transaction %u", xid),
    1088             :                      errdetail("Could not open file \"%s\": %m.", path)));
    1089             :             break;
    1090           0 :         case SLRU_SEEK_FAILED:
    1091           0 :             ereport(ERROR,
    1092             :                     (errcode_for_file_access(),
    1093             :                      errmsg("could not access status of transaction %u", xid),
    1094             :                      errdetail("Could not seek in file \"%s\" to offset %d: %m.",
    1095             :                                path, offset)));
    1096             :             break;
    1097           0 :         case SLRU_READ_FAILED:
    1098           0 :             if (errno)
    1099           0 :                 ereport(ERROR,
    1100             :                         (errcode_for_file_access(),
    1101             :                          errmsg("could not access status of transaction %u", xid),
    1102             :                          errdetail("Could not read from file \"%s\" at offset %d: %m.",
    1103             :                                    path, offset)));
    1104             :             else
    1105           0 :                 ereport(ERROR,
    1106             :                         (errmsg("could not access status of transaction %u", xid),
    1107             :                          errdetail("Could not read from file \"%s\" at offset %d: read too few bytes.", path, offset)));
    1108             :             break;
    1109           0 :         case SLRU_WRITE_FAILED:
    1110           0 :             if (errno)
    1111           0 :                 ereport(ERROR,
    1112             :                         (errcode_for_file_access(),
    1113             :                          errmsg("could not access status of transaction %u", xid),
    1114             :                          errdetail("Could not write to file \"%s\" at offset %d: %m.",
    1115             :                                    path, offset)));
    1116             :             else
    1117           0 :                 ereport(ERROR,
    1118             :                         (errmsg("could not access status of transaction %u", xid),
    1119             :                          errdetail("Could not write to file \"%s\" at offset %d: wrote too few bytes.",
    1120             :                                    path, offset)));
    1121             :             break;
    1122           0 :         case SLRU_FSYNC_FAILED:
    1123           0 :             ereport(data_sync_elevel(ERROR),
    1124             :                     (errcode_for_file_access(),
    1125             :                      errmsg("could not access status of transaction %u", xid),
    1126             :                      errdetail("Could not fsync file \"%s\": %m.",
    1127             :                                path)));
    1128           0 :             break;
    1129           0 :         case SLRU_CLOSE_FAILED:
    1130           0 :             ereport(ERROR,
    1131             :                     (errcode_for_file_access(),
    1132             :                      errmsg("could not access status of transaction %u", xid),
    1133             :                      errdetail("Could not close file \"%s\": %m.",
    1134             :                                path)));
    1135             :             break;
    1136           0 :         default:
    1137             :             /* can't get here, we trust: slru_errcause should match a case above */
    1138           0 :             elog(ERROR, "unrecognized SimpleLru error cause: %d",
    1139             :                  (int) slru_errcause);
    1140             :             break;
    1141             :     }
    1142           0 : }
    1143             : 
    1144             : /*
    1145             :  * Mark a buffer slot "most recently used".
                     :  *
                     :  * slotno must refer to a non-EMPTY slot.  Only the LRU counter of the
                     :  * slot's bank and the slot's own page_lru_count entry are modified, and
                     :  * only when the two currently differ.
    1146             :  */
    1147             : static inline void
    1148    16386818 : SlruRecentlyUsed(SlruShared shared, int slotno)
    1149             : {
    1150    16386818 :     int         bankno = SlotGetBankNumber(slotno);
    1151    16386818 :     int         new_lru_count = shared->bank_cur_lru_count[bankno];
    1152             : 
    1153             :     Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
    1154             : 
    1155             :     /*
    1156             :      * The reason for the if-test is that there are often many consecutive
    1157             :      * accesses to the same page (particularly the latest page).  By
    1158             :      * suppressing useless increments of bank_cur_lru_count, we reduce the
    1159             :      * probability that old pages' counts will "wrap around" and make them
    1160             :      * appear recently used.
    1161             :      *
    1162             :      * We allow this code to be executed concurrently by multiple processes
    1163             :      * within SimpleLruReadPage_ReadOnly().  As long as int reads and writes
    1164             :      * are atomic, this should not cause any completely-bogus values to enter
    1165             :      * the computation.  However, it is possible for either bank_cur_lru_count
    1166             :      * or individual page_lru_count entries to be "reset" to lower values than
    1167             :      * they should have, in case a process is delayed while it executes this
    1168             :      * function.  With care in SlruSelectLRUPage(), this does little harm, and
    1169             :      * in any case the absolute worst possible consequence is a nonoptimal
    1170             :      * choice of page to evict.  The gain from allowing concurrent reads of
    1171             :      * SLRU pages seems worth it.
    1172             :      */
    1173    16386818 :     if (new_lru_count != shared->page_lru_count[slotno])
    1174             :     {
    1175    14683566 :         shared->bank_cur_lru_count[bankno] = ++new_lru_count;
    1176    14683566 :         shared->page_lru_count[slotno] = new_lru_count;
    1177             :     }
    1178    16386818 : }
    1179             : 
    1180             : /*
    1181             :  * Select the slot to re-use when we need a free slot for the given page.
    1182             :  *
    1183             :  * The target page number is passed not only because we need to know the
    1184             :  * correct bank to use, but also because we need to consider the possibility
    1185             :  * that some other process reads in the target page while we are doing I/O to
    1186             :  * free a slot.  Hence, check or recheck to see if any slot already holds the
    1187             :  * target page, and return that slot if so.  Thus, the returned slot is
    1188             :  * *either* a slot already holding the pageno (could be any state except
    1189             :  * EMPTY), *or* a freeable slot (state EMPTY or CLEAN).
    1190             :  *
    1191             :  * The correct bank lock must be held at entry, and will be held at exit.
                     :  *
                     :  * Returns the number of the selected slot, which always lies within the
                     :  * bank that pageno maps to.
    1192             :  */
    1193             : static int
    1194    15007598 : SlruSelectLRUPage(SlruCtl ctl, int64 pageno)
    1195             : {
    1196    15007598 :     SlruShared  shared = ctl->shared;
    1197             : 
    1198             :     /* Outer loop handles restart after I/O */
    1199             :     for (;;)
    1200    14675808 :     {
    1201             :         int         cur_count;
    1202    29683406 :         int         bestvalidslot = 0;  /* keep compiler quiet */
    1203    29683406 :         int         best_valid_delta = -1;
    1204    29683406 :         int64       best_valid_page_number = 0; /* keep compiler quiet */
    1205    29683406 :         int         bestinvalidslot = 0;    /* keep compiler quiet */
    1206    29683406 :         int         best_invalid_delta = -1;
    1207    29683406 :         int64       best_invalid_page_number = 0;   /* keep compiler quiet */
    1208    29683406 :         int         bankno = pageno % ctl->nbanks;
    1209    29683406 :         int         bankstart = bankno * SLRU_BANK_SIZE;
    1210    29683406 :         int         bankend = bankstart + SLRU_BANK_SIZE;
    1211             : 
    1212             :         Assert(LWLockHeldByMe(SimpleLruGetBankLock(ctl, pageno)));
    1213             : 
    1214             :         /* See if page already has a buffer assigned */
    1215   499427388 :         for (int slotno = bankstart; slotno < bankend; slotno++)
    1216             :         {
    1217   470068600 :             if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
    1218   469971860 :                 shared->page_number[slotno] == pageno)
    1219      324618 :                 return slotno;
    1220             :         }
    1221             : 
    1222             :         /*
    1223             :          * If we find any EMPTY slot, just select that one. Else choose a
    1224             :          * victim page to replace.  We normally take the least recently used
    1225             :          * valid page, but we will never take the slot containing
    1226             :          * latest_page_number, even if it appears least recently used.  We
    1227             :          * will select a slot that is already I/O busy only if there is no
    1228             :          * other choice: a read-busy slot will not be least recently used once
    1229             :          * the read finishes, and waiting for an I/O on a write-busy slot is
    1230             :          * inferior to just picking some other slot.  Testing shows the slot
    1231             :          * we pick instead will often be clean, allowing us to begin a read at
    1232             :          * once.
    1233             :          *
    1234             :          * Normally the page_lru_count values will all be different and so
    1235             :          * there will be a well-defined LRU page.  But since we allow
    1236             :          * concurrent execution of SlruRecentlyUsed() within
    1237             :          * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages
    1238             :          * acquire the same lru_count values.  In that case we break ties by
    1239             :          * choosing the furthest-back page.
    1240             :          *
    1241             :          * Notice that this next line forcibly advances bank_cur_lru_count to a
    1242             :          * value that is certainly beyond any value that will be in the
    1243             :          * page_lru_count array after the loop finishes.  This ensures that
    1244             :          * the next execution of SlruRecentlyUsed will mark the page newly
    1245             :          * used, even if it's for a page that has the current counter value.
    1246             :          * That gets us back on the path to having good data when there are
    1247             :          * multiple pages with the same lru_count.
    1248             :          */
    1249    29358788 :         cur_count = (shared->bank_cur_lru_count[bankno])++;
    1250   499002764 :         for (int slotno = bankstart; slotno < bankend; slotno++)
    1251             :         {
    1252             :             int         this_delta;
    1253             :             int64       this_page_number;
    1254             : 
    1255   469650242 :             if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
    1256        6266 :                 return slotno;
    1257             : 
    1258   469643976 :             this_delta = cur_count - shared->page_lru_count[slotno];
    1259   469643976 :             if (this_delta < 0)
    1260             :             {
    1261             :                 /*
    1262             :                  * Clean up in case shared updates have caused cur_count
    1263             :                  * increments to get "lost".  We back off the page counts,
    1264             :                  * rather than trying to increase cur_count, to avoid any
    1265             :                  * question of infinite loops or failure in the presence of
    1266             :                  * wrapped-around counts.
    1267             :                  */
    1268           0 :                 shared->page_lru_count[slotno] = cur_count;
    1269           0 :                 this_delta = 0;
    1270             :             }
    1271             : 
    1272             :             /*
    1273             :              * If this page is the one most recently zeroed, don't consider it
    1274             :              * an eviction candidate. See comments in SimpleLruZeroPage for an
    1275             :              * explanation about the lack of a memory barrier here.
    1276             :              */
    1277   469643976 :             this_page_number = shared->page_number[slotno];
    1278   469643976 :             if (this_page_number ==
    1279   469643976 :                 pg_atomic_read_u64(&shared->latest_page_number))
    1280        1024 :                 continue;
    1281             : 
    1282   469642952 :             if (shared->page_status[slotno] == SLRU_PAGE_VALID)
    1283             :             {
    1284   469641372 :                 if (this_delta > best_valid_delta ||
    1285           0 :                     (this_delta == best_valid_delta &&
    1286           0 :                      ctl->PagePrecedes(this_page_number,
    1287             :                                        best_valid_page_number)))
    1288             :                 {
    1289    75786974 :                     bestvalidslot = slotno;
    1290    75786974 :                     best_valid_delta = this_delta;
    1291    75786974 :                     best_valid_page_number = this_page_number;
    1292             :                 }
    1293             :             }
    1294             :             else
    1295             :             {
    1296        1580 :                 if (this_delta > best_invalid_delta ||
    1297           0 :                     (this_delta == best_invalid_delta &&
    1298           0 :                      ctl->PagePrecedes(this_page_number,
    1299             :                                        best_invalid_page_number)))
    1300             :                 {
    1301        1580 :                     bestinvalidslot = slotno;
    1302        1580 :                     best_invalid_delta = this_delta;
    1303        1580 :                     best_invalid_page_number = this_page_number;
    1304             :                 }
    1305             :             }
    1306             :         }
    1307             : 
    1308             :         /*
    1309             :          * If all pages (except possibly the latest one) are I/O busy, we'll
    1310             :          * have to wait for an I/O to complete and then retry.  In that
    1311             :          * unhappy case, we choose to wait for the I/O on the least recently
    1312             :          * used slot, on the assumption that it was likely initiated first of
    1313             :          * all the I/Os in progress and may therefore finish first.
    1314             :          */
    1315    29352522 :         if (best_valid_delta < 0)
    1316             :         {
    1317           0 :             SimpleLruWaitIO(ctl, bestinvalidslot);
    1318           0 :             continue;
    1319             :         }
    1320             : 
    1321             :         /*
    1322             :          * If the selected page is clean, we're set.
    1323             :          */
    1324    29352522 :         if (!shared->page_dirty[bestvalidslot])
    1325    14676714 :             return bestvalidslot;
    1326             : 
    1327             :         /*
    1328             :          * Write the page.
    1329             :          */
    1330    14675808 :         SlruInternalWritePage(ctl, bestvalidslot, NULL);
    1331             : 
    1332             :         /*
    1333             :          * Now loop back and try again.  This is the easiest way of dealing
    1334             :          * with corner cases such as the victim page being re-dirtied while we
    1335             :          * wrote it.
    1336             :          */
    1337             :     }
    1338             : }
    1339             : 
    1340             : /*
    1341             :  * Write dirty pages to disk during checkpoint or database shutdown.  Flushing
    1342             :  * is deferred until the next call to ProcessSyncRequests(), though we do fsync
    1343             :  * the containing directory here to make sure that newly created directory
    1344             :  * entries are on disk.
                     :  *
                     :  * allow_redirtied only affects assertion checking: when it is false, each
                     :  * non-empty slot is asserted to be EMPTY or VALID-and-clean right after it
                     :  * has been written.
                     :  *
                     :  * Bank locks are taken one at a time, in slot order; none is held on return.
    1345             :  */
    1346             : void
    1347       17130 : SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied)
    1348             : {
    1349       17130 :     SlruShared  shared = ctl->shared;
    1350             :     SlruWriteAllData fdata;
    1351       17130 :     int64       pageno = 0;
    1352       17130 :     int         prevbank = SlotGetBankNumber(0);
    1353             :     bool        ok;
    1354             : 
    1355             :     /* update the stats counter of flushes */
    1356       17130 :     pgstat_count_slru_flush(shared->slru_stats_idx);
    1357             : 
    1358             :     /*
    1359             :      * Find and write dirty pages
    1360             :      */
    1361       17130 :     fdata.num_files = 0;
    1362             : 
    1363       17130 :     LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
    1364             : 
    1365      415498 :     for (int slotno = 0; slotno < shared->num_slots; slotno++)
    1366             :     {
    1367      398368 :         int         curbank = SlotGetBankNumber(slotno);
    1368             : 
    1369             :         /*
    1370             :          * If the current bank lock is not the same as the previous bank lock
    1371             :          * then release the previous lock and acquire the new lock.
    1372             :          */
    1373      398368 :         if (curbank != prevbank)
    1374             :         {
    1375        7768 :             LWLockRelease(&shared->bank_locks[prevbank].lock);
    1376        7768 :             LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
    1377        7768 :             prevbank = curbank;
    1378             :         }
    1379             : 
    1380             :         /* Do nothing if slot is unused */
    1381      398368 :         if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
    1382      387310 :             continue;
    1383             : 
    1384       11058 :         SlruInternalWritePage(ctl, slotno, &fdata);
    1385             : 
    1386             :         /*
    1387             :          * In some places (e.g. checkpoints), we cannot assert that the slot
    1388             :          * is clean now, since another process might have re-dirtied it
    1389             :          * already.  That's okay.
    1390             :          */
    1391             :         Assert(allow_redirtied ||
    1392             :                shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
    1393             :                (shared->page_status[slotno] == SLRU_PAGE_VALID &&
    1394             :                 !shared->page_dirty[slotno]));
    1395             :     }
    1396             : 
    1397       17130 :     LWLockRelease(&shared->bank_locks[prevbank].lock);
    1398             : 
    1399             :     /*
    1400             :      * Now close any files that were open.  Every file is closed even if an
    1401             :      * earlier close failed; the error is reported once, afterwards, for the
    1402             :      * last file whose close failed.
    1403             :      */
    1402       17130 :     ok = true;
    1403       22126 :     for (int i = 0; i < fdata.num_files; i++)
    1404             :     {
    1405        4996 :         if (CloseTransientFile(fdata.fd[i]) != 0)
    1406             :         {
    1407           0 :             slru_errcause = SLRU_CLOSE_FAILED;
    1408           0 :             slru_errno = errno;
    1409           0 :             pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
    1410           0 :             ok = false;
    1411             :         }
    1412             :     }
    1413       17130 :     if (!ok)
    1414           0 :         SlruReportIOError(ctl, pageno, InvalidTransactionId);
    1415             : 
    1416             :     /* Ensure that directory entries for new files are on disk. */
    1417       17130 :     if (ctl->sync_handler != SYNC_HANDLER_NONE)
    1418       13712 :         fsync_fname(ctl->Dir, true);
    1419       17130 : }
    1420             : 
    1421             : /*
    1422             :  * Remove all segments before the one holding the passed page number
    1423             :  *
    1424             :  * All SLRUs prevent concurrent calls to this function, either with an LWLock
    1425             :  * or by calling it only as part of a checkpoint.  Mutual exclusion must begin
    1426             :  * before computing cutoffPage.  Mutual exclusion must end after any limit
    1427             :  * update that would permit other backends to write fresh data into the
    1428             :  * segment immediately preceding the one containing cutoffPage.  Otherwise,
    1429             :  * when the SLRU is quite full, SimpleLruTruncate() might delete that segment
    1430             :  * after it has accrued freshly-written data.
                     :  *
                     :  * If the latest page itself precedes cutoffPage (per ctl->PagePrecedes), a
                     :  * LOG message about apparent wraparound is emitted and nothing is removed.
    1431             :  */
    1432             : void
    1433        3562 : SimpleLruTruncate(SlruCtl ctl, int64 cutoffPage)
    1434             : {
    1435        3562 :     SlruShared  shared = ctl->shared;
    1436             :     int         prevbank;
    1437             : 
    1438             :     /* update the stats counter of truncates */
    1439        3562 :     pgstat_count_slru_truncate(shared->slru_stats_idx);
    1440             : 
    1441             :     /*
    1442             :      * Scan shared memory and remove any pages preceding the cutoff page, to
    1443             :      * ensure we won't rewrite them later.  (Since this is normally called in
    1444             :      * or just after a checkpoint, any dirty pages should have been flushed
    1445             :      * already ... we're just being extra careful here.)
    1446             :      */
    1447        3752 : restart:
    1448             : 
    1449             :     /*
    1450             :      * An important safety check: the current endpoint page must not be
    1451             :      * eligible for removal.  This check is just a backstop against wraparound
    1452             :      * bugs elsewhere in SLRU handling, so we don't care if we read a slightly
    1453             :      * outdated value; therefore we don't add a memory barrier.
    1454             :      */
    1455        3752 :     if (ctl->PagePrecedes(pg_atomic_read_u64(&shared->latest_page_number),
    1456             :                           cutoffPage))
    1457             :     {
    1458           0 :         ereport(LOG,
    1459             :                 (errmsg("could not truncate directory \"%s\": apparent wraparound",
    1460             :                         ctl->Dir)));
    1461           0 :         return;
    1462             :     }
    1463             : 
    1464        3752 :     prevbank = SlotGetBankNumber(0);
    1465        3752 :     LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
    1466       89768 :     for (int slotno = 0; slotno < shared->num_slots; slotno++)
    1467             :     {
    1468       86206 :         int         curbank = SlotGetBankNumber(slotno);
    1469             : 
    1470             :         /*
    1471             :          * If the current bank lock is not the same as the previous bank lock
    1472             :          * then release the previous lock and acquire the new lock.
    1473             :          */
    1474       86206 :         if (curbank != prevbank)
    1475             :         {
    1476        1730 :             LWLockRelease(&shared->bank_locks[prevbank].lock);
    1477        1730 :             LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
    1478        1730 :             prevbank = curbank;
    1479             :         }
    1480             : 
    1481       86206 :         if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
    1482       75692 :             continue;
    1483       10514 :         if (!ctl->PagePrecedes(shared->page_number[slotno], cutoffPage))
    1484        9978 :             continue;
    1485             : 
    1486             :         /*
    1487             :          * If page is clean, just change state to EMPTY (expected case).
    1488             :          */
    1489         536 :         if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
    1490         536 :             !shared->page_dirty[slotno])
    1491             :         {
    1492         346 :             shared->page_status[slotno] = SLRU_PAGE_EMPTY;
    1493         346 :             continue;
    1494             :         }
    1495             : 
    1496             :         /*
    1497             :          * Hmm, we have (or may have) I/O operations acting on the page, so
    1498             :          * we've got to wait for them to finish and then start again. This is
    1499             :          * the same logic as in SlruSelectLRUPage.  (XXX if page is dirty,
    1500             :          * wouldn't it be OK to just discard it without writing it?
    1501             :          * SlruMayDeleteSegment() uses a stricter qualification, so we might
    1502             :          * not delete this page in the end; even if we don't delete it, we
    1503             :          * won't have cause to read its data again.  For now, keep the logic
    1504             :          * the same as it was.)
    1505             :          */
    1506         190 :         if (shared->page_status[slotno] == SLRU_PAGE_VALID)
    1507         190 :             SlruInternalWritePage(ctl, slotno, NULL);
    1508             :         else
    1509           0 :             SimpleLruWaitIO(ctl, slotno);
    1510             : 
    1511         190 :         LWLockRelease(&shared->bank_locks[prevbank].lock);
    1512         190 :         goto restart;
    1513             :     }
    1514             : 
    1515        3562 :     LWLockRelease(&shared->bank_locks[prevbank].lock);
    1516             : 
    1517             :     /* Now we can remove the old segment(s) */
    1518        3562 :     (void) SlruScanDirectory(ctl, SlruScanDirCbDeleteCutoff, &cutoffPage);
    1519             : }
    1520             : 
    1521             : /*
    1522             :  * Delete an individual SLRU segment.
    1523             :  *
    1524             :  * NB: This does not touch the SLRU buffers themselves, callers have to ensure
    1525             :  * they either can't yet contain anything, or have already been cleaned out.
    1526             :  */
    1527             : static void
    1528      284100 : SlruInternalDeleteSegment(SlruCtl ctl, int64 segno)
    1529             : {
    1530             :     char        path[MAXPGPATH];
    1531             : 
    1532             :     /* Forget any fsync requests queued for this segment. */
    1533      284100 :     if (ctl->sync_handler != SYNC_HANDLER_NONE)
    1534             :     {
    1535             :         FileTag     tag;
    1536             : 
    1537       26544 :         INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
    1538       26544 :         RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true);
    1539             :     }
    1540             : 
    1541             :     /* Unlink the file; the return value of unlink() is not checked. */
    1542      284100 :     SlruFileName(ctl, path, segno);
    1543      284100 :     ereport(DEBUG2, (errmsg_internal("removing file \"%s\"", path)));
    1544      284100 :     unlink(path);
    1545      284100 : }
    1546             : 
    1547             : /*
    1548             :  * Delete an individual SLRU segment, identified by the segment number.
                     :  *
                     :  * Buffer slots holding pages of the segment are emptied first.  A slot that
                     :  * is not simply VALID and clean is written out (if VALID) or waited for
                     :  * (otherwise), after which the whole scan is repeated.  The file is removed
                     :  * while the bank lock acquired last is still held.
    1549             :  */
    1550             : void
    1551           4 : SlruDeleteSegment(SlruCtl ctl, int64 segno)
    1552             : {
    1553           4 :     SlruShared  shared = ctl->shared;
    1554           4 :     int         prevbank = SlotGetBankNumber(0);
    1555             :     bool        did_write;
    1556             : 
    1557             :     /* Clean out any possibly existing references to the segment. */
    1558           4 :     LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
    1559           4 : restart:
    1560           4 :     did_write = false;
    1561          68 :     for (int slotno = 0; slotno < shared->num_slots; slotno++)
    1562             :     {
    1563             :         int64       pagesegno;
    1564          64 :         int         curbank = SlotGetBankNumber(slotno);
    1565             : 
    1566             :         /*
    1567             :          * If the current bank lock is not the same as the previous bank lock
    1568             :          * then release the previous lock and acquire the new lock.
    1569             :          */
    1570          64 :         if (curbank != prevbank)
    1571             :         {
    1572           0 :             LWLockRelease(&shared->bank_locks[prevbank].lock);
    1573           0 :             LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
    1574           0 :             prevbank = curbank;
    1575             :         }
    1576             : 
    1577          64 :         if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
    1578           0 :             continue;
    1579             : 
    1580          64 :         pagesegno = shared->page_number[slotno] / SLRU_PAGES_PER_SEGMENT;
    1581             :         /* not the segment we're looking for */
    1582          64 :         if (pagesegno != segno)
    1583          14 :             continue;
    1584             : 
    1585             :         /* If page is clean, just change state to EMPTY (expected case). */
    1586          50 :         if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
    1587          50 :             !shared->page_dirty[slotno])
    1588             :         {
    1589          50 :             shared->page_status[slotno] = SLRU_PAGE_EMPTY;
    1590          50 :             continue;
    1591             :         }
    1592             : 
    1593             :         /* Same logic as SimpleLruTruncate() */
    1594           0 :         if (shared->page_status[slotno] == SLRU_PAGE_VALID)
    1595           0 :             SlruInternalWritePage(ctl, slotno, NULL);
    1596             :         else
    1597           0 :             SimpleLruWaitIO(ctl, slotno);
    1598             : 
    1599           0 :         did_write = true;
    1600             :     }
    1601             : 
    1602             :     /*
    1603             :      * Be extra careful and re-check. The IO functions release the bank
    1604             :      * lock, so new pages could have been read in.
    1605             :      */
    1606           4 :     if (did_write)
    1607           0 :         goto restart;
    1608             : 
    1609           4 :     SlruInternalDeleteSegment(ctl, segno);
    1610             : 
    1611           4 :     LWLockRelease(&shared->bank_locks[prevbank].lock);
    1612           4 : }
    1613             : 
    1614             : /*
    1615             :  * Determine whether a segment is okay to delete.
    1616             :  *
    1617             :  * segpage is the first page of the segment, and cutoffPage is the oldest (in
    1618             :  * PagePrecedes order) page in the SLRU containing still-useful data.  Since
    1619             :  * every core PagePrecedes callback implements "wrap around", check the
    1620             :  * segment's first and last pages:
    1621             :  *
    1622             :  * first<cutoff  && last<cutoff:  yes
    1623             :  * first<cutoff  && last>=cutoff: no; cutoff falls inside this segment
    1624             :  * first>=cutoff && last<cutoff:  no; wrap point falls inside this segment
    1625             :  * first>=cutoff && last>=cutoff: no; every page of this segment is too young
                     :  *
                     :  * Hence the result is true only when both the first and the last page of
                     :  * the segment precede cutoffPage.  segpage must be a multiple of
                     :  * SLRU_PAGES_PER_SEGMENT.
    1626             :  */
    1627             : static bool
    1628     2891066 : SlruMayDeleteSegment(SlruCtl ctl, int64 segpage, int64 cutoffPage)
    1629             : {
    1630     2891066 :     int64       seg_last_page = segpage + SLRU_PAGES_PER_SEGMENT - 1;
    1631             : 
    1632             :     Assert(segpage % SLRU_PAGES_PER_SEGMENT == 0);
    1633             : 
    1634     3176506 :     return (ctl->PagePrecedes(segpage, cutoffPage) &&
    1635      285440 :             ctl->PagePrecedes(seg_last_page, cutoffPage));
    1636             : }
    1637             : 
    1638             : #ifdef USE_ASSERT_CHECKING
    1639             : static void
    1640             : SlruPagePrecedesTestOffset(SlruCtl ctl, int per_page, uint32 offset)
    1641             : {
    1642             :     TransactionId lhs,
    1643             :                 rhs;
    1644             :     int64       newestPage,
    1645             :                 oldestPage;
    1646             :     TransactionId newestXact,
    1647             :                 oldestXact;
    1648             :     /* Assert-only checks of ctl->PagePrecedes for the key at "offset" within a page. */
    1649             :     /*
    1650             :      * Compare an XID pair having undefined order (see RFC 1982), a pair at
    1651             :      * "opposite ends" of the XID space.  TransactionIdPrecedes() treats each
    1652             :      * as preceding the other.  If RHS is oldestXact, LHS is the first XID we
    1653             :      * must not assign.
    1654             :      */
    1655             :     lhs = per_page + offset;    /* skip first page to avoid non-normal XIDs */
    1656             :     rhs = lhs + (1U << 31);     /* 2^31 away from lhs */
    1657             :     Assert(TransactionIdPrecedes(lhs, rhs));
    1658             :     Assert(TransactionIdPrecedes(rhs, lhs));
    1659             :     Assert(!TransactionIdPrecedes(lhs - 1, rhs));
    1660             :     Assert(TransactionIdPrecedes(rhs, lhs - 1));
    1661             :     Assert(TransactionIdPrecedes(lhs + 1, rhs));
    1662             :     Assert(!TransactionIdPrecedes(rhs, lhs + 1));
    1663             :     Assert(!TransactionIdFollowsOrEquals(lhs, rhs));
    1664             :     Assert(!TransactionIdFollowsOrEquals(rhs, lhs));
    1665             :     Assert(!ctl->PagePrecedes(lhs / per_page, lhs / per_page));
    1666             :     Assert(!ctl->PagePrecedes(lhs / per_page, rhs / per_page));
    1667             :     Assert(!ctl->PagePrecedes(rhs / per_page, lhs / per_page));
    1668             :     Assert(!ctl->PagePrecedes((lhs - per_page) / per_page, rhs / per_page));
    1669             :     Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 3 * per_page) / per_page));
    1670             :     Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 2 * per_page) / per_page));
    1671             :     Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 1 * per_page) / per_page)
    1672             :            || (1U << 31) % per_page != 0);    /* See CommitTsPagePrecedes() */
    1673             :     Assert(ctl->PagePrecedes((lhs + 1 * per_page) / per_page, rhs / per_page)
    1674             :            || (1U << 31) % per_page != 0);    /* same exemption as the assertion above */
    1675             :     Assert(ctl->PagePrecedes((lhs + 2 * per_page) / per_page, rhs / per_page));
    1676             :     Assert(ctl->PagePrecedes((lhs + 3 * per_page) / per_page, rhs / per_page));
    1677             :     Assert(!ctl->PagePrecedes(rhs / per_page, (lhs + per_page) / per_page));
    1678             : 
    1679             :     /*
    1680             :      * GetNewTransactionId() has assigned the last XID it can safely use, and
    1681             :      * that XID is in the *LAST* page of the second segment.  We must not
    1682             :      * delete that segment.
    1683             :      */
    1684             :     newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1;
    1685             :     newestXact = newestPage * per_page + offset;
    1686             :     Assert(newestXact / per_page == newestPage);
    1687             :     oldestXact = newestXact + 1;    /* the first XID we must not assign ... */
    1688             :     oldestXact -= 1U << 31;         /* ... sits 2^31 past oldestXact */
    1689             :     oldestPage = oldestXact / per_page;
    1690             :     Assert(!SlruMayDeleteSegment(ctl,
    1691             :                                  (newestPage -
    1692             :                                   newestPage % SLRU_PAGES_PER_SEGMENT),
    1693             :                                  oldestPage));
    1694             : 
    1695             :     /*
    1696             :      * GetNewTransactionId() has assigned the last XID it can safely use, and
    1697             :      * that XID is in the *FIRST* page of the second segment.  We must not
    1698             :      * delete that segment.
    1699             :      */
    1700             :     newestPage = SLRU_PAGES_PER_SEGMENT;
    1701             :     newestXact = newestPage * per_page + offset;
    1702             :     Assert(newestXact / per_page == newestPage);
    1703             :     oldestXact = newestXact + 1;    /* same derivation as in the previous case */
    1704             :     oldestXact -= 1U << 31;
    1705             :     oldestPage = oldestXact / per_page;
    1706             :     Assert(!SlruMayDeleteSegment(ctl,
    1707             :                                  (newestPage -
    1708             :                                   newestPage % SLRU_PAGES_PER_SEGMENT),
    1709             :                                  oldestPage));
    1710             : }
    1711             : 
    1712             : /*
    1713             :  * Unit-test ctl's PagePrecedes callback, with per_page keys per SLRU page.
    1714             :  *
    1715             :  * This assumes every uint32 >= FirstNormalTransactionId is a valid key.  It
    1716             :  * assumes each value occupies a contiguous, fixed-size region of SLRU bytes.
    1717             :  * (MultiXactMemberCtl separates flags from XIDs.  NotifyCtl has
    1718             :  * variable-length entries, no keys, and no random access.  These unit tests
    1719             :  * do not apply to them.)
    1720             :  */
    1721             : void
    1722             : SlruPagePrecedesUnitTests(SlruCtl ctl, int per_page)
    1723             : {
    1724             :     /* Test first, middle and last entries of a page. */
    1725             :     SlruPagePrecedesTestOffset(ctl, per_page, 0);
    1726             :     SlruPagePrecedesTestOffset(ctl, per_page, per_page / 2);
    1727             :     SlruPagePrecedesTestOffset(ctl, per_page, per_page - 1);
    1728             : }
    1729             : #endif
    1730             : 
    1731             : /*
    1732             :  * SlruScanDirectory callback ("data" points to an int64 cutoff page).
    1733             :  *      Returns true, stopping the scan, if this segment lies wholly prior to
    1734             :  *      the one containing that page; "filename" is not used.
    1735             :  */
    1736             : bool
    1737     2492168 : SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, int64 segpage,
    1738             :                             void *data)
    1739             : {
    1740     2492168 :     int64       cutoffPage = *(int64 *) data;
    1741             : 
    1742     2492168 :     if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
    1743         202 :         return true;            /* found one; don't iterate any more */
    1744             : 
    1745     2491966 :     return false;               /* keep going */
    1746             : }
    1747             : 
    1748             : /*
    1749             :  * SlruScanDirectory callback ("data" points to an int64 cutoff page).
    1750             :  *      Deletes the segment if SlruMayDeleteSegment() allows; scan continues.
    1751             :  */
    1752             : static bool
    1753      398898 : SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int64 segpage,
    1754             :                           void *data)
    1755             : {
    1756      398898 :     int64       cutoffPage = *(int64 *) data;
    1757             : 
    1758      398898 :     if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
    1759      284080 :         SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT);   /* page number -> segment number */
    1760             : 
    1761      398898 :     return false;               /* keep going */
    1762             : }
    1763             : 
    1764             : /*
    1765             :  * SlruScanDirectory callback.
    1766             :  *      Unconditionally deletes each segment; filename and data are unused.
    1767             :  */
    1768             : bool
    1769          16 : SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int64 segpage, void *data)
    1770             : {
    1771          16 :     SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT);   /* page number -> segment number */
    1772             : 
    1773          16 :     return false;               /* keep going */
    1774             : }
    1775             : 
    1776             : /*
    1777             :  * An internal function used by SlruScanDirectory().
    1778             :  *
    1779             :  * Returns true if a file with a name of a given length may be a correct
    1780             :  * SLRU segment: 15 characters with long_segment_names, otherwise 4 to 6.
    1781             :  */
    1782             : static inline bool
    1783     2916086 : SlruCorrectSegmentFilenameLength(SlruCtl ctl, size_t len)
    1784             : {
    1785     2916086 :     if (ctl->long_segment_names)
    1786        4372 :         return (len == 15);     /* see SlruFileName() */
    1787             :     else
    1788             : 
    1789             :         /*
    1790             :          * Commit 638cf09e76d allowed 5-character lengths. Later commit
    1791             :          * 73c986adde5 allowed 6-character length.
    1792             :          *
    1793             :          * Note: There is an ongoing plan to migrate all SLRUs to 64-bit page
    1794             :          * numbers, and the corresponding 15-character file names, which may
    1795             :          * eventually deprecate the support for 4, 5, and 6-character names.
    1796             :          */
    1797     2911714 :         return (len == 4 || len == 5 || len == 6);
    1798             : }
    1799             : 
    1800             : /*
    1801             :  * Scan the SimpleLru directory and apply a callback to each file found in it
    1802             :  * whose name looks like a segment (valid length, only uppercase hex digits).
    1803             :  * If the callback returns true, the scan is stopped.  The last return value
    1804             :  * from the callback is returned (false if it was never invoked).
    1805             :  *
    1806             :  * The callback receives the following arguments: 1. the SlruCtl struct for the
    1807             :  * slru being scanned; 2. the filename being considered; 3. the page number
    1808             :  * for the first page of that file; 4. a pointer to the opaque data given to us
    1809             :  * by the caller.
    1810             :  *
    1811             :  * Note that the ordering in which the directory is scanned is not guaranteed.
    1812             :  *
    1813             :  * Note that no locking is applied.
    1814             :  */
    1815             : bool
    1816       12562 : SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
    1817             : {
    1818       12562 :     bool        retval = false;
    1819             :     DIR        *cldir;
    1820             :     struct dirent *clde;
    1821             :     int64       segno;
    1822             :     int64       segpage;
    1823             : 
    1824       12562 :     cldir = AllocateDir(ctl->Dir);
    1825     2928446 :     while ((clde = ReadDir(cldir, ctl->Dir)) != NULL)
    1826             :     {
    1827             :         size_t      len;
    1828             : 
    1829     2916086 :         len = strlen(clde->d_name);
    1830             :         /* Skip any entry whose name is not a plausible segment file name. */
    1831     2916086 :         if (SlruCorrectSegmentFilenameLength(ctl, len) &&
    1832     2891082 :             strspn(clde->d_name, "0123456789ABCDEF") == len)
    1833             :         {
    1834     2891082 :             segno = strtoi64(clde->d_name, NULL, 16);   /* name was just verified to be all hex digits */
    1835     2891082 :             segpage = segno * SLRU_PAGES_PER_SEGMENT;   /* first page of this segment */
    1836             : 
    1837     2891082 :             elog(DEBUG2, "SlruScanDirectory invoking callback on %s/%s",
    1838             :                  ctl->Dir, clde->d_name);
    1839     2891082 :             retval = callback(ctl, clde->d_name, segpage, data);
    1840     2891082 :             if (retval)
    1841         202 :                 break;
    1842             :         }
    1843             :     }
    1844       12562 :     FreeDir(cldir);
    1845             : 
    1846       12562 :     return retval;
    1847             : }
    1848             : 
    1849             : /*
    1850             :  * Individual SLRUs (clog, ...) have to provide a sync.c handler function so
    1851             :  * that they can provide the correct "SlruCtl" (otherwise we don't know how to
    1852             :  * build the path), but they just forward to this common implementation that
    1853             :  * performs the fsync.  Returns -1 if the open fails, else pg_fsync()'s result.
    1854             :  */
    1855             : int
    1856           4 : SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path)
    1857             : {
    1858             :     int         fd;
    1859             :     int         save_errno;
    1860             :     int         result;
    1861             : 
    1862           4 :     SlruFileName(ctl, path, ftag->segno);   /* build the file name for ftag's segment into "path" */
    1863             : 
    1864           4 :     fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
    1865           4 :     if (fd < 0)
    1866           0 :         return -1;
    1867             : 
    1868           4 :     pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC);
    1869           4 :     result = pg_fsync(fd);
    1870           4 :     pgstat_report_wait_end();
    1871           4 :     save_errno = errno;     /* NOTE(review): read after pgstat_report_wait_end(); assumes that call leaves errno alone */
    1872             : 
    1873           4 :     CloseTransientFile(fd);
    1874             : 
    1875           4 :     errno = save_errno;     /* report the errno saved above, not one set by the close */
    1876           4 :     return result;
    1877             : }

Generated by: LCOV version 1.16