LCOV - code coverage report
Current view: top level - src/backend/access/transam - slru.c (source / functions) Hit Total Coverage
Test: PostgreSQL 13devel Lines: 279 443 63.0 %
Date: 2019-11-21 13:06:38 Functions: 19 22 86.4 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * slru.c
       4             :  *      Simple LRU buffering for transaction status logfiles
       5             :  *
       6             :  * We use a simple least-recently-used scheme to manage a pool of page
       7             :  * buffers.  Under ordinary circumstances we expect that write
       8             :  * traffic will occur mostly to the latest page (and to the just-prior
       9             :  * page, soon after a page transition).  Read traffic will probably touch
      10             :  * a larger span of pages, but in any case a fairly small number of page
      11             :  * buffers should be sufficient.  So, we just search the buffers using plain
      12             :  * linear search; there's no need for a hashtable or anything fancy.
      13             :  * The management algorithm is straight LRU except that we will never swap
      14             :  * out the latest page (since we know it's going to be hit again eventually).
      15             :  *
      16             :  * We use a control LWLock to protect the shared data structures, plus
      17             :  * per-buffer LWLocks that synchronize I/O for each buffer.  The control lock
      18             :  * must be held to examine or modify any shared state.  A process that is
      19             :  * reading in or writing out a page buffer does not hold the control lock,
      20             :  * only the per-buffer lock for the buffer it is working on.
      21             :  *
      22             :  * "Holding the control lock" means exclusive lock in all cases except for
      23             :  * SimpleLruReadPage_ReadOnly(); see comments for SlruRecentlyUsed() for
      24             :  * the implications of that.
      25             :  *
      26             :  * When initiating I/O on a buffer, we acquire the per-buffer lock exclusively
      27             :  * before releasing the control lock.  The per-buffer lock is released after
      28             :  * completing the I/O, re-acquiring the control lock, and updating the shared
      29             :  * state.  (Deadlock is not possible here, because we never try to initiate
      30             :  * I/O when someone else is already doing I/O on the same buffer.)
      31             :  * To wait for I/O to complete, release the control lock, acquire the
      32             :  * per-buffer lock in shared mode, immediately release the per-buffer lock,
      33             :  * reacquire the control lock, and then recheck state (since arbitrary things
      34             :  * could have happened while we didn't have the lock).
      35             :  *
      36             :  * As with the regular buffer manager, it is possible for another process
      37             :  * to re-dirty a page that is currently being written out.  This is handled
      38             :  * by re-setting the page's page_dirty flag.
      39             :  *
      40             :  *
      41             :  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
      42             :  * Portions Copyright (c) 1994, Regents of the University of California
      43             :  *
      44             :  * src/backend/access/transam/slru.c
      45             :  *
      46             :  *-------------------------------------------------------------------------
      47             :  */
      48             : #include "postgres.h"
      49             : 
      50             : #include <fcntl.h>
      51             : #include <sys/stat.h>
      52             : #include <unistd.h>
      53             : 
      54             : #include "access/slru.h"
      55             : #include "access/transam.h"
      56             : #include "access/xlog.h"
      57             : #include "miscadmin.h"
      58             : #include "pgstat.h"
      59             : #include "storage/fd.h"
      60             : #include "storage/shmem.h"
      61             : /*
                     :  * Format the path of SLRU segment file "seg" into "path": the control
                     :  * struct's Dir, a slash, and the segment number printed as at least four
                     :  * uppercase hex digits.  Output is bounded by MAXPGPATH, so "path" is
                     :  * assumed to be a buffer of at least that size.  The macro expands to the
                     :  * snprintf() call itself.
                     :  */
      62             : #define SlruFileName(ctl, path, seg) \
      63             :     snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg)
      64             : 
      65             : /*
      66             :  * During SimpleLruFlush(), we will usually not need to write/fsync more
      67             :  * than one or two physical files, but we may need to write several pages
      68             :  * per file.  We can consolidate the I/O requests by leaving files open
      69             :  * until control returns to SimpleLruFlush().  This data structure remembers
      70             :  * which files are open.
                     :  *
                     :  * num_files is the number of entries of fd[] and segno[] currently in use;
                     :  * entry i pairs an open file descriptor with the segment number it belongs
                     :  * to.  At most MAX_FLUSH_BUFFERS files can be remembered.  SlruFlush is a
                     :  * pointer to this struct.
      71             :  */
      72             : #define MAX_FLUSH_BUFFERS   16
      73             : 
      74             : typedef struct SlruFlushData
      75             : {
      76             :     int         num_files;      /* # files actually open */
      77             :     int         fd[MAX_FLUSH_BUFFERS];  /* their FD's */
      78             :     int         segno[MAX_FLUSH_BUFFERS];   /* their log seg#s */
      79             : } SlruFlushData;
      80             : 
      81             : typedef struct SlruFlushData *SlruFlush;
      82             : 
      83             : /*
      84             :  * Macro to mark a buffer slot "most recently used".  Note multiple evaluation
      85             :  * of arguments!
      86             :  *
      87             :  * The reason for the if-test is that there are often many consecutive
      88             :  * accesses to the same page (particularly the latest page).  By suppressing
      89             :  * useless increments of cur_lru_count, we reduce the probability that old
      90             :  * pages' counts will "wrap around" and make them appear recently used.
      91             :  *
      92             :  * We allow this code to be executed concurrently by multiple processes within
      93             :  * SimpleLruReadPage_ReadOnly().  As long as int reads and writes are atomic,
      94             :  * this should not cause any completely-bogus values to enter the computation.
      95             :  * However, it is possible for either cur_lru_count or individual
      96             :  * page_lru_count entries to be "reset" to lower values than they should have,
      97             :  * in case a process is delayed while it executes this macro.  With care in
      98             :  * SlruSelectLRUPage(), this does little harm, and in any case the absolute
      99             :  * worst possible consequence is a nonoptimal choice of page to evict.  The
     100             :  * gain from allowing concurrent reads of SLRU pages seems worth it.
                     :  *
                     :  * Mechanically: if the slot's page_lru_count already equals cur_lru_count,
                     :  * nothing is written; otherwise cur_lru_count is incremented and the slot's
                     :  * page_lru_count is set to the incremented value.
     101             :  */
     102             : #define SlruRecentlyUsed(shared, slotno)    \
     103             :     do { \
     104             :         int     new_lru_count = (shared)->cur_lru_count; \
     105             :         if (new_lru_count != (shared)->page_lru_count[slotno]) { \
     106             :             (shared)->cur_lru_count = ++new_lru_count; \
     107             :             (shared)->page_lru_count[slotno] = new_lru_count; \
     108             :         } \
     109             :     } while (0)
     110             : 
     111             : /*
                     :  * Saved info for SlruReportIOError: which kind of file operation failed
                     :  * (slru_errcause) and, presumably, the errno value that went with it
                     :  * (slru_errno).  Both are file-local statics.  The code that sets them is
                     :  * not in this part of the file -- presumably the physical read/write
                     :  * routines; confirm there.
                     :  */
     112             : typedef enum
     113             : {
     114             :     SLRU_OPEN_FAILED,
     115             :     SLRU_SEEK_FAILED,
     116             :     SLRU_READ_FAILED,
     117             :     SLRU_WRITE_FAILED,
     118             :     SLRU_FSYNC_FAILED,
     119             :     SLRU_CLOSE_FAILED
     120             : } SlruErrorCause;
     121             : 
     122             : static SlruErrorCause slru_errcause;
     123             : static int  slru_errno;
     124             : 
     125             : /* Forward declarations of the file-local (static) routines used below */
     126             : static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
     127             : static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
     128             : static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata);
     129             : static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno);
     130             : static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno,
     131             :                                   SlruFlush fdata);
     132             : static void SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid);
     133             : static int  SlruSelectLRUPage(SlruCtl ctl, int pageno);
     134             : 
     135             : static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename,
     136             :                                       int segpage, void *data);
     137             : static void SlruInternalDeleteSegment(SlruCtl ctl, char *filename);
     138             : 
     139             : /*
     140             :  * Initialization of shared memory
     141             :  */
     142             : /*
                     :  * Compute the amount of shared memory needed for an SLRU area with nslots
                     :  * page buffers and nlsns LSN groups per page.
                     :  *
                     :  * The total is the MAXALIGN'd control struct, one MAXALIGN'd array per
                     :  * per-slot attribute (in the same order in which SimpleLruInit() carves
                     :  * them out), the optional group_lsn[] array when nlsns > 0, and finally,
                     :  * after rounding up with BUFFERALIGN, nslots page buffers of BLCKSZ bytes.
                     :  */
     143             : Size
     144       26488 : SimpleLruShmemSize(int nslots, int nlsns)
     145             : {
     146             :     Size        sz;
     147             : 
     148             :     /* we assume nslots isn't so large as to risk overflow */
     149       26488 :     sz = MAXALIGN(sizeof(SlruSharedData));
     150       26488 :     sz += MAXALIGN(nslots * sizeof(char *));    /* page_buffer[] */
     151       26488 :     sz += MAXALIGN(nslots * sizeof(SlruPageStatus));    /* page_status[] */
     152       26488 :     sz += MAXALIGN(nslots * sizeof(bool));  /* page_dirty[] */
     153       26488 :     sz += MAXALIGN(nslots * sizeof(int));   /* page_number[] */
     154       26488 :     sz += MAXALIGN(nslots * sizeof(int));   /* page_lru_count[] */
     155       26488 :     sz += MAXALIGN(nslots * sizeof(LWLockPadded));  /* buffer_locks[] */
     156             : 
     157       26488 :     if (nlsns > 0)
     158        3784 :         sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));    /* group_lsn[] */
     159             : 
     160       26488 :     return BUFFERALIGN(sz) + BLCKSZ * nslots;
     161             : }
     162             : /*
                     :  * Obtain the shared-memory area for one SLRU via ShmemInitStruct() and fill
                     :  * in the caller's unshared control struct.
                     :  *
                     :  * ctl: unshared control struct; its shared, do_fsync and Dir fields are set
                     :  *      here (the caller is assumed to have set PagePrecedes).
                     :  * name: name of the shared memory struct, also copied into the shared area
                     :  *      as the LWLock tranche name.
                     :  * nslots: number of page buffers.
                     :  * nlsns: number of LSN groups per page; group_lsn[] is only set up when
                     :  *      this is greater than zero.
                     :  * ctllock: LWLock stored as the SLRU's ControlLock.
                     :  * subdir: directory name copied into ctl->Dir.
                     :  * tranche_id: LWLock tranche ID used for the per-buffer locks.
                     :  *
                     :  * The shared area is laid out and initialized only when !IsUnderPostmaster;
                     :  * otherwise it is expected to exist already (Assert(found)).  Every slot
                     :  * starts out SLRU_PAGE_EMPTY, clean, and with LRU count 0; page_number[] is
                     :  * not initialized here.
                     :  */
     163             : void
     164       13230 : SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
     165             :               LWLock *ctllock, const char *subdir, int tranche_id)
     166             : {
     167             :     SlruShared  shared;
     168             :     bool        found;
     169             : 
     170       13230 :     shared = (SlruShared) ShmemInitStruct(name,
     171             :                                           SimpleLruShmemSize(nslots, nlsns),
     172             :                                           &found);
     173             : 
     174       13230 :     if (!IsUnderPostmaster)
     175             :     {
     176             :         /* Initialize locks and shared memory area */
     177             :         char       *ptr;
     178             :         Size        offset;
     179             :         int         slotno;
     180             : 
     181             :         Assert(!found);
     182             : 
     183       13230 :         memset(shared, 0, sizeof(SlruSharedData));
     184             : 
     185       13230 :         shared->ControlLock = ctllock;
     186             : 
     187       13230 :         shared->num_slots = nslots;
     188       13230 :         shared->lsn_groups_per_page = nlsns;
     189             : 
     190       13230 :         shared->cur_lru_count = 0;
     191             : 
     192             :         /* shared->latest_page_number will be set later */
     193             : 
     194       13230 :         ptr = (char *) shared;
     195       13230 :         offset = MAXALIGN(sizeof(SlruSharedData));
     196       13230 :         shared->page_buffer = (char **) (ptr + offset);
     197       13230 :         offset += MAXALIGN(nslots * sizeof(char *));
     198       13230 :         shared->page_status = (SlruPageStatus *) (ptr + offset);
     199       13230 :         offset += MAXALIGN(nslots * sizeof(SlruPageStatus));
     200       13230 :         shared->page_dirty = (bool *) (ptr + offset);
     201       13230 :         offset += MAXALIGN(nslots * sizeof(bool));
     202       13230 :         shared->page_number = (int *) (ptr + offset);
     203       13230 :         offset += MAXALIGN(nslots * sizeof(int));
     204       13230 :         shared->page_lru_count = (int *) (ptr + offset);
     205       13230 :         offset += MAXALIGN(nslots * sizeof(int));
     206             : 
     207             :         /* Locate the per-buffer LWLock array; the locks are initialized in the slot loop below */
     208       13230 :         shared->buffer_locks = (LWLockPadded *) (ptr + offset);
     209       13230 :         offset += MAXALIGN(nslots * sizeof(LWLockPadded));
     210             : 
     211       13230 :         if (nlsns > 0)
     212             :         {
     213        1890 :             shared->group_lsn = (XLogRecPtr *) (ptr + offset);
     214        1890 :             offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
     215             :         }
     216             : 
     217             :         Assert(strlen(name) + 1 < SLRU_MAX_NAME_LENGTH);
     218       13230 :         strlcpy(shared->lwlock_tranche_name, name, SLRU_MAX_NAME_LENGTH);
     219       13230 :         shared->lwlock_tranche_id = tranche_id;
     220             : 
     221       13230 :         ptr += BUFFERALIGN(offset); /* page buffers start after the arrays */
     222      230030 :         for (slotno = 0; slotno < nslots; slotno++)
     223             :         {
     224      216800 :             LWLockInitialize(&shared->buffer_locks[slotno].lock,
     225             :                              shared->lwlock_tranche_id);
     226             : 
     227      216800 :             shared->page_buffer[slotno] = ptr;
     228      216800 :             shared->page_status[slotno] = SLRU_PAGE_EMPTY;
     229      216800 :             shared->page_dirty[slotno] = false;
     230      216800 :             shared->page_lru_count[slotno] = 0;
     231      216800 :             ptr += BLCKSZ;  /* advance to the next slot's page buffer */
     232             :         }
     233             : 
     234             :         /* Should fit to estimated shmem size */
     235             :         Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
     236             :     }
     237             :     else
     238             :         Assert(found);
     239             : 
     240             :     /* Register SLRU tranche in the main tranches array */
     241       13230 :     LWLockRegisterTranche(shared->lwlock_tranche_id,
     242       13230 :                           shared->lwlock_tranche_name);
     243             : 
     244             :     /*
     245             :      * Initialize the unshared control struct, including directory path. We
     246             :      * assume caller set PagePrecedes.
     247             :      */
     248       13230 :     ctl->shared = shared;
     249       13230 :     ctl->do_fsync = true;        /* default behavior */
     250       13230 :     StrNCpy(ctl->Dir, subdir, sizeof(ctl->Dir));
     251       13230 : }
     252             : 
     253             : /*
     254             :  * Initialize (or reinitialize) a page to zeroes.
     255             :  *
     256             :  * The page is not actually written, just set up in shared memory.
     257             :  * The slot number of the new page is returned.
     258             :  *
     259             :  * Control lock must be held at entry, and will be held at exit.
                     :  *
                     :  * Besides zeroing the buffer, this marks the slot valid and dirty, makes it
                     :  * the most recently used slot, zeroes its LSNs, and records pageno as
                     :  * shared->latest_page_number.
     260             :  */
     261             : int
     262        5092 : SimpleLruZeroPage(SlruCtl ctl, int pageno)
     263             : {
     264        5092 :     SlruShared  shared = ctl->shared;
     265             :     int         slotno;
     266             : 
     267             :     /* Find a suitable buffer slot for the page */
     268        5092 :     slotno = SlruSelectLRUPage(ctl, pageno);
     269             :     Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
     270             :            (shared->page_status[slotno] == SLRU_PAGE_VALID &&
     271             :             !shared->page_dirty[slotno]) ||
     272             :            shared->page_number[slotno] == pageno);
     273             : 
     274             :     /* Mark the slot as containing this page */
     275        5092 :     shared->page_number[slotno] = pageno;
     276        5092 :     shared->page_status[slotno] = SLRU_PAGE_VALID;
     277        5092 :     shared->page_dirty[slotno] = true;
     278        5092 :     SlruRecentlyUsed(shared, slotno);
     279             : 
     280             :     /* Set the buffer to zeroes */
     281        5092 :     MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
     282             : 
     283             :     /* Set the LSNs for this new page to zero */
     284        5092 :     SimpleLruZeroLSNs(ctl, slotno);
     285             : 
     286             :     /* Assume this page is now the latest active page */
     287        5092 :     shared->latest_page_number = pageno;
     288             : 
     289        5092 :     return slotno;
     290             : }
     291             : 
     292             : /*
     293             :  * Zero all the LSNs we store for this slru page.
     294             :  *
     295             :  * This should be called each time we create a new page, and each time we read
     296             :  * in a page from disk into an existing buffer.  (Such an old page cannot
     297             :  * have any interesting LSNs, since we'd have flushed them before writing
     298             :  * the page in the first place.)
     299             :  *
     300             :  * This assumes that InvalidXLogRecPtr is bitwise-all-0.
                     :  *
                     :  * Each slot owns lsn_groups_per_page consecutive entries of group_lsn[],
                     :  * starting at index slotno * lsn_groups_per_page.  Nothing is done when
                     :  * lsn_groups_per_page is not positive.
     301             :  */
     302             : static void
     303        6822 : SimpleLruZeroLSNs(SlruCtl ctl, int slotno)
     304             : {
     305        6822 :     SlruShared  shared = ctl->shared;
     306             : 
     307        6822 :     if (shared->lsn_groups_per_page > 0)
     308        1510 :         MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0,
     309             :                shared->lsn_groups_per_page * sizeof(XLogRecPtr));
     310        6822 : }
     311             : 
     312             : /*
     313             :  * Wait for any active I/O on a page slot to finish.  (This does not
     314             :  * guarantee that new I/O hasn't been started before we return, though.
     315             :  * In fact the slot might not even contain the same page anymore.)
     316             :  *
     317             :  * Control lock must be held at entry, and will be held at exit.
                     :  *
                     :  * Note that the control lock is released while waiting and is re-acquired
                     :  * in exclusive mode, so callers must recheck any shared state afterwards.
     318             :  */
     319             : static void
     320           0 : SimpleLruWaitIO(SlruCtl ctl, int slotno)
     321             : {
     322           0 :     SlruShared  shared = ctl->shared;
     323             : 
     324             :     /*
                     :      * See notes at top of file: drop the control lock, then take and
                     :      * immediately release the per-buffer lock in shared mode, which blocks
                     :      * until the process doing the I/O gives up its exclusive hold.
                     :      */
     325           0 :     LWLockRelease(shared->ControlLock);
     326           0 :     LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED);
     327           0 :     LWLockRelease(&shared->buffer_locks[slotno].lock);
     328           0 :     LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
     329             : 
     330             :     /*
     331             :      * If the slot is still in an io-in-progress state, then either someone
     332             :      * already started a new I/O on the slot, or a previous I/O failed and
     333             :      * neglected to reset the page state.  That shouldn't happen, really, but
     334             :      * it seems worth a few extra cycles to check and recover from it. We can
     335             :      * cheaply test for failure by seeing if the buffer lock is still held (we
     336             :      * assume that transaction abort would release the lock).
     337             :      */
     338           0 :     if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
     339           0 :         shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS)
     340             :     {
     341           0 :         if (LWLockConditionalAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED))
     342             :         {
     343             :             /* indeed, the I/O must have failed */
     344           0 :             if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS)
     345           0 :                 shared->page_status[slotno] = SLRU_PAGE_EMPTY;
     346             :             else                /* write_in_progress */
     347             :             {
     348           0 :                 shared->page_status[slotno] = SLRU_PAGE_VALID;
     349           0 :                 shared->page_dirty[slotno] = true;
     350             :             }
     351           0 :             LWLockRelease(&shared->buffer_locks[slotno].lock);
     352             :         }
     353             :     }
     354           0 : }
     355             : 
     356             : /*
     357             :  * Find a page in a shared buffer, reading it in if necessary.
     358             :  * The page number must correspond to an already-initialized page.
     359             :  *
     360             :  * If write_ok is true then it is OK to return a page that is in
     361             :  * WRITE_IN_PROGRESS state; it is the caller's responsibility to be sure
     362             :  * that modification of the page is safe.  If write_ok is false then we
     363             :  * will not return the page until it is not undergoing active I/O.
     364             :  *
     365             :  * The passed-in xid is used only for error reporting, and may be
     366             :  * InvalidTransactionId if no specific xid is associated with the action.
     367             :  *
     368             :  * Return value is the shared-buffer slot number now holding the page.
     369             :  * The buffer's LRU access info is updated.
     370             :  *
     371             :  * Control lock must be held at entry, and will be held at exit.
                     :  *
                     :  * The control lock is released around the physical read (and while waiting
                     :  * for someone else's I/O) and re-acquired in exclusive mode.  If the read
                     :  * fails, the slot is set back to SLRU_PAGE_EMPTY and the per-buffer lock is
                     :  * released before SlruReportIOError() is called.
     372             :  */
     373             : int
     374      265942 : SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
     375             :                   TransactionId xid)
     376             : {
     377      265942 :     SlruShared  shared = ctl->shared;
     378             : 
     379             :     /* Outer loop handles restart if we must wait for someone else's I/O */
     380             :     for (;;)
     381           0 :     {
     382             :         int         slotno;
     383             :         bool        ok;
     384             : 
     385             :         /* See if page already is in memory; if not, pick victim slot */
     386      265942 :         slotno = SlruSelectLRUPage(ctl, pageno);
     387             : 
     388             :         /* Did we find the page in memory? */
     389      531884 :         if (shared->page_number[slotno] == pageno &&
     390      265942 :             shared->page_status[slotno] != SLRU_PAGE_EMPTY)
     391             :         {
     392             :             /*
     393             :              * If page is still being read in, we must wait for I/O.  Likewise
     394             :              * if the page is being written and the caller said that's not OK.
     395             :              */
     396      528424 :             if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
     397      264212 :                 (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
     398           0 :                  !write_ok))
     399             :             {
     400           0 :                 SimpleLruWaitIO(ctl, slotno);
     401             :                 /* Now we must recheck state from the top */
     402           0 :                 continue;
     403             :             }
     404             :             /* Otherwise, it's ready to use */
     405      264212 :             SlruRecentlyUsed(shared, slotno);
     406      264212 :             return slotno;
     407             :         }
     408             : 
     409             :         /* We found no match; assert we selected a freeable slot */
     410             :         Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
     411             :                (shared->page_status[slotno] == SLRU_PAGE_VALID &&
     412             :                 !shared->page_dirty[slotno]));
     413             : 
     414             :         /* Mark the slot read-busy */
     415        1730 :         shared->page_number[slotno] = pageno;
     416        1730 :         shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS;
     417        1730 :         shared->page_dirty[slotno] = false;
     418             : 
     419             :         /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
     420        1730 :         LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
     421             : 
     422             :         /* Release control lock while doing I/O */
     423        1730 :         LWLockRelease(shared->ControlLock);
     424             : 
     425             :         /* Do the read */
     426        1730 :         ok = SlruPhysicalReadPage(ctl, pageno, slotno);
     427             : 
     428             :         /* Set the LSNs for this newly read-in page to zero */
     429        1730 :         SimpleLruZeroLSNs(ctl, slotno);
     430             : 
     431             :         /* Re-acquire control lock and update page state */
     432        1730 :         LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
     433             : 
     434             :         Assert(shared->page_number[slotno] == pageno &&
     435             :                shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS &&
     436             :                !shared->page_dirty[slotno]);
     437             : 
     438        1730 :         shared->page_status[slotno] = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY;
     439             : 
     440        1730 :         LWLockRelease(&shared->buffer_locks[slotno].lock);
     441             : 
     442             :         /* Now it's okay to ereport if we failed */
     443        1730 :         if (!ok)
     444           0 :             SlruReportIOError(ctl, pageno, xid);
     445             : 
     446        1730 :         SlruRecentlyUsed(shared, slotno);
     447        1730 :         return slotno;
     448             :     }
     449             : }
     450             : 
     451             : /*
     452             :  * Find a page in a shared buffer, reading it in if necessary.
     453             :  * The page number must correspond to an already-initialized page.
     454             :  * The caller must intend only read-only access to the page.
     455             :  *
     456             :  * The passed-in xid is used only for error reporting, and may be
     457             :  * InvalidTransactionId if no specific xid is associated with the action.
     458             :  *
     459             :  * Return value is the shared-buffer slot number now holding the page.
     460             :  * The buffer's LRU access info is updated.
     461             :  *
     462             :  * Control lock must NOT be held at entry, but will be held at exit.
     463             :  * It is unspecified whether the lock will be shared or exclusive.
                     :  *
                     :  * (In this implementation the lock is still shared when the page is found
                     :  * by the initial scan, and exclusive when we had to fall back to
                     :  * SimpleLruReadPage().)
     464             :  */
     465             : int
     466     1071316 : SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid)
     467             : {
     468     1071316 :     SlruShared  shared = ctl->shared;
     469             :     int         slotno;
     470             : 
     471             :     /* Try to find the page while holding only shared lock */
     472     1071316 :     LWLockAcquire(shared->ControlLock, LW_SHARED);
     473             : 
     474             :     /* See if page is already in a buffer */
     475     1072060 :     for (slotno = 0; slotno < shared->num_slots; slotno++)
     476             :     {
     477     2143468 :         if (shared->page_number[slotno] == pageno &&
     478     2142724 :             shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
     479     1071270 :             shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
     480             :         {
     481             :             /* See comments for SlruRecentlyUsed macro */
     482     1071270 :             SlruRecentlyUsed(shared, slotno);
     483     1071270 :             return slotno;
     484             :         }
     485             :     }
     486             : 
     487             :     /*
                     :      * No luck, so switch to normal exclusive lock and do regular read.
                     :      * The lock is not held between the release and the re-acquire, so the
                     :      * buffers may change in between; SimpleLruReadPage() searches them
                     :      * again.  write_ok is passed as true, so a page that is currently in
                     :      * WRITE_IN_PROGRESS state can be returned.
                     :      */
     488          46 :     LWLockRelease(shared->ControlLock);
     489          46 :     LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
     490             : 
     491          46 :     return SimpleLruReadPage(ctl, pageno, true, xid);
     492             : }
     493             : 
     494             : /*
     495             :  * Write a page from a shared buffer, if necessary.
     496             :  * Does nothing if the specified slot is not dirty.
     497             :  *
     498             :  * NOTE: only one write attempt is made here.  Hence, it is possible that
     499             :  * the page is still dirty at exit (if someone else re-dirtied it during
     500             :  * the write).  However, we *do* attempt a fresh write even if the page
     501             :  * is already being written; this is for checkpoints.
     502             :  *
     503             :  * Control lock must be held at entry, and will be held at exit.
     504             :  */
     505             : static void
     506      389498 : SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
     507             : {
     508      389498 :     SlruShared  shared = ctl->shared;
     509      389498 :     int         pageno = shared->page_number[slotno];   /* page held by the slot at entry; compared below to detect slot reuse */
     510             :     bool        ok;             /* result of SlruPhysicalWritePage */
     511             : 
     512             :     /* If a write is in progress, wait for it to finish (the loop also ends if the slot no longer holds pageno) */
     513      778996 :     while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
     514           0 :            shared->page_number[slotno] == pageno)
     515             :     {
     516           0 :         SimpleLruWaitIO(ctl, slotno);
     517             :     }
     518             : 
     519             :     /*
     520             :      * Do nothing if page is not dirty, or if buffer no longer contains the
     521             :      * same page we were called for.
                     :      * A slot whose status is anything other than VALID is skipped as well.
     522             :      */
     523      396768 :     if (!shared->page_dirty[slotno] ||
     524       14540 :         shared->page_status[slotno] != SLRU_PAGE_VALID ||
     525        7270 :         shared->page_number[slotno] != pageno)
     526      382228 :         return;
     527             : 
     528             :     /*
     529             :      * Mark the slot write-busy, and clear the dirtybit.  After this point, a
     530             :      * transaction status update on this page will mark it dirty again.
     531             :      */
     532        7270 :     shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
     533        7270 :     shared->page_dirty[slotno] = false;
     534             : 
     535             :     /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
     536        7270 :     LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
     537             : 
     538             :     /* Release control lock while doing I/O */
     539        7270 :     LWLockRelease(shared->ControlLock);
     540             : 
     541             :     /* Do the write; on failure ok is false and the error is reported at the end of this function */
     542        7270 :     ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata);
     543             : 
     544             :     /* If we failed, and we're in a flush, better close the files (every descriptor recorded in fdata) */
     545        7270 :     if (!ok && fdata)
     546             :     {
     547             :         int         i;
     548             : 
     549           0 :         for (i = 0; i < fdata->num_files; i++)
     550           0 :             CloseTransientFile(fdata->fd[i]);   /* return value is not checked here */
     551             :     }
     552             : 
     553             :     /* Re-acquire control lock and update page state */
     554        7270 :     LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
     555             : 
     556             :     Assert(shared->page_number[slotno] == pageno &&
     557             :            shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS);     /* sanity check: slot still holds our page and is still marked write-busy */
     558             : 
     559             :     /* If we failed to write, mark the page dirty again */
     560        7270 :     if (!ok)
     561           0 :         shared->page_dirty[slotno] = true;
     562             : 
     563        7270 :     shared->page_status[slotno] = SLRU_PAGE_VALID;      /* write attempt is over, whether or not it succeeded */
     564             : 
     565        7270 :     LWLockRelease(&shared->buffer_locks[slotno].lock);  /* drop the per-buffer lock taken before the I/O */
     566             : 
     567             :     /* Now it's okay to ereport if we failed (shared state has been restored above) */
     568        7270 :     if (!ok)
     569           0 :         SlruReportIOError(ctl, pageno, InvalidTransactionId);
     570             : }
     571             : 
     572             : /*
     573             :  * Wrapper of SlruInternalWritePage, for external callers.
     574             :  * fdata is always passed as NULL here, so no SlruFlush open-file state is
                     :  * supplied for this write.
                     :  *
                     :  * ctl and slotno are handed through to SlruInternalWritePage unchanged.
     575             :  */
     576             : void
     577        3194 : SimpleLruWritePage(SlruCtl ctl, int slotno)
     578             : {
     579        3194 :     SlruInternalWritePage(ctl, slotno, NULL);
     580        3194 : }
     581             : 
     582             : /*
     583             :  * Return whether the given page exists on disk.
     584             :  *
     585             :  * A false return means that either the file does not exist, or that it's not
     586             :  * large enough to contain the given page.
                     :  *
                     :  * The check opens the segment file read-only and compares its length
                     :  * (found by seeking to the end) with the byte position just past the page.
                     :  * Open failures other than ENOENT, and seek failures, are handed to
                     :  * SlruReportIOError.  A close failure records the error cause in
                     :  * slru_errcause/slru_errno but returns false without calling
                     :  * SlruReportIOError.
                     :  *
                     :  * NOTE(review): the close-failure path is handled differently from the
                     :  * open and seek failure paths; confirm that this is intended.
     587             :  */
     588             : bool
     589          52 : SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno)
     590             : {
     591          52 :     int         segno = pageno / SLRU_PAGES_PER_SEGMENT;    /* segment whose file name is built below */
     592          52 :     int         rpageno = pageno % SLRU_PAGES_PER_SEGMENT;  /* page index within that segment */
     593          52 :     int         offset = rpageno * BLCKSZ;                  /* byte offset of the page within the file */
     594             :     char        path[MAXPGPATH];
     595             :     int         fd;
     596             :     bool        result;
     597             :     off_t       endpos;         /* file length, as returned by lseek(SEEK_END) */
     598             : 
     599          52 :     SlruFileName(ctl, path, segno);
     600             : 
     601          52 :     fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
     602          52 :     if (fd < 0)
     603             :     {
     604             :         /* expected: file doesn't exist */
     605          16 :         if (errno == ENOENT)
     606          16 :             return false;
     607             : 
     608             :         /* report error normally (presumably SlruReportIOError does not return here -- confirm) */
     609           0 :         slru_errcause = SLRU_OPEN_FAILED;
     610           0 :         slru_errno = errno;
     611           0 :         SlruReportIOError(ctl, pageno, 0);
     612             :     }
     613             : 
     614          36 :     if ((endpos = lseek(fd, 0, SEEK_END)) < 0)
     615             :     {
     616           0 :         slru_errcause = SLRU_SEEK_FAILED;
     617           0 :         slru_errno = errno;
     618           0 :         SlruReportIOError(ctl, pageno, 0);  /* NOTE(review): fd is not closed on this path; presumably cleaned up during error handling -- confirm */
     619             :     }
     620             : 
     621          36 :     result = endpos >= (off_t) (offset + BLCKSZ);   /* true only if the file extends to the end of the page */
     622             : 
     623          36 :     if (CloseTransientFile(fd) != 0)
     624             :     {
     625           0 :         slru_errcause = SLRU_CLOSE_FAILED;
     626           0 :         slru_errno = errno;
     627           0 :         return false;           /* the computed result is discarded and no error is reported */
     628             :     }
     629             : 
     630          36 :     return result;
     631             : }
     632             : 
     633             : /*
     634             :  * Physical read of a (previously existing) page into a buffer slot
     635             :  *
     636             :  * On failure, we cannot just ereport(ERROR) since caller has put state in
     637             :  * shared memory that must be undone.  So, we return false and save enough
     638             :  * info in static variables to let SlruReportIOError make the report.
     639             :  *
     640             :  * For now, assume it's not worth keeping a file pointer open across
     641             :  * read/write operations.  We could cache one virtual file pointer ...
                     :  *
                     :  * pageno selects the segment file and the offset within it; slotno selects
                     :  * the page_buffer entry that receives the BLCKSZ bytes.
                     :  *
                     :  * Returns true if the buffer was filled, either from the file or with
                     :  * zeroes (missing file while InRecovery).  Returns false after setting
                     :  * slru_errcause and slru_errno.
     642             :  */
     643             : static bool
     644        1730 : SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
     645             : {
     646        1730 :     SlruShared  shared = ctl->shared;
     647        1730 :     int         segno = pageno / SLRU_PAGES_PER_SEGMENT;    /* segment whose file name is built below */
     648        1730 :     int         rpageno = pageno % SLRU_PAGES_PER_SEGMENT;  /* page index within that segment */
     649        1730 :     int         offset = rpageno * BLCKSZ;                  /* byte offset of the page within the file */
     650             :     char        path[MAXPGPATH];
     651             :     int         fd;
     652             : 
     653        1730 :     SlruFileName(ctl, path, segno);
     654             : 
     655             :     /*
     656             :      * In a crash-and-restart situation, it's possible for us to receive
     657             :      * commands to set the commit status of transactions whose bits are in
     658             :      * already-truncated segments of the commit log (see notes in
     659             :      * SlruPhysicalWritePage).  Hence, if we are InRecovery, allow the case
     660             :      * where the file doesn't exist, and return zeroes instead.
     661             :      */
     662        1730 :     fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
     663        1730 :     if (fd < 0)
     664             :     {
     665           0 :         if (errno != ENOENT || !InRecovery)     /* any failure other than "missing file during recovery" is an error */
     666             :         {
     667           0 :             slru_errcause = SLRU_OPEN_FAILED;
     668           0 :             slru_errno = errno;
     669           0 :             return false;
     670             :         }
     671             : 
     672           0 :         ereport(LOG,
     673             :                 (errmsg("file \"%s\" doesn't exist, reading as zeroes",
     674             :                         path)));
     675           0 :         MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
     676           0 :         return true;
     677             :     }
     678             : 
     679        1730 :     if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
     680             :     {
     681           0 :         slru_errcause = SLRU_SEEK_FAILED;
     682           0 :         slru_errno = errno;     /* saved before CloseTransientFile is called */
     683           0 :         CloseTransientFile(fd);
     684           0 :         return false;
     685             :     }
     686             : 
     687        1730 :     errno = 0;                  /* lets SlruReportIOError tell a short read (errno still 0) from a failed one */
     688        1730 :     pgstat_report_wait_start(WAIT_EVENT_SLRU_READ);
     689        1730 :     if (read(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)    /* anything but a full page counts as failure */
     690             :     {
     691           0 :         pgstat_report_wait_end();
     692           0 :         slru_errcause = SLRU_READ_FAILED;
     693           0 :         slru_errno = errno;
     694           0 :         CloseTransientFile(fd);
     695           0 :         return false;
     696             :     }
     697        1730 :     pgstat_report_wait_end();
     698             : 
     699        1730 :     if (CloseTransientFile(fd) != 0)
     700             :     {
     701           0 :         slru_errcause = SLRU_CLOSE_FAILED;
     702           0 :         slru_errno = errno;
     703           0 :         return false;           /* reported as failure even though the page was read */
     704             :     }
     705             : 
     706        1730 :     return true;
     707             : }
     708             : 
     709             : /*
     710             :  * Physical write of a page from a buffer slot
     711             :  *
     712             :  * On failure, we cannot just ereport(ERROR) since caller has put state in
     713             :  * shared memory that must be undone.  So, we return false and save enough
     714             :  * info in static variables to let SlruReportIOError make the report.
     715             :  *
     716             :  * For now, assume it's not worth keeping a file pointer open across
     717             :  * independent read/write operations.  We do batch operations during
     718             :  * SimpleLruFlush, though.
     719             :  *
     720             :  * fdata is NULL for a standalone write, pointer to open-file info during
     721             :  * SimpleLruFlush.
                     :  *
                     :  * When fdata is non-NULL, descriptors opened here are recorded in it and
                     :  * are neither fsync'd nor closed by this function, not even on failure.
                     :  * When fdata is NULL (or the fdata arrays are full), the descriptor is
                     :  * fsync'd (if ctl->do_fsync) and closed before returning.
                     :  *
                     :  * Returns true on success, false after setting slru_errcause/slru_errno.
     722             :  */
     723             : static bool
     724        7270 : SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
     725             : {
     726        7270 :     SlruShared  shared = ctl->shared;
     727        7270 :     int         segno = pageno / SLRU_PAGES_PER_SEGMENT;    /* segment whose file receives the page */
     728        7270 :     int         rpageno = pageno % SLRU_PAGES_PER_SEGMENT;  /* page index within that segment */
     729        7270 :     int         offset = rpageno * BLCKSZ;                  /* byte offset of the page within the file */
     730             :     char        path[MAXPGPATH];
     731        7270 :     int         fd = -1;        /* negative means no usable descriptor has been found yet */
     732             : 
     733             :     /*
     734             :      * Honor the write-WAL-before-data rule, if appropriate, so that we do not
     735             :      * write out data before associated WAL records.  This is the same action
     736             :      * performed during FlushBuffer() in the main buffer manager.
     737             :      */
     738        7270 :     if (shared->group_lsn != NULL)
     739             :     {
     740             :         /*
     741             :          * We must determine the largest async-commit LSN for the page. This
     742             :          * is a bit tedious, but since this entire function is a slow path
     743             :          * anyway, it seems better to do this here than to maintain a per-page
     744             :          * LSN variable (which'd need an extra comparison in the
     745             :          * transaction-commit path).
     746             :          */
     747             :         XLogRecPtr  max_lsn;
     748             :         int         lsnindex,
     749             :                     lsnoff;
     750             : 
     751        2260 :         lsnindex = slotno * shared->lsn_groups_per_page;    /* first group_lsn entry belonging to this slot */
     752        2260 :         max_lsn = shared->group_lsn[lsnindex++];
     753     2314240 :         for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
     754             :         {
     755     2311980 :             XLogRecPtr  this_lsn = shared->group_lsn[lsnindex++];
     756             : 
     757     2311980 :             if (max_lsn < this_lsn)
     758        2448 :                 max_lsn = this_lsn;
     759             :         }
     760             : 
     761        2260 :         if (!XLogRecPtrIsInvalid(max_lsn))  /* no flush is requested when the maximum is the invalid LSN */
     762             :         {
     763             :             /*
     764             :              * As noted above, elog(ERROR) is not acceptable here, so if
     765             :              * XLogFlush were to fail, we must PANIC.  This isn't much of a
     766             :              * restriction because XLogFlush is just about all critical
     767             :              * section anyway, but let's make sure.
     768             :              */
     769        1090 :             START_CRIT_SECTION();
     770        1090 :             XLogFlush(max_lsn);
     771        1090 :             END_CRIT_SECTION();
     772             :         }
     773             :     }
     774             : 
     775             :     /*
     776             :      * During a Flush, we may already have the desired file open.
                     :      * Look through the descriptors recorded in fdata for this segment.
     777             :      */
     778        7270 :     if (fdata)
     779             :     {
     780             :         int         i;
     781             : 
     782        4076 :         for (i = 0; i < fdata->num_files; i++)
     783             :         {
     784          22 :             if (fdata->segno[i] == segno)
     785             :             {
     786          22 :                 fd = fdata->fd[i];
     787          22 :                 break;
     788             :             }
     789             :         }
     790             :     }
     791             : 
     792        7270 :     if (fd < 0)
     793             :     {
     794             :         /*
     795             :          * If the file doesn't already exist, we should create it.  It is
     796             :          * possible for this to need to happen when writing a page that's not
     797             :          * first in its segment; we assume the OS can cope with that. (Note:
     798             :          * it might seem that it'd be okay to create files only when
     799             :          * SimpleLruZeroPage is called for the first page of a segment.
     800             :          * However, if after a crash and restart the REDO logic elects to
     801             :          * replay the log from a checkpoint before the latest one, then it's
     802             :          * possible that we will get commands to set transaction status of
     803             :          * transactions that have already been truncated from the commit log.
     804             :          * Easiest way to deal with that is to accept references to
     805             :          * nonexistent files here and in SlruPhysicalReadPage.)
     806             :          *
     807             :          * Note: it is possible for more than one backend to be executing this
     808             :          * code simultaneously for different pages of the same file. Hence,
     809             :          * don't use O_EXCL or O_TRUNC or anything like that.
     810             :          */
     811        7248 :         SlruFileName(ctl, path, segno);
     812        7248 :         fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY);
     813        7248 :         if (fd < 0)
     814             :         {
     815           0 :             slru_errcause = SLRU_OPEN_FAILED;
     816           0 :             slru_errno = errno;
     817           0 :             return false;
     818             :         }
     819             : 
     820        7248 :         if (fdata)
     821             :         {
     822        4054 :             if (fdata->num_files < MAX_FLUSH_BUFFERS)
     823             :             {
     824        4054 :                 fdata->fd[fdata->num_files] = fd;       /* remember the descriptor for later pages of this segment */
     825        4054 :                 fdata->segno[fdata->num_files] = segno;
     826        4054 :                 fdata->num_files++;
     827             :             }
     828             :             else
     829             :             {
     830             :                 /*
     831             :                  * In the unlikely event that we exceed MAX_FLUSH_BUFFERS,
     832             :                  * fall back to treating it as a standalone write.
     833             :                  */
     834           0 :                 fdata = NULL;   /* only the local variable changes; the code below now fsyncs and closes fd itself */
     835             :             }
     836             :         }
     837             :     }
     838             : 
     839        7270 :     if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
     840             :     {
     841           0 :         slru_errcause = SLRU_SEEK_FAILED;
     842           0 :         slru_errno = errno;
     843           0 :         if (!fdata)             /* a descriptor recorded in fdata is left open here */
     844           0 :             CloseTransientFile(fd);
     845           0 :         return false;
     846             :     }
     847             : 
     848        7270 :     errno = 0;                  /* so a short write that leaves errno untouched can be detected below */
     849        7270 :     pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
     850        7270 :     if (write(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
     851             :     {
     852           0 :         pgstat_report_wait_end();
     853             :         /* if write didn't set errno, assume problem is no disk space */
     854           0 :         if (errno == 0)
     855           0 :             errno = ENOSPC;
     856           0 :         slru_errcause = SLRU_WRITE_FAILED;
     857           0 :         slru_errno = errno;
     858           0 :         if (!fdata)             /* a descriptor recorded in fdata is left open here */
     859           0 :             CloseTransientFile(fd);
     860           0 :         return false;
     861             :     }
     862        7270 :     pgstat_report_wait_end();
     863             : 
     864             :     /*
     865             :      * If not part of Flush, need to fsync now.  We assume this happens
     866             :      * infrequently enough that it's not a performance issue.
     867             :      */
     868        7270 :     if (!fdata)
     869             :     {
     870        3194 :         pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC);
     871        3194 :         if (ctl->do_fsync && pg_fsync(fd) != 0)     /* pg_fsync is not called at all when do_fsync is false */
     872             :         {
     873           0 :             pgstat_report_wait_end();
     874           0 :             slru_errcause = SLRU_FSYNC_FAILED;
     875           0 :             slru_errno = errno;
     876           0 :             CloseTransientFile(fd);
     877           0 :             return false;
     878             :         }
     879        3194 :         pgstat_report_wait_end();
     880             : 
     881        3194 :         if (CloseTransientFile(fd) != 0)
     882             :         {
     883           0 :             slru_errcause = SLRU_CLOSE_FAILED;
     884           0 :             slru_errno = errno;
     885           0 :             return false;
     886             :         }
     887             :     }
     888             : 
     889        7270 :     return true;
     890             : }
     891             : 
     892             : /*
     893             :  * Issue the error message after failure of SlruPhysicalReadPage or
     894             :  * SlruPhysicalWritePage.  Call this after cleaning up shared-memory state.
                     :  *
                     :  * The failure to report is taken from the static variables slru_errcause
                     :  * and slru_errno; errno is restored from slru_errno so that %m in the
                     :  * messages expands to the saved error.  pageno is used to rebuild the file
                     :  * path and offset; xid only appears in the primary message text.
                     :  *
                     :  * NOTE(review): offset is declared int but is formatted with %u in the
                     :  * seek/read/write messages -- confirm whether %d was intended.
     895             :  */
     896             : static void
     897           0 : SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid)
     898             : {
     899           0 :     int         segno = pageno / SLRU_PAGES_PER_SEGMENT;    /* segment whose file name is rebuilt below */
     900           0 :     int         rpageno = pageno % SLRU_PAGES_PER_SEGMENT;  /* page index within that segment */
     901           0 :     int         offset = rpageno * BLCKSZ;                  /* byte offset quoted in the messages */
     902             :     char        path[MAXPGPATH];
     903             : 
     904           0 :     SlruFileName(ctl, path, segno);
     905           0 :     errno = slru_errno;         /* make %m refer to the error saved at failure time */
     906           0 :     switch (slru_errcause)
     907             :     {
     908             :         case SLRU_OPEN_FAILED:
     909           0 :             ereport(ERROR,
     910             :                     (errcode_for_file_access(),
     911             :                      errmsg("could not access status of transaction %u", xid),
     912             :                      errdetail("Could not open file \"%s\": %m.", path)));
     913             :             break;
     914             :         case SLRU_SEEK_FAILED:
     915           0 :             ereport(ERROR,
     916             :                     (errcode_for_file_access(),
     917             :                      errmsg("could not access status of transaction %u", xid),
     918             :                      errdetail("Could not seek in file \"%s\" to offset %u: %m.",
     919             :                                path, offset)));
     920             :             break;
     921             :         case SLRU_READ_FAILED:
     922           0 :             if (errno)          /* nonzero: read() itself failed; zero: short read */
     923           0 :                 ereport(ERROR,
     924             :                         (errcode_for_file_access(),
     925             :                          errmsg("could not access status of transaction %u", xid),
     926             :                          errdetail("Could not read from file \"%s\" at offset %u: %m.",
     927             :                                    path, offset)));
     928             :             else
     929           0 :                 ereport(ERROR,
     930             :                         (errmsg("could not access status of transaction %u", xid),
     931             :                          errdetail("Could not read from file \"%s\" at offset %u: read too few bytes.", path, offset)));
     932             :             break;
     933             :         case SLRU_WRITE_FAILED:
     934           0 :             if (errno)          /* nonzero: use %m; zero: report a short write instead */
     935           0 :                 ereport(ERROR,
     936             :                         (errcode_for_file_access(),
     937             :                          errmsg("could not access status of transaction %u", xid),
     938             :                          errdetail("Could not write to file \"%s\" at offset %u: %m.",
     939             :                                    path, offset)));
     940             :             else
     941           0 :                 ereport(ERROR,
     942             :                         (errmsg("could not access status of transaction %u", xid),
     943             :                          errdetail("Could not write to file \"%s\" at offset %u: wrote too few bytes.",
     944             :                                    path, offset)));
     945             :             break;
     946             :         case SLRU_FSYNC_FAILED: /* the only case whose level comes from data_sync_elevel() */
     947           0 :             ereport(data_sync_elevel(ERROR),
     948             :                     (errcode_for_file_access(),
     949             :                      errmsg("could not access status of transaction %u", xid),
     950             :                      errdetail("Could not fsync file \"%s\": %m.",
     951             :                                path)));
     952           0 :             break;
     953             :         case SLRU_CLOSE_FAILED:
     954           0 :             ereport(ERROR,
     955             :                     (errcode_for_file_access(),
     956             :                      errmsg("could not access status of transaction %u", xid),
     957             :                      errdetail("Could not close file \"%s\": %m.",
     958             :                                path)));
     959             :             break;
     960             :         default:
     961             :             /* can't get here, we trust */
     962           0 :             elog(ERROR, "unrecognized SimpleLru error cause: %d",
     963             :                  (int) slru_errcause);
     964             :             break;
     965             :     }
     966           0 : }
     967             : 
     968             : /*
     969             :  * Select the slot to re-use when we need a free slot.
     970             :  *
     971             :  * The target page number is passed because we need to consider the
     972             :  * possibility that some other process reads in the target page while
     973             :  * we are doing I/O to free a slot.  Hence, check or recheck to see if
     974             :  * any slot already holds the target page, and return that slot if so.
     975             :  * Thus, the returned slot is *either* a slot already holding the pageno
     976             :  * (could be any state except EMPTY), *or* a freeable slot (state EMPTY
     977             :  * or CLEAN).
     978             :  *
     979             :  * Control lock must be held at entry, and will be held at exit.
                     :  *
                     :  * The return value is a slot index into the shared per-slot arrays.  The
                     :  * function loops until it can return: each pass either returns a slot,
                     :  * waits for I/O on a busy slot, or writes out a dirty victim and retries.
     980             :  */
     981             : static int
     982      271034 : SlruSelectLRUPage(SlruCtl ctl, int pageno)
     983             : {
     984      271034 :     SlruShared  shared = ctl->shared;
     985             : 
     986             :     /* Outer loop handles restart after I/O */
     987             :     for (;;)
     988           0 :     {
     989             :         int         slotno;
     990             :         int         cur_count;
     991      271034 :         int         bestvalidslot = 0;  /* keep compiler quiet */
     992      271034 :         int         best_valid_delta = -1;      /* stays -1 until a VALID candidate is recorded */
     993      271034 :         int         best_valid_page_number = 0; /* keep compiler quiet */
     994      271034 :         int         bestinvalidslot = 0;    /* keep compiler quiet */
     995      271034 :         int         best_invalid_delta = -1;    /* stays -1 until a non-VALID candidate is recorded */
     996      271034 :         int         best_invalid_page_number = 0;   /* keep compiler quiet */
     997             : 
     998             :         /* See if page already has a buffer assigned */
     999      372518 :         for (slotno = 0; slotno < shared->num_slots; slotno++)
    1000             :         {
    1001      730872 :             if (shared->page_number[slotno] == pageno &&
    1002      364198 :                 shared->page_status[slotno] != SLRU_PAGE_EMPTY)
    1003      265190 :                 return slotno;
    1004             :         }
    1005             : 
    1006             :         /*
    1007             :          * If we find any EMPTY slot, just select that one. Else choose a
    1008             :          * victim page to replace.  We normally take the least recently used
    1009             :          * valid page, but we will never take the slot containing
    1010             :          * latest_page_number, even if it appears least recently used.  We
    1011             :          * will select a slot that is already I/O busy only if there is no
    1012             :          * other choice: a read-busy slot will not be least recently used once
    1013             :          * the read finishes, and waiting for an I/O on a write-busy slot is
    1014             :          * inferior to just picking some other slot.  Testing shows the slot
    1015             :          * we pick instead will often be clean, allowing us to begin a read at
    1016             :          * once.
    1017             :          *
    1018             :          * Normally the page_lru_count values will all be different and so
    1019             :          * there will be a well-defined LRU page.  But since we allow
    1020             :          * concurrent execution of SlruRecentlyUsed() within
    1021             :          * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages
    1022             :          * acquire the same lru_count values.  In that case we break ties by
    1023             :          * choosing the furthest-back page.
    1024             :          *
    1025             :          * Notice that this next line forcibly advances cur_lru_count to a
    1026             :          * value that is certainly beyond any value that will be in the
    1027             :          * page_lru_count array after the loop finishes.  This ensures that
    1028             :          * the next execution of SlruRecentlyUsed will mark the page newly
    1029             :          * used, even if it's for a page that has the current counter value.
    1030             :          * That gets us back on the path to having good data when there are
    1031             :          * multiple pages with the same lru_count.
    1032             :          */
    1033        5844 :         cur_count = (shared->cur_lru_count)++;     /* cur_count keeps the value from before the increment */
    1034        5946 :         for (slotno = 0; slotno < shared->num_slots; slotno++)
    1035             :         {
    1036             :             int         this_delta;
    1037             :             int         this_page_number;
    1038             : 
    1039        5946 :             if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
    1040        5844 :                 return slotno;
    1041         102 :             this_delta = cur_count - shared->page_lru_count[slotno];   /* larger delta = less recently used */
    1042         102 :             if (this_delta < 0)
    1043             :             {
    1044             :                 /*
    1045             :                  * Clean up in case shared updates have caused cur_count
    1046             :                  * increments to get "lost".  We back off the page counts,
    1047             :                  * rather than trying to increase cur_count, to avoid any
    1048             :                  * question of infinite loops or failure in the presence of
    1049             :                  * wrapped-around counts.
    1050             :                  */
    1051           0 :                 shared->page_lru_count[slotno] = cur_count;
    1052           0 :                 this_delta = 0;
    1053             :             }
    1054         102 :             this_page_number = shared->page_number[slotno];
    1055         102 :             if (this_page_number == shared->latest_page_number)
    1056          38 :                 continue;       /* the slot holding the latest page is never a candidate */
    1057          64 :             if (shared->page_status[slotno] == SLRU_PAGE_VALID)
    1058             :             {
    1059          64 :                 if (this_delta > best_valid_delta ||
    1060           0 :                     (this_delta == best_valid_delta &&
    1061           0 :                      ctl->PagePrecedes(this_page_number,
    1062             :                                        best_valid_page_number)))
    1063             :                 {
    1064          24 :                     bestvalidslot = slotno;
    1065          24 :                     best_valid_delta = this_delta;
    1066          24 :                     best_valid_page_number = this_page_number;
    1067             :                 }
    1068             :             }
    1069             :             else
    1070             :             {
    1071           0 :                 if (this_delta > best_invalid_delta ||
    1072           0 :                     (this_delta == best_invalid_delta &&
    1073           0 :                      ctl->PagePrecedes(this_page_number,
    1074             :                                        best_invalid_page_number)))
    1075             :                 {
    1076           0 :                     bestinvalidslot = slotno;
    1077           0 :                     best_invalid_delta = this_delta;
    1078           0 :                     best_invalid_page_number = this_page_number;
    1079             :                 }
    1080             :             }
    1081             :         }
    1082             : 
    1083             :         /*
    1084             :          * If all pages (except possibly the latest one) are I/O busy, we'll
    1085             :          * have to wait for an I/O to complete and then retry.  In that
    1086             :          * unhappy case, we choose to wait for the I/O on the least recently
    1087             :          * used slot, on the assumption that it was likely initiated first of
    1088             :          * all the I/Os in progress and may therefore finish first.
                     :          *
                     :          * NOTE(review): best_invalid_delta is not checked here, so if no
                     :          * non-VALID candidate was recorded either, bestinvalidslot is
                     :          * still its initial value 0 -- confirm this cannot happen.
    1089             :          */
    1090           0 :         if (best_valid_delta < 0)
    1091             :         {
    1092           0 :             SimpleLruWaitIO(ctl, bestinvalidslot);
    1093           0 :             continue;
    1094             :         }
    1095             : 
    1096             :         /*
    1097             :          * If the selected page is clean, we're set.
    1098             :          */
    1099           0 :         if (!shared->page_dirty[bestvalidslot])
    1100           0 :             return bestvalidslot;
    1101             : 
    1102             :         /*
    1103             :          * Write the page.
                     :          * fdata is passed as NULL for this write.
    1104             :          */
    1105           0 :         SlruInternalWritePage(ctl, bestvalidslot, NULL);
    1106             : 
    1107             :         /*
    1108             :          * Now loop back and try again.  This is the easiest way of dealing
    1109             :          * with corner cases such as the victim page being re-dirtied while we
    1110             :          * wrote it.
    1111             :          */
    1112             :     }
    1113             : }
    1114             : 
    1115             : /*
    1116             :  * Flush dirty pages to disk during checkpoint or database shutdown
    1117             :  */
    1118             : void
    1119       19300 : SimpleLruFlush(SlruCtl ctl, bool allow_redirtied)
    1120             : {
    1121       19300 :     SlruShared  shared = ctl->shared;
    1122             :     SlruFlushData fdata;
    1123             :     int         slotno;
    1124       19300 :     int         pageno = 0;
    1125             :     int         i;
    1126             :     bool        ok;
    1127             : 
    1128             :     /*
    1129             :      * Find and write dirty pages
    1130             :      */
    1131       19300 :     fdata.num_files = 0;
    1132             : 
    1133       19300 :     LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
    1134             : 
    1135      405604 :     for (slotno = 0; slotno < shared->num_slots; slotno++)
    1136             :     {
    1137      386304 :         SlruInternalWritePage(ctl, slotno, &fdata);
    1138             : 
    1139             :         /*
    1140             :          * In some places (e.g. checkpoints), we cannot assert that the slot
    1141             :          * is clean now, since another process might have re-dirtied it
    1142             :          * already.  That's okay.
    1143             :          */
    1144             :         Assert(allow_redirtied ||
    1145             :                shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
    1146             :                (shared->page_status[slotno] == SLRU_PAGE_VALID &&
    1147             :                 !shared->page_dirty[slotno]));
    1148             :     }
    1149             : 
    1150       19300 :     LWLockRelease(shared->ControlLock);
    1151             : 
    1152             :     /*
    1153             :      * Now fsync and close any files that were open
    1154             :      */
    1155       19300 :     ok = true;
    1156       23354 :     for (i = 0; i < fdata.num_files; i++)
    1157             :     {
    1158        4054 :         pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC);
    1159        4054 :         if (ctl->do_fsync && pg_fsync(fdata.fd[i]) != 0)
    1160             :         {
    1161           0 :             slru_errcause = SLRU_FSYNC_FAILED;
    1162           0 :             slru_errno = errno;
    1163           0 :             pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
    1164           0 :             ok = false;
    1165             :         }
    1166        4054 :         pgstat_report_wait_end();
    1167             : 
    1168        4054 :         if (CloseTransientFile(fdata.fd[i]) != 0)
    1169             :         {
    1170           0 :             slru_errcause = SLRU_CLOSE_FAILED;
    1171           0 :             slru_errno = errno;
    1172           0 :             pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
    1173           0 :             ok = false;
    1174             :         }
    1175             :     }
    1176       19300 :     if (!ok)
    1177           0 :         SlruReportIOError(ctl, pageno, InvalidTransactionId);
    1178       19300 : }
    1179             : 
    1180             : /*
    1181             :  * Remove all segments before the one holding the passed page number
    1182             :  */
    1183             : void
    1184        2784 : SimpleLruTruncate(SlruCtl ctl, int cutoffPage)
    1185             : {
    1186        2784 :     SlruShared  shared = ctl->shared;
    1187             :     int         slotno;
    1188             : 
    1189             :     /*
    1190             :      * The cutoff point is the start of the segment containing cutoffPage.
    1191             :      */
    1192        2784 :     cutoffPage -= cutoffPage % SLRU_PAGES_PER_SEGMENT;
    1193             : 
    1194             :     /*
    1195             :      * Scan shared memory and remove any pages preceding the cutoff page, to
    1196             :      * ensure we won't rewrite them later.  (Since this is normally called in
    1197             :      * or just after a checkpoint, any dirty pages should have been flushed
    1198             :      * already ... we're just being extra careful here.)
    1199             :      */
    1200        2784 :     LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
    1201             : 
    1202             : restart:;
    1203             : 
    1204             :     /*
    1205             :      * While we are holding the lock, make an important safety check: the
    1206             :      * planned cutoff point must be <= the current endpoint page. Otherwise we
    1207             :      * have already wrapped around, and proceeding with the truncation would
    1208             :      * risk removing the current segment.
    1209             :      */
    1210        2784 :     if (ctl->PagePrecedes(shared->latest_page_number, cutoffPage))
    1211             :     {
    1212           0 :         LWLockRelease(shared->ControlLock);
    1213           0 :         ereport(LOG,
    1214             :                 (errmsg("could not truncate directory \"%s\": apparent wraparound",
    1215             :                         ctl->Dir)));
    1216           0 :         return;
    1217             :     }
    1218             : 
    1219       91872 :     for (slotno = 0; slotno < shared->num_slots; slotno++)
    1220             :     {
    1221       89088 :         if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
    1222       86222 :             continue;
    1223        2866 :         if (!ctl->PagePrecedes(shared->page_number[slotno], cutoffPage))
    1224        2866 :             continue;
    1225             : 
    1226             :         /*
    1227             :          * If page is clean, just change state to EMPTY (expected case).
    1228             :          */
    1229           0 :         if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
    1230           0 :             !shared->page_dirty[slotno])
    1231             :         {
    1232           0 :             shared->page_status[slotno] = SLRU_PAGE_EMPTY;
    1233           0 :             continue;
    1234             :         }
    1235             : 
    1236             :         /*
    1237             :          * Hmm, we have (or may have) I/O operations acting on the page, so
    1238             :          * we've got to wait for them to finish and then start again. This is
    1239             :          * the same logic as in SlruSelectLRUPage.  (XXX if page is dirty,
    1240             :          * wouldn't it be OK to just discard it without writing it?  For now,
    1241             :          * keep the logic the same as it was.)
    1242             :          */
    1243           0 :         if (shared->page_status[slotno] == SLRU_PAGE_VALID)
    1244           0 :             SlruInternalWritePage(ctl, slotno, NULL);
    1245             :         else
    1246           0 :             SimpleLruWaitIO(ctl, slotno);
    1247           0 :         goto restart;
    1248             :     }
    1249             : 
    1250        2784 :     LWLockRelease(shared->ControlLock);
    1251             : 
    1252             :     /* Now we can remove the old segment(s) */
    1253        2784 :     (void) SlruScanDirectory(ctl, SlruScanDirCbDeleteCutoff, &cutoffPage);
    1254             : }
    1255             : 
    1256             : /*
    1257             :  * Delete an individual SLRU segment, identified by the filename.
    1258             :  *
    1259             :  * NB: This does not touch the SLRU buffers themselves, callers have to ensure
    1260             :  * they either can't yet contain anything, or have already been cleaned out.
    1261             :  */
    1262             : static void
    1263        1494 : SlruInternalDeleteSegment(SlruCtl ctl, char *filename)
    1264             : {
    1265             :     char        path[MAXPGPATH];
    1266             : 
    1267        1494 :     snprintf(path, MAXPGPATH, "%s/%s", ctl->Dir, filename);
    1268        1494 :     ereport(DEBUG2,
    1269             :             (errmsg("removing file \"%s\"", path)));
    1270        1494 :     unlink(path);
    1271        1494 : }
    1272             : 
    1273             : /*
    1274             :  * Delete an individual SLRU segment, identified by the segment number.
    1275             :  */
    1276             : void
    1277           0 : SlruDeleteSegment(SlruCtl ctl, int segno)
    1278             : {
    1279           0 :     SlruShared  shared = ctl->shared;
    1280             :     int         slotno;
    1281             :     char        path[MAXPGPATH];
    1282             :     bool        did_write;
    1283             : 
    1284             :     /* Clean out any possibly existing references to the segment. */
    1285           0 :     LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
    1286             : restart:
    1287           0 :     did_write = false;
    1288           0 :     for (slotno = 0; slotno < shared->num_slots; slotno++)
    1289             :     {
    1290           0 :         int         pagesegno = shared->page_number[slotno] / SLRU_PAGES_PER_SEGMENT;
    1291             : 
    1292           0 :         if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
    1293           0 :             continue;
    1294             : 
    1295             :         /* not the segment we're looking for */
    1296           0 :         if (pagesegno != segno)
    1297           0 :             continue;
    1298             : 
    1299             :         /* If page is clean, just change state to EMPTY (expected case). */
    1300           0 :         if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
    1301           0 :             !shared->page_dirty[slotno])
    1302             :         {
    1303           0 :             shared->page_status[slotno] = SLRU_PAGE_EMPTY;
    1304           0 :             continue;
    1305             :         }
    1306             : 
    1307             :         /* Same logic as SimpleLruTruncate() */
    1308           0 :         if (shared->page_status[slotno] == SLRU_PAGE_VALID)
    1309           0 :             SlruInternalWritePage(ctl, slotno, NULL);
    1310             :         else
    1311           0 :             SimpleLruWaitIO(ctl, slotno);
    1312             : 
    1313           0 :         did_write = true;
    1314             :     }
    1315             : 
    1316             :     /*
    1317             :      * Be extra careful and re-check. The IO functions release the control
    1318             :      * lock, so new pages could have been read in.
    1319             :      */
    1320           0 :     if (did_write)
    1321           0 :         goto restart;
    1322             : 
    1323           0 :     snprintf(path, MAXPGPATH, "%s/%04X", ctl->Dir, segno);
    1324           0 :     ereport(DEBUG2,
    1325             :             (errmsg("removing file \"%s\"", path)));
    1326           0 :     unlink(path);
    1327             : 
    1328           0 :     LWLockRelease(shared->ControlLock);
    1329           0 : }
    1330             : 
    1331             : /*
    1332             :  * SlruScanDirectory callback
    1333             :  *      This callback reports true if there's any segment prior to the one
    1334             :  *      containing the page passed as "data".
    1335             :  */
    1336             : bool
    1337         700 : SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, int segpage, void *data)
    1338             : {
    1339         700 :     int         cutoffPage = *(int *) data;
    1340             : 
    1341         700 :     cutoffPage -= cutoffPage % SLRU_PAGES_PER_SEGMENT;
    1342             : 
    1343         700 :     if (ctl->PagePrecedes(segpage, cutoffPage))
    1344           0 :         return true;            /* found one; don't iterate any more */
    1345             : 
    1346         700 :     return false;               /* keep going */
    1347             : }
    1348             : 
    1349             : /*
    1350             :  * SlruScanDirectory callback.
    1351             :  *      This callback deletes segments prior to the one passed in as "data".
    1352             :  */
    1353             : static bool
    1354        2784 : SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int segpage, void *data)
    1355             : {
    1356        2784 :     int         cutoffPage = *(int *) data;
    1357             : 
    1358        2784 :     if (ctl->PagePrecedes(segpage, cutoffPage))
    1359           0 :         SlruInternalDeleteSegment(ctl, filename);
    1360             : 
    1361        2784 :     return false;               /* keep going */
    1362             : }
    1363             : 
    1364             : /*
    1365             :  * SlruScanDirectory callback.
    1366             :  *      This callback deletes all segments.
    1367             :  */
    1368             : bool
    1369        1494 : SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int segpage, void *data)
    1370             : {
    1371        1494 :     SlruInternalDeleteSegment(ctl, filename);
    1372             : 
    1373        1494 :     return false;               /* keep going */
    1374             : }
    1375             : 
    1376             : /*
    1377             :  * Scan the SimpleLru directory and apply a callback to each file found in it.
    1378             :  *
    1379             :  * If the callback returns true, the scan is stopped.  The last return value
    1380             :  * from the callback is returned.
    1381             :  *
    1382             :  * The callback receives the following arguments: 1. the SlruCtl struct for the
    1383             :  * slru being truncated; 2. the filename being considered; 3. the page number
    1384             :  * for the first page of that file; 4. a pointer to the opaque data given to us
    1385             :  * by the caller.
    1386             :  *
    1387             :  * Note that the ordering in which the directory is scanned is not guaranteed.
    1388             :  *
    1389             :  * Note that no locking is applied.
    1390             :  */
    1391             : bool
    1392        7226 : SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
    1393             : {
    1394        7226 :     bool        retval = false;
    1395             :     DIR        *cldir;
    1396             :     struct dirent *clde;
    1397             :     int         segno;
    1398             :     int         segpage;
    1399             : 
    1400        7226 :     cldir = AllocateDir(ctl->Dir);
    1401        7226 :     while ((clde = ReadDir(cldir, ctl->Dir)) != NULL)
    1402             :     {
    1403             :         size_t      len;
    1404             : 
    1405       19430 :         len = strlen(clde->d_name);
    1406             : 
    1407       24408 :         if ((len == 4 || len == 5 || len == 6) &&
    1408        4978 :             strspn(clde->d_name, "0123456789ABCDEF") == len)
    1409             :         {
    1410        4978 :             segno = (int) strtol(clde->d_name, NULL, 16);
    1411        4978 :             segpage = segno * SLRU_PAGES_PER_SEGMENT;
    1412             : 
    1413        4978 :             elog(DEBUG2, "SlruScanDirectory invoking callback on %s/%s",
    1414             :                  ctl->Dir, clde->d_name);
    1415        4978 :             retval = callback(ctl, clde->d_name, segpage, data);
    1416        4978 :             if (retval)
    1417           0 :                 break;
    1418             :         }
    1419             :     }
    1420        7226 :     FreeDir(cldir);
    1421             : 
    1422        7226 :     return retval;
    1423             : }

Generated by: LCOV version 1.13