LCOV - code coverage report
Current view: top level - src/backend/storage/ipc - latch.c (source / functions) Hit Total Coverage
Test: PostgreSQL 13devel Lines: 228 261 87.4 %
Date: 2019-09-19 23:07:04 Functions: 19 19 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * latch.c
       4             :  *    Routines for inter-process latches
       5             :  *
       6             :  * The Unix implementation uses the so-called self-pipe trick to overcome the
       7             :  * race condition involved with poll() (or epoll_wait() on linux) and setting
       8             :  * a global flag in the signal handler. When a latch is set and the current
       9             :  * process is waiting for it, the signal handler wakes up the poll() in
      10             :  * WaitLatch by writing a byte to a pipe. A signal by itself doesn't interrupt
      11             :  * poll() on all platforms, and even on platforms where it does, a signal that
      12             :  * arrives just before the poll() call does not prevent poll() from entering
      13             :  * sleep. An incoming byte on a pipe however reliably interrupts the sleep,
      14             :  * and causes poll() to return immediately even if the signal arrives before
      15             :  * poll() begins.
      16             :  *
      17             :  * When SetLatch is called from the same process that owns the latch,
      18             :  * SetLatch writes the byte directly to the pipe. If it's owned by another
      19             :  * process, SIGUSR1 is sent and the signal handler in the waiting process
      20             :  * writes the byte to the pipe on behalf of the signaling process.
      21             :  *
      22             :  * The Windows implementation uses Windows events that are inherited by all
      23             :  * postmaster child processes. There's no need for the self-pipe trick there.
      24             :  *
      25             :  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
      26             :  * Portions Copyright (c) 1994, Regents of the University of California
      27             :  *
      28             :  * IDENTIFICATION
      29             :  *    src/backend/storage/ipc/latch.c
      30             :  *
      31             :  *-------------------------------------------------------------------------
      32             :  */
      33             : #include "postgres.h"
      34             : 
      35             : #include <fcntl.h>
      36             : #include <limits.h>
      37             : #include <signal.h>
      38             : #include <unistd.h>
      39             : #ifdef HAVE_SYS_EPOLL_H
      40             : #include <sys/epoll.h>
      41             : #endif
      42             : #ifdef HAVE_POLL_H
      43             : #include <poll.h>
      44             : #endif
      45             : 
      46             : #include "miscadmin.h"
      47             : #include "pgstat.h"
      48             : #include "port/atomics.h"
      49             : #include "portability/instr_time.h"
      50             : #include "postmaster/postmaster.h"
      51             : #include "storage/ipc.h"
      52             : #include "storage/latch.h"
      53             : #include "storage/pmsignal.h"
      54             : #include "storage/shmem.h"
      55             : 
      56             : /*
      57             :  * Select the fd readiness primitive to use. Normally the "most modern"
      58             :  * primitive supported by the OS will be used, but for testing it can be
      59             :  * useful to manually specify the used primitive.  If desired, just add a
      60             :  * define somewhere before this block.
      61             :  */
      62             : #if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \
      63             :     defined(WAIT_USE_WIN32)
      64             : /* don't overwrite manual choice */
      65             : #elif defined(HAVE_SYS_EPOLL_H)
      66             : #define WAIT_USE_EPOLL
      67             : #elif defined(HAVE_POLL)
      68             : #define WAIT_USE_POLL
      69             : #elif WIN32
      70             : #define WAIT_USE_WIN32
      71             : #else
      72             : #error "no wait set implementation available"
      73             : #endif
      74             : 
      75             : /* typedef in latch.h */
      76             : struct WaitEventSet
      77             : {
      78             :     int         nevents;        /* number of registered events */
      79             :     int         nevents_space;  /* maximum number of events in this set */
      80             : 
      81             :     /*
      82             :      * Array, of nevents_space length, storing the definition of events this
      83             :      * set is waiting for.
      84             :      */
      85             :     WaitEvent  *events;
      86             : 
      87             :     /*
      88             :      * If WL_LATCH_SET is specified in any wait event, latch is a pointer to
      89             :      * said latch, and latch_pos the offset in the ->events array. This is
      90             :      * useful because we check the state of the latch before performing doing
      91             :      * syscalls related to waiting.
      92             :      */
      93             :     Latch      *latch;
      94             :     int         latch_pos;
      95             : 
      96             :     /*
      97             :      * WL_EXIT_ON_PM_DEATH is converted to WL_POSTMASTER_DEATH, but this flag
      98             :      * is set so that we'll exit immediately if postmaster death is detected,
      99             :      * instead of returning.
     100             :      */
     101             :     bool        exit_on_postmaster_death;
     102             : 
     103             : #if defined(WAIT_USE_EPOLL)
     104             :     int         epoll_fd;
     105             :     /* epoll_wait returns events in a user provided arrays, allocate once */
     106             :     struct epoll_event *epoll_ret_events;
     107             : #elif defined(WAIT_USE_POLL)
     108             :     /* poll expects events to be waited on every poll() call, prepare once */
     109             :     struct pollfd *pollfds;
     110             : #elif defined(WAIT_USE_WIN32)
     111             : 
     112             :     /*
     113             :      * Array of windows events. The first element always contains
     114             :      * pgwin32_signal_event, so the remaining elements are offset by one (i.e.
     115             :      * event->pos + 1).
     116             :      */
     117             :     HANDLE     *handles;
     118             : #endif
     119             : };
     120             : 
     121             : #ifndef WIN32
     122             : /* Are we currently in WaitLatch? The signal handler would like to know. */
     123             : static volatile sig_atomic_t waiting = false;
     124             : 
     125             : /* Read and write ends of the self-pipe */
     126             : static int  selfpipe_readfd = -1;
     127             : static int  selfpipe_writefd = -1;
     128             : 
     129             : /* Process owning the self-pipe --- needed for checking purposes */
     130             : static int  selfpipe_owner_pid = 0;
     131             : 
     132             : /* Private function prototypes */
     133             : static void sendSelfPipeByte(void);
     134             : static void drainSelfPipe(void);
     135             : #endif                          /* WIN32 */
     136             : 
     137             : #if defined(WAIT_USE_EPOLL)
     138             : static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action);
     139             : #elif defined(WAIT_USE_POLL)
     140             : static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event);
     141             : #elif defined(WAIT_USE_WIN32)
     142             : static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event);
     143             : #endif
     144             : 
     145             : static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
     146             :                                         WaitEvent *occurred_events, int nevents);
     147             : 
     148             : /*
     149             :  * Initialize the process-local latch infrastructure.
     150             :  *
     151             :  * This must be called once during startup of any process that can wait on
     152             :  * latches, before it issues any InitLatch() or OwnLatch() calls.
     153             :  */
     154             : void
     155       13028 : InitializeLatchSupport(void)
     156             : {
     157             : #ifndef WIN32
     158             :     int         pipefd[2];
     159             : 
     160       13028 :     if (IsUnderPostmaster)
     161             :     {
     162             :         /*
     163             :          * We might have inherited connections to a self-pipe created by the
     164             :          * postmaster.  It's critical that child processes create their own
     165             :          * self-pipes, of course, and we really want them to close the
     166             :          * inherited FDs for safety's sake.
     167             :          */
     168       11746 :         if (selfpipe_owner_pid != 0)
     169             :         {
     170             :             /* Assert we go through here but once in a child process */
     171             :             Assert(selfpipe_owner_pid != MyProcPid);
     172             :             /* Release postmaster's pipe FDs; ignore any error */
     173           0 :             (void) close(selfpipe_readfd);
     174           0 :             (void) close(selfpipe_writefd);
     175             :             /* Clean up, just for safety's sake; we'll set these below */
     176           0 :             selfpipe_readfd = selfpipe_writefd = -1;
     177           0 :             selfpipe_owner_pid = 0;
     178             :         }
     179             :         else
     180             :         {
     181             :             /*
     182             :              * Postmaster didn't create a self-pipe ... or else we're in an
     183             :              * EXEC_BACKEND build, in which case it doesn't matter since the
     184             :              * postmaster's pipe FDs were closed by the action of FD_CLOEXEC.
     185             :              */
     186             :             Assert(selfpipe_readfd == -1);
     187             :         }
     188             :     }
     189             :     else
     190             :     {
     191             :         /* In postmaster or standalone backend, assert we do this but once */
     192             :         Assert(selfpipe_readfd == -1);
     193             :         Assert(selfpipe_owner_pid == 0);
     194             :     }
     195             : 
     196             :     /*
     197             :      * Set up the self-pipe that allows a signal handler to wake up the
     198             :      * poll()/epoll_wait() in WaitLatch. Make the write-end non-blocking, so
     199             :      * that SetLatch won't block if the event has already been set many times
     200             :      * filling the kernel buffer. Make the read-end non-blocking too, so that
     201             :      * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK.
     202             :      * Also, make both FDs close-on-exec, since we surely do not want any
     203             :      * child processes messing with them.
     204             :      */
     205       13028 :     if (pipe(pipefd) < 0)
     206           0 :         elog(FATAL, "pipe() failed: %m");
     207       13028 :     if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1)
     208           0 :         elog(FATAL, "fcntl(F_SETFL) failed on read-end of self-pipe: %m");
     209       13028 :     if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1)
     210           0 :         elog(FATAL, "fcntl(F_SETFL) failed on write-end of self-pipe: %m");
     211       13028 :     if (fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) == -1)
     212           0 :         elog(FATAL, "fcntl(F_SETFD) failed on read-end of self-pipe: %m");
     213       13028 :     if (fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) == -1)
     214           0 :         elog(FATAL, "fcntl(F_SETFD) failed on write-end of self-pipe: %m");
     215             : 
     216       13028 :     selfpipe_readfd = pipefd[0];
     217       13028 :     selfpipe_writefd = pipefd[1];
     218       13028 :     selfpipe_owner_pid = MyProcPid;
     219             : #else
     220             :     /* currently, nothing to do here for Windows */
     221             : #endif
     222       13028 : }
     223             : 
     224             : /*
     225             :  * Initialize a process-local latch.
     226             :  */
     227             : void
     228       13028 : InitLatch(Latch *latch)
     229             : {
     230       13028 :     latch->is_set = false;
     231       13028 :     latch->owner_pid = MyProcPid;
     232       13028 :     latch->is_shared = false;
     233             : 
     234             : #ifndef WIN32
     235             :     /* Assert InitializeLatchSupport has been called in this process */
     236             :     Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
     237             : #else
     238             :     latch->event = CreateEvent(NULL, TRUE, FALSE, NULL);
     239             :     if (latch->event == NULL)
     240             :         elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
     241             : #endif                          /* WIN32 */
     242       13028 : }
     243             : 
     244             : /*
     245             :  * Initialize a shared latch that can be set from other processes. The latch
     246             :  * is initially owned by no-one; use OwnLatch to associate it with the
     247             :  * current process.
     248             :  *
     249             :  * InitSharedLatch needs to be called in postmaster before forking child
     250             :  * processes, usually right after allocating the shared memory block
     251             :  * containing the latch with ShmemInitStruct. (The Unix implementation
     252             :  * doesn't actually require that, but the Windows one does.) Because of
     253             :  * this restriction, we have no concurrency issues to worry about here.
     254             :  *
     255             :  * Note that other handles created in this module are never marked as
     256             :  * inheritable.  Thus we do not need to worry about cleaning up child
     257             :  * process references to postmaster-private latches or WaitEventSets.
     258             :  */
     259             : void
     260      207548 : InitSharedLatch(Latch *latch)
     261             : {
     262             : #ifdef WIN32
     263             :     SECURITY_ATTRIBUTES sa;
     264             : 
     265             :     /*
     266             :      * Set up security attributes to specify that the events are inherited.
     267             :      */
     268             :     ZeroMemory(&sa, sizeof(sa));
     269             :     sa.nLength = sizeof(sa);
     270             :     sa.bInheritHandle = TRUE;
     271             : 
     272             :     latch->event = CreateEvent(&sa, TRUE, FALSE, NULL);
     273             :     if (latch->event == NULL)
     274             :         elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
     275             : #endif
     276             : 
     277      207548 :     latch->is_set = false;
     278      207548 :     latch->owner_pid = 0;
     279      207548 :     latch->is_shared = true;
     280      207548 : }
     281             : 
     282             : /*
     283             :  * Associate a shared latch with the current process, allowing it to
     284             :  * wait on the latch.
     285             :  *
     286             :  * Although there is a sanity check for latch-already-owned, we don't do
     287             :  * any sort of locking here, meaning that we could fail to detect the error
     288             :  * if two processes try to own the same latch at about the same time.  If
     289             :  * there is any risk of that, caller must provide an interlock to prevent it.
     290             :  *
     291             :  * In any process that calls OwnLatch(), make sure that
     292             :  * latch_sigusr1_handler() is called from the SIGUSR1 signal handler,
     293             :  * as shared latches use SIGUSR1 for inter-process communication.
     294             :  */
     295             : void
     296       11788 : OwnLatch(Latch *latch)
     297             : {
     298             :     /* Sanity checks */
     299             :     Assert(latch->is_shared);
     300             : 
     301             : #ifndef WIN32
     302             :     /* Assert InitializeLatchSupport has been called in this process */
     303             :     Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
     304             : #endif
     305             : 
     306       11788 :     if (latch->owner_pid != 0)
     307           0 :         elog(ERROR, "latch already owned");
     308             : 
     309       11788 :     latch->owner_pid = MyProcPid;
     310       11788 : }
     311             : 
     312             : /*
     313             :  * Disown a shared latch currently owned by the current process.
     314             :  */
     315             : void
     316       11766 : DisownLatch(Latch *latch)
     317             : {
     318             :     Assert(latch->is_shared);
     319             :     Assert(latch->owner_pid == MyProcPid);
     320             : 
     321       11766 :     latch->owner_pid = 0;
     322       11766 : }
     323             : 
     324             : /*
     325             :  * Wait for a given latch to be set, or for postmaster death, or until timeout
     326             :  * is exceeded. 'wakeEvents' is a bitmask that specifies which of those events
     327             :  * to wait for. If the latch is already set (and WL_LATCH_SET is given), the
     328             :  * function returns immediately.
     329             :  *
     330             :  * The "timeout" is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag
     331             :  * is given.  Although it is declared as "long", we don't actually support
     332             :  * timeouts longer than INT_MAX milliseconds.  Note that some extra overhead
     333             :  * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible.
     334             :  *
     335             :  * The latch must be owned by the current process, ie. it must be a
     336             :  * process-local latch initialized with InitLatch, or a shared latch
     337             :  * associated with the current process by calling OwnLatch.
     338             :  *
     339             :  * Returns bit mask indicating which condition(s) caused the wake-up. Note
     340             :  * that if multiple wake-up conditions are true, there is no guarantee that
     341             :  * we return all of them in one call, but we will return at least one.
     342             :  */
     343             : int
     344      748468 : WaitLatch(Latch *latch, int wakeEvents, long timeout,
     345             :           uint32 wait_event_info)
     346             : {
     347      748468 :     return WaitLatchOrSocket(latch, wakeEvents, PGINVALID_SOCKET, timeout,
     348             :                              wait_event_info);
     349             : }
     350             : 
     351             : /*
     352             :  * Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
     353             :  * conditions.
     354             :  *
     355             :  * When waiting on a socket, EOF and error conditions always cause the socket
     356             :  * to be reported as readable/writable/connected, so that the caller can deal
     357             :  * with the condition.
     358             :  *
     359             :  * wakeEvents must include either WL_EXIT_ON_PM_DEATH for automatic exit
     360             :  * if the postmaster dies or WL_POSTMASTER_DEATH for a flag set in the
     361             :  * return value if the postmaster dies.  The latter is useful for rare cases
     362             :  * where some behavior other than immediate exit is needed.
     363             :  *
     364             :  * NB: These days this is just a wrapper around the WaitEventSet API. When
     365             :  * using a latch very frequently, consider creating a longer living
     366             :  * WaitEventSet instead; that's more efficient.
     367             :  */
     368             : int
     369      835758 : WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock,
     370             :                   long timeout, uint32 wait_event_info)
     371             : {
     372      835758 :     int         ret = 0;
     373             :     int         rc;
     374             :     WaitEvent   event;
     375      835758 :     WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 3);
     376             : 
     377      835758 :     if (wakeEvents & WL_TIMEOUT)
     378             :         Assert(timeout >= 0);
     379             :     else
     380      793988 :         timeout = -1;
     381             : 
     382      835758 :     if (wakeEvents & WL_LATCH_SET)
     383      835704 :         AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
     384             :                           latch, NULL);
     385             : 
     386             :     /* Postmaster-managed callers must handle postmaster death somehow. */
     387             :     Assert(!IsUnderPostmaster ||
     388             :            (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
     389             :            (wakeEvents & WL_POSTMASTER_DEATH));
     390             : 
     391      835758 :     if ((wakeEvents & WL_POSTMASTER_DEATH) && IsUnderPostmaster)
     392       66980 :         AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
     393             :                           NULL, NULL);
     394             : 
     395      835758 :     if ((wakeEvents & WL_EXIT_ON_PM_DEATH) && IsUnderPostmaster)
     396      768778 :         AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
     397             :                           NULL, NULL);
     398             : 
     399      835758 :     if (wakeEvents & WL_SOCKET_MASK)
     400             :     {
     401             :         int         ev;
     402             : 
     403       87290 :         ev = wakeEvents & WL_SOCKET_MASK;
     404       87290 :         AddWaitEventToSet(set, ev, sock, NULL, NULL);
     405             :     }
     406             : 
     407      835758 :     rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
     408             : 
     409      835718 :     if (rc == 0)
     410       17556 :         ret |= WL_TIMEOUT;
     411             :     else
     412             :     {
     413      818162 :         ret |= event.events & (WL_LATCH_SET |
     414             :                                WL_POSTMASTER_DEATH |
     415             :                                WL_SOCKET_MASK);
     416             :     }
     417             : 
     418      835718 :     FreeWaitEventSet(set);
     419             : 
     420      835718 :     return ret;
     421             : }
     422             : 
     423             : /*
     424             :  * Sets a latch and wakes up anyone waiting on it.
     425             :  *
     426             :  * This is cheap if the latch is already set, otherwise not so much.
     427             :  *
     428             :  * NB: when calling this in a signal handler, be sure to save and restore
     429             :  * errno around it.  (That's standard practice in most signal handlers, of
     430             :  * course, but we used to omit it in handlers that only set a flag.)
     431             :  *
     432             :  * NB: this function is called from critical sections and signal handlers so
     433             :  * throwing an error is not a good idea.
     434             :  */
     435             : void
     436     2252992 : SetLatch(Latch *latch)
     437             : {
     438             : #ifndef WIN32
     439             :     pid_t       owner_pid;
     440             : #else
     441             :     HANDLE      handle;
     442             : #endif
     443             : 
     444             :     /*
     445             :      * The memory barrier has to be placed here to ensure that any flag
     446             :      * variables possibly changed by this process have been flushed to main
     447             :      * memory, before we check/set is_set.
     448             :      */
     449     2252992 :     pg_memory_barrier();
     450             : 
     451             :     /* Quick exit if already set */
     452     2252992 :     if (latch->is_set)
     453     1487648 :         return;
     454             : 
     455      765344 :     latch->is_set = true;
     456             : 
     457             : #ifndef WIN32
     458             : 
     459             :     /*
     460             :      * See if anyone's waiting for the latch. It can be the current process if
     461             :      * we're in a signal handler. We use the self-pipe to wake up the
     462             :      * poll()/epoll_wait() in that case. If it's another process, send a
     463             :      * signal.
     464             :      *
     465             :      * Fetch owner_pid only once, in case the latch is concurrently getting
     466             :      * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't
     467             :      * guaranteed to be true! In practice, the effective range of pid_t fits
     468             :      * in a 32 bit integer, and so should be atomic. In the worst case, we
     469             :      * might end up signaling the wrong process. Even then, you're very
     470             :      * unlucky if a process with that bogus pid exists and belongs to
     471             :      * Postgres; and PG database processes should handle excess SIGUSR1
     472             :      * interrupts without a problem anyhow.
     473             :      *
     474             :      * Another sort of race condition that's possible here is for a new
     475             :      * process to own the latch immediately after we look, so we don't signal
     476             :      * it. This is okay so long as all callers of ResetLatch/WaitLatch follow
     477             :      * the standard coding convention of waiting at the bottom of their loops,
     478             :      * not the top, so that they'll correctly process latch-setting events
     479             :      * that happen before they enter the loop.
     480             :      */
     481      765344 :     owner_pid = latch->owner_pid;
     482      765344 :     if (owner_pid == 0)
     483          28 :         return;
     484      765316 :     else if (owner_pid == MyProcPid)
     485             :     {
     486       38130 :         if (waiting)
     487       11072 :             sendSelfPipeByte();
     488             :     }
     489             :     else
     490      727186 :         kill(owner_pid, SIGUSR1);
     491             : #else
     492             : 
     493             :     /*
     494             :      * See if anyone's waiting for the latch. It can be the current process if
     495             :      * we're in a signal handler.
     496             :      *
     497             :      * Use a local variable here just in case somebody changes the event field
     498             :      * concurrently (which really should not happen).
     499             :      */
     500             :     handle = latch->event;
     501             :     if (handle)
     502             :     {
     503             :         SetEvent(handle);
     504             : 
     505             :         /*
     506             :          * Note that we silently ignore any errors. We might be in a signal
     507             :          * handler or other critical path where it's not safe to call elog().
     508             :          */
     509             :     }
     510             : #endif
     511             : 
     512             : }
     513             : 
     514             : /*
     515             :  * Clear the latch. Calling WaitLatch after this will sleep, unless
     516             :  * the latch is set again before the WaitLatch call.
     517             :  */
     518             : void
     519      871880 : ResetLatch(Latch *latch)
     520             : {
     521             :     /* Only the owner should reset the latch */
     522             :     Assert(latch->owner_pid == MyProcPid);
     523             : 
     524      871880 :     latch->is_set = false;
     525             : 
     526             :     /*
     527             :      * Ensure that the write to is_set gets flushed to main memory before we
     528             :      * examine any flag variables.  Otherwise a concurrent SetLatch might
     529             :      * falsely conclude that it needn't signal us, even though we have missed
     530             :      * seeing some flag updates that SetLatch was supposed to inform us of.
     531             :      */
     532      871880 :     pg_memory_barrier();
     533      871880 : }
     534             : 
     535             : /*
     536             :  * Create a WaitEventSet with space for nevents different events to wait for.
     537             :  *
     538             :  * These events can then be efficiently waited upon together, using
     539             :  * WaitEventSetWait().
     540             :  */
     541             : WaitEventSet *
     542      843310 : CreateWaitEventSet(MemoryContext context, int nevents)
     543             : {
     544             :     WaitEventSet *set;
     545             :     char       *data;
     546      843310 :     Size        sz = 0;
     547             : 
     548             :     /*
     549             :      * Use MAXALIGN size/alignment to guarantee that later uses of memory are
     550             :      * aligned correctly. E.g. epoll_event might need 8 byte alignment on some
     551             :      * platforms, but earlier allocations like WaitEventSet and WaitEvent
     552             :      * might not sized to guarantee that when purely using sizeof().
     553             :      */
     554      843310 :     sz += MAXALIGN(sizeof(WaitEventSet));
     555      843310 :     sz += MAXALIGN(sizeof(WaitEvent) * nevents);
     556             : 
     557             : #if defined(WAIT_USE_EPOLL)
     558      843310 :     sz += MAXALIGN(sizeof(struct epoll_event) * nevents);
     559             : #elif defined(WAIT_USE_POLL)
     560             :     sz += MAXALIGN(sizeof(struct pollfd) * nevents);
     561             : #elif defined(WAIT_USE_WIN32)
     562             :     /* need space for the pgwin32_signal_event */
     563             :     sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1));
     564             : #endif
     565             : 
     566      843310 :     data = (char *) MemoryContextAllocZero(context, sz);
     567             : 
     568      843310 :     set = (WaitEventSet *) data;
     569      843310 :     data += MAXALIGN(sizeof(WaitEventSet));
     570             : 
     571      843310 :     set->events = (WaitEvent *) data;
     572      843310 :     data += MAXALIGN(sizeof(WaitEvent) * nevents);
     573             : 
     574             : #if defined(WAIT_USE_EPOLL)
     575      843310 :     set->epoll_ret_events = (struct epoll_event *) data;
     576      843310 :     data += MAXALIGN(sizeof(struct epoll_event) * nevents);
     577             : #elif defined(WAIT_USE_POLL)
     578             :     set->pollfds = (struct pollfd *) data;
     579             :     data += MAXALIGN(sizeof(struct pollfd) * nevents);
     580             : #elif defined(WAIT_USE_WIN32)
     581             :     set->handles = (HANDLE) data;
     582             :     data += MAXALIGN(sizeof(HANDLE) * nevents);
     583             : #endif
     584             : 
     585      843310 :     set->latch = NULL;
     586      843310 :     set->nevents_space = nevents;
     587      843310 :     set->exit_on_postmaster_death = false;
     588             : 
     589             : #if defined(WAIT_USE_EPOLL)
     590             : #ifdef EPOLL_CLOEXEC
     591      843310 :     set->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
     592      843310 :     if (set->epoll_fd < 0)
     593           0 :         elog(ERROR, "epoll_create1 failed: %m");
     594             : #else
     595             :     /* cope with ancient glibc lacking epoll_create1 (e.g., RHEL5) */
     596             :     set->epoll_fd = epoll_create(nevents);
     597             :     if (set->epoll_fd < 0)
     598             :         elog(ERROR, "epoll_create failed: %m");
     599             :     if (fcntl(set->epoll_fd, F_SETFD, FD_CLOEXEC) == -1)
     600             :         elog(ERROR, "fcntl(F_SETFD) failed on epoll descriptor: %m");
     601             : #endif                          /* EPOLL_CLOEXEC */
     602             : #elif defined(WAIT_USE_WIN32)
     603             : 
     604             :     /*
     605             :      * To handle signals while waiting, we need to add a win32 specific event.
     606             :      * We accounted for the additional event at the top of this routine. See
     607             :      * port/win32/signal.c for more details.
     608             :      *
     609             :      * Note: pgwin32_signal_event should be first to ensure that it will be
     610             :      * reported when multiple events are set.  We want to guarantee that
     611             :      * pending signals are serviced.
     612             :      */
     613             :     set->handles[0] = pgwin32_signal_event;
     614             :     StaticAssertStmt(WSA_INVALID_EVENT == NULL, "");
     615             : #endif
     616             : 
     617      843310 :     return set;
     618             : }
     619             : 
     620             : /*
     621             :  * Free a previously created WaitEventSet.
     622             :  *
     623             :  * Note: preferably, this shouldn't have to free any resources that could be
     624             :  * inherited across an exec().  If it did, we'd likely leak those resources in
     625             :  * many scenarios.  For the epoll case, we ensure that by setting FD_CLOEXEC
     626             :  * when the FD is created.  For the Windows case, we assume that the handles
     627             :  * involved are non-inheritable.
     628             :  */
     629             : void
     630      835718 : FreeWaitEventSet(WaitEventSet *set)
     631             : {
     632             : #if defined(WAIT_USE_EPOLL)
     633      835718 :     close(set->epoll_fd);
     634             : #elif defined(WAIT_USE_WIN32)
     635             :     WaitEvent  *cur_event;
     636             : 
     637             :     for (cur_event = set->events;
     638             :          cur_event < (set->events + set->nevents);
     639             :          cur_event++)
     640             :     {
     641             :         if (cur_event->events & WL_LATCH_SET)
     642             :         {
     643             :             /* uses the latch's HANDLE */
     644             :         }
     645             :         else if (cur_event->events & WL_POSTMASTER_DEATH)
     646             :         {
     647             :             /* uses PostmasterHandle */
     648             :         }
     649             :         else
     650             :         {
     651             :             /* Clean up the event object we created for the socket */
     652             :             WSAEventSelect(cur_event->fd, NULL, 0);
     653             :             WSACloseEvent(set->handles[cur_event->pos + 1]);
     654             :         }
     655             :     }
     656             : #endif
     657             : 
     658      835718 :     pfree(set);
     659      835718 : }
     660             : 
     661             : /* ---
     662             :  * Add an event to the set. Possible events are:
     663             :  * - WL_LATCH_SET: Wait for the latch to be set
     664             :  * - WL_POSTMASTER_DEATH: Wait for postmaster to die
     665             :  * - WL_SOCKET_READABLE: Wait for socket to become readable,
     666             :  *   can be combined in one event with other WL_SOCKET_* events
     667             :  * - WL_SOCKET_WRITEABLE: Wait for socket to become writeable,
     668             :  *   can be combined with other WL_SOCKET_* events
     669             :  * - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
     670             :  *   can be combined with other WL_SOCKET_* events (on non-Windows
     671             :  *   platforms, this is the same as WL_SOCKET_WRITEABLE)
     672             :  * - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies
     673             :  *
     674             :  * Returns the offset in WaitEventSet->events (starting from 0), which can be
     675             :  * used to modify previously added wait events using ModifyWaitEvent().
     676             :  *
     677             :  * In the WL_LATCH_SET case the latch must be owned by the current process,
     678             :  * i.e. it must be a process-local latch initialized with InitLatch, or a
     679             :  * shared latch associated with the current process by calling OwnLatch.
     680             :  *
     681             :  * In the WL_SOCKET_READABLE/WRITEABLE/CONNECTED cases, EOF and error
     682             :  * conditions cause the socket to be reported as readable/writable/connected,
     683             :  * so that the caller can deal with the condition.
     684             :  *
     685             :  * The user_data pointer specified here will be set for the events returned
     686             :  * by WaitEventSetWait(), allowing to easily associate additional data with
     687             :  * events.
     688             :  */
     689             : int
     690     1780658 : AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
     691             :                   void *user_data)
     692             : {
     693             :     WaitEvent  *event;
     694             : 
     695             :     /* not enough space */
     696             :     Assert(set->nevents < set->nevents_space);
     697             : 
     698     1780658 :     if (events == WL_EXIT_ON_PM_DEATH)
     699             :     {
     700      769526 :         events = WL_POSTMASTER_DEATH;
     701      769526 :         set->exit_on_postmaster_death = true;
     702             :     }
     703             : 
     704     1780658 :     if (latch)
     705             :     {
     706      843256 :         if (latch->owner_pid != MyProcPid)
     707           0 :             elog(ERROR, "cannot wait on a latch owned by another process");
     708      843256 :         if (set->latch)
     709           0 :             elog(ERROR, "cannot wait on more than one latch");
     710      843256 :         if ((events & WL_LATCH_SET) != WL_LATCH_SET)
     711           0 :             elog(ERROR, "latch events only support being set");
     712             :     }
     713             :     else
     714             :     {
     715      937402 :         if (events & WL_LATCH_SET)
     716           0 :             elog(ERROR, "cannot wait on latch without a specified latch");
     717             :     }
     718             : 
     719             :     /* waiting for socket readiness without a socket indicates a bug */
     720     1780658 :     if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK))
     721           0 :         elog(ERROR, "cannot wait on socket event without a socket");
     722             : 
     723     1780658 :     event = &set->events[set->nevents];
     724     1780658 :     event->pos = set->nevents++;
     725     1780658 :     event->fd = fd;
     726     1780658 :     event->events = events;
     727     1780658 :     event->user_data = user_data;
     728             : #ifdef WIN32
     729             :     event->reset = false;
     730             : #endif
     731             : 
     732     1780658 :     if (events == WL_LATCH_SET)
     733             :     {
     734      843256 :         set->latch = latch;
     735      843256 :         set->latch_pos = event->pos;
     736             : #ifndef WIN32
     737      843256 :         event->fd = selfpipe_readfd;
     738             : #endif
     739             :     }
     740      937402 :     else if (events == WL_POSTMASTER_DEATH)
     741             :     {
     742             : #ifndef WIN32
     743      843308 :         event->fd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
     744             : #endif
     745             :     }
     746             : 
     747             :     /* perform wait primitive specific initialization, if needed */
     748             : #if defined(WAIT_USE_EPOLL)
     749     1780658 :     WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
     750             : #elif defined(WAIT_USE_POLL)
     751             :     WaitEventAdjustPoll(set, event);
     752             : #elif defined(WAIT_USE_WIN32)
     753             :     WaitEventAdjustWin32(set, event);
     754             : #endif
     755             : 
     756     1780658 :     return event->pos;
     757             : }
     758             : 
     759             : /*
     760             :  * Change the event mask and, in the WL_LATCH_SET case, the latch associated
     761             :  * with the WaitEvent.
     762             :  *
     763             :  * 'pos' is the id returned by AddWaitEventToSet.
     764             :  */
     765             : void
     766      179948 : ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
     767             : {
     768             :     WaitEvent  *event;
     769             : 
     770             :     Assert(pos < set->nevents);
     771             : 
     772      179948 :     event = &set->events[pos];
     773             : 
     774             :     /*
     775             :      * If neither the event mask nor the associated latch changes, return
     776             :      * early. That's an important optimization for some sockets, where
     777             :      * ModifyWaitEvent is frequently used to switch from waiting for reads to
     778             :      * waiting on writes.
     779             :      */
     780      355118 :     if (events == event->events &&
     781      188626 :         (!(event->events & WL_LATCH_SET) || set->latch == latch))
     782      161714 :         return;
     783             : 
     784       31690 :     if (event->events & WL_LATCH_SET &&
     785       13456 :         events != event->events)
     786             :     {
     787             :         /* we could allow to disable latch events for a while */
     788           0 :         elog(ERROR, "cannot modify latch event");
     789             :     }
     790             : 
     791       18234 :     if (event->events & WL_POSTMASTER_DEATH)
     792             :     {
     793           0 :         elog(ERROR, "cannot modify postmaster death event");
     794             :     }
     795             : 
     796             :     /* FIXME: validate event mask */
     797       18234 :     event->events = events;
     798             : 
     799       18234 :     if (events == WL_LATCH_SET)
     800             :     {
     801       13456 :         set->latch = latch;
     802             :     }
     803             : 
     804             : #if defined(WAIT_USE_EPOLL)
     805       18234 :     WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
     806             : #elif defined(WAIT_USE_POLL)
     807             :     WaitEventAdjustPoll(set, event);
     808             : #elif defined(WAIT_USE_WIN32)
     809             :     WaitEventAdjustWin32(set, event);
     810             : #endif
     811             : }
     812             : 
     813             : #if defined(WAIT_USE_EPOLL)
     814             : /*
     815             :  * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
     816             :  */
     817             : static void
     818     1798892 : WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
     819             : {
     820             :     struct epoll_event epoll_ev;
     821             :     int         rc;
     822             : 
     823             :     /* pointer to our event, returned by epoll_wait */
     824     1798892 :     epoll_ev.data.ptr = event;
     825             :     /* always wait for errors */
     826     1798892 :     epoll_ev.events = EPOLLERR | EPOLLHUP;
     827             : 
     828             :     /* prepare pollfd entry once */
     829     1798892 :     if (event->events == WL_LATCH_SET)
     830             :     {
     831             :         Assert(set->latch != NULL);
     832      856712 :         epoll_ev.events |= EPOLLIN;
     833             :     }
     834      942180 :     else if (event->events == WL_POSTMASTER_DEATH)
     835             :     {
     836      843308 :         epoll_ev.events |= EPOLLIN;
     837             :     }
     838             :     else
     839             :     {
     840             :         Assert(event->fd != PGINVALID_SOCKET);
     841             :         Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
     842             : 
     843       98872 :         if (event->events & WL_SOCKET_READABLE)
     844       91472 :             epoll_ev.events |= EPOLLIN;
     845       98872 :         if (event->events & WL_SOCKET_WRITEABLE)
     846        9206 :             epoll_ev.events |= EPOLLOUT;
     847             :     }
     848             : 
     849             :     /*
     850             :      * Even though unused, we also pass epoll_ev as the data argument if
     851             :      * EPOLL_CTL_DEL is passed as action.  There used to be an epoll bug
     852             :      * requiring that, and actually it makes the code simpler...
     853             :      */
     854     1798892 :     rc = epoll_ctl(set->epoll_fd, action, event->fd, &epoll_ev);
     855             : 
     856     1798892 :     if (rc < 0)
     857           0 :         ereport(ERROR,
     858             :                 (errcode_for_socket_access(),
     859             :         /* translator: %s is a syscall name, such as "poll()" */
     860             :                  errmsg("%s failed: %m",
     861             :                         "epoll_ctl()")));
     862     1798892 : }
     863             : #endif
     864             : 
     865             : #if defined(WAIT_USE_POLL)
     866             : static void
     867             : WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
     868             : {
     869             :     struct pollfd *pollfd = &set->pollfds[event->pos];
     870             : 
     871             :     pollfd->revents = 0;
     872             :     pollfd->fd = event->fd;
     873             : 
     874             :     /* prepare pollfd entry once */
     875             :     if (event->events == WL_LATCH_SET)
     876             :     {
     877             :         Assert(set->latch != NULL);
     878             :         pollfd->events = POLLIN;
     879             :     }
     880             :     else if (event->events == WL_POSTMASTER_DEATH)
     881             :     {
     882             :         pollfd->events = POLLIN;
     883             :     }
     884             :     else
     885             :     {
     886             :         Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
     887             :         pollfd->events = 0;
     888             :         if (event->events & WL_SOCKET_READABLE)
     889             :             pollfd->events |= POLLIN;
     890             :         if (event->events & WL_SOCKET_WRITEABLE)
     891             :             pollfd->events |= POLLOUT;
     892             :     }
     893             : 
     894             :     Assert(event->fd != PGINVALID_SOCKET);
     895             : }
     896             : #endif
     897             : 
     898             : #if defined(WAIT_USE_WIN32)
     899             : static void
     900             : WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
     901             : {
     902             :     HANDLE     *handle = &set->handles[event->pos + 1];
     903             : 
     904             :     if (event->events == WL_LATCH_SET)
     905             :     {
     906             :         Assert(set->latch != NULL);
     907             :         *handle = set->latch->event;
     908             :     }
     909             :     else if (event->events == WL_POSTMASTER_DEATH)
     910             :     {
     911             :         *handle = PostmasterHandle;
     912             :     }
     913             :     else
     914             :     {
     915             :         int         flags = FD_CLOSE;   /* always check for errors/EOF */
     916             : 
     917             :         if (event->events & WL_SOCKET_READABLE)
     918             :             flags |= FD_READ;
     919             :         if (event->events & WL_SOCKET_WRITEABLE)
     920             :             flags |= FD_WRITE;
     921             :         if (event->events & WL_SOCKET_CONNECTED)
     922             :             flags |= FD_CONNECT;
     923             : 
     924             :         if (*handle == WSA_INVALID_EVENT)
     925             :         {
     926             :             *handle = WSACreateEvent();
     927             :             if (*handle == WSA_INVALID_EVENT)
     928             :                 elog(ERROR, "failed to create event for socket: error code %u",
     929             :                      WSAGetLastError());
     930             :         }
     931             :         if (WSAEventSelect(event->fd, *handle, flags) != 0)
     932             :             elog(ERROR, "failed to set up event for socket: error code %u",
     933             :                  WSAGetLastError());
     934             : 
     935             :         Assert(event->fd != PGINVALID_SOCKET);
     936             :     }
     937             : }
     938             : #endif
     939             : 
     940             : /*
     941             :  * Wait for events added to the set to happen, or until the timeout is
     942             :  * reached.  At most nevents occurred events are returned.
     943             :  *
     944             :  * If timeout = -1, block until an event occurs; if 0, check sockets for
     945             :  * readiness, but don't block; if > 0, block for at most timeout milliseconds.
     946             :  *
     947             :  * Returns the number of events occurred, or 0 if the timeout was reached.
     948             :  *
     949             :  * Returned events will have the fd, pos, user_data fields set to the
     950             :  * values associated with the registered event.
     951             :  */
     952             : int
     953     1003478 : WaitEventSetWait(WaitEventSet *set, long timeout,
     954             :                  WaitEvent *occurred_events, int nevents,
     955             :                  uint32 wait_event_info)
     956             : {
     957     1003478 :     int         returned_events = 0;
     958             :     instr_time  start_time;
     959             :     instr_time  cur_time;
     960     1003478 :     long        cur_timeout = -1;
     961             : 
     962             :     Assert(nevents > 0);
     963             : 
     964             :     /*
     965             :      * Initialize timeout if requested.  We must record the current time so
     966             :      * that we can determine the remaining timeout if interrupted.
     967             :      */
     968     1003478 :     if (timeout >= 0)
     969             :     {
     970       41796 :         INSTR_TIME_SET_CURRENT(start_time);
     971             :         Assert(timeout >= 0 && timeout <= INT_MAX);
     972       41796 :         cur_timeout = timeout;
     973             :     }
     974             : 
     975     1003478 :     pgstat_report_wait_start(wait_event_info);
     976             : 
     977             : #ifndef WIN32
     978     1003478 :     waiting = true;
     979             : #else
     980             :     /* Ensure that signals are serviced even if latch is already set */
     981             :     pgwin32_dispatch_queued_signals();
     982             : #endif
     983     3566066 :     while (returned_events == 0)
     984             :     {
     985             :         int         rc;
     986             : 
     987             :         /*
     988             :          * Check if the latch is set already. If so, leave the loop
     989             :          * immediately, avoid blocking again. We don't attempt to report any
     990             :          * other events that might also be satisfied.
     991             :          *
     992             :          * If someone sets the latch between this and the
     993             :          * WaitEventSetWaitBlock() below, the setter will write a byte to the
     994             :          * pipe (or signal us and the signal handler will do that), and the
     995             :          * readiness routine will return immediately.
     996             :          *
     997             :          * On unix, If there's a pending byte in the self pipe, we'll notice
     998             :          * whenever blocking. Only clearing the pipe in that case avoids
     999             :          * having to drain it every time WaitLatchOrSocket() is used. Should
    1000             :          * the pipe-buffer fill up we're still ok, because the pipe is in
    1001             :          * nonblocking mode. It's unlikely for that to happen, because the
    1002             :          * self pipe isn't filled unless we're blocking (waiting = true), or
    1003             :          * from inside a signal handler in latch_sigusr1_handler().
    1004             :          *
    1005             :          * On windows, we'll also notice if there's a pending event for the
    1006             :          * latch when blocking, but there's no danger of anything filling up,
    1007             :          * as "Setting an event that is already set has no effect.".
    1008             :          *
    1009             :          * Note: we assume that the kernel calls involved in latch management
    1010             :          * will provide adequate synchronization on machines with weak memory
    1011             :          * ordering, so that we cannot miss seeing is_set if a notification
    1012             :          * has already been queued.
    1013             :          */
    1014     2318726 :         if (set->latch && set->latch->is_set)
    1015             :         {
    1016      742020 :             occurred_events->fd = PGINVALID_SOCKET;
    1017      742020 :             occurred_events->pos = set->latch_pos;
    1018      742020 :             occurred_events->user_data =
    1019      742020 :                 set->events[set->latch_pos].user_data;
    1020      742020 :             occurred_events->events = WL_LATCH_SET;
    1021      742020 :             occurred_events++;
    1022      742020 :             returned_events++;
    1023             : 
    1024      742020 :             break;
    1025             :         }
    1026             : 
    1027             :         /*
    1028             :          * Wait for events using the readiness primitive chosen at the top of
    1029             :          * this file. If -1 is returned, a timeout has occurred, if 0 we have
    1030             :          * to retry, everything >= 1 is the number of returned events.
    1031             :          */
    1032     1576706 :         rc = WaitEventSetWaitBlock(set, cur_timeout,
    1033             :                                    occurred_events, nevents);
    1034             : 
    1035     1576666 :         if (rc == -1)
    1036       17552 :             break;              /* timeout occurred */
    1037             :         else
    1038     1559114 :             returned_events = rc;
    1039             : 
    1040             :         /* If we're not done, update cur_timeout for next iteration */
    1041     1559114 :         if (returned_events == 0 && timeout >= 0)
    1042             :         {
    1043       29610 :             INSTR_TIME_SET_CURRENT(cur_time);
    1044       29610 :             INSTR_TIME_SUBTRACT(cur_time, start_time);
    1045       29610 :             cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
    1046       29610 :             if (cur_timeout <= 0)
    1047           4 :                 break;
    1048             :         }
    1049             :     }
    1050             : #ifndef WIN32
    1051     1003438 :     waiting = false;
    1052             : #endif
    1053             : 
    1054     1003438 :     pgstat_report_wait_end();
    1055             : 
    1056     1003438 :     return returned_events;
    1057             : }
    1058             : 
    1059             : 
    1060             : #if defined(WAIT_USE_EPOLL)
    1061             : 
    1062             : /*
    1063             :  * Wait using linux's epoll_wait(2).
    1064             :  *
    1065             :  * This is the preferable wait method, as several readiness notifications are
    1066             :  * delivered, without having to iterate through all of set->events. The return
    1067             :  * epoll_event struct contain a pointer to our events, making association
    1068             :  * easy.
    1069             :  */
    1070             : static inline int
    1071     1576706 : WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
    1072             :                       WaitEvent *occurred_events, int nevents)
    1073             : {
    1074     1576706 :     int         returned_events = 0;
    1075             :     int         rc;
    1076             :     WaitEvent  *cur_event;
    1077             :     struct epoll_event *cur_epoll_event;
    1078             : 
    1079             :     /* Sleep */
    1080     1576706 :     rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,
    1081             :                     nevents, cur_timeout);
    1082             : 
    1083             :     /* Check return code */
    1084     1576696 :     if (rc < 0)
    1085             :     {
    1086             :         /* EINTR is okay, otherwise complain */
    1087      659484 :         if (errno != EINTR)
    1088             :         {
    1089           0 :             waiting = false;
    1090           0 :             ereport(ERROR,
    1091             :                     (errcode_for_socket_access(),
    1092             :             /* translator: %s is a syscall name, such as "poll()" */
    1093             :                      errmsg("%s failed: %m",
    1094             :                             "epoll_wait()")));
    1095             :         }
    1096      659484 :         return 0;
    1097             :     }
    1098      917212 :     else if (rc == 0)
    1099             :     {
    1100             :         /* timeout exceeded */
    1101       17552 :         return -1;
    1102             :     }
    1103             : 
    1104             :     /*
    1105             :      * At least one event occurred, iterate over the returned epoll events
    1106             :      * until they're either all processed, or we've returned all the events
    1107             :      * the caller desired.
    1108             :      */
    1109     2698950 :     for (cur_epoll_event = set->epoll_ret_events;
    1110     2698950 :          cur_epoll_event < (set->epoll_ret_events + rc) &&
    1111             :          returned_events < nevents;
    1112      899630 :          cur_epoll_event++)
    1113             :     {
    1114             :         /* epoll's data pointer is set to the associated WaitEvent */
    1115      899660 :         cur_event = (WaitEvent *) cur_epoll_event->data.ptr;
    1116             : 
    1117      899660 :         occurred_events->pos = cur_event->pos;
    1118      899660 :         occurred_events->user_data = cur_event->user_data;
    1119      899660 :         occurred_events->events = 0;
    1120             : 
    1121     1557122 :         if (cur_event->events == WL_LATCH_SET &&
    1122      657462 :             cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
    1123             :         {
    1124             :             /* There's data in the self-pipe, clear it. */
    1125      657462 :             drainSelfPipe();
    1126             : 
    1127     1314924 :             if (set->latch->is_set)
    1128             :             {
    1129        1694 :                 occurred_events->fd = PGINVALID_SOCKET;
    1130        1694 :                 occurred_events->events = WL_LATCH_SET;
    1131        1694 :                 occurred_events++;
    1132        1694 :                 returned_events++;
    1133             :             }
    1134             :         }
    1135      242234 :         else if (cur_event->events == WL_POSTMASTER_DEATH &&
    1136          36 :                  cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
    1137             :         {
    1138             :             /*
    1139             :              * We expect an EPOLLHUP when the remote end is closed, but
    1140             :              * because we don't expect the pipe to become readable or to have
    1141             :              * any errors either, treat those cases as postmaster death, too.
    1142             :              *
    1143             :              * Be paranoid about a spurious event signalling the postmaster as
    1144             :              * being dead.  There have been reports about that happening with
    1145             :              * older primitives (select(2) to be specific), and a spurious
    1146             :              * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
    1147             :              * cost much.
    1148             :              */
    1149          42 :             if (!PostmasterIsAliveInternal())
    1150             :             {
    1151          36 :                 if (set->exit_on_postmaster_death)
    1152          30 :                     proc_exit(1);
    1153           6 :                 occurred_events->fd = PGINVALID_SOCKET;
    1154           6 :                 occurred_events->events = WL_POSTMASTER_DEATH;
    1155           6 :                 occurred_events++;
    1156           6 :                 returned_events++;
    1157             :             }
    1158             :         }
    1159      242162 :         else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
    1160             :         {
    1161             :             Assert(cur_event->fd != PGINVALID_SOCKET);
    1162             : 
    1163      474962 :             if ((cur_event->events & WL_SOCKET_READABLE) &&
    1164      232800 :                 (cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)))
    1165             :             {
    1166             :                 /* data available in socket, or EOF */
    1167      231092 :                 occurred_events->events |= WL_SOCKET_READABLE;
    1168             :             }
    1169             : 
    1170      253322 :             if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
    1171       11160 :                 (cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)))
    1172             :             {
    1173             :                 /* writable, or EOF */
    1174       11126 :                 occurred_events->events |= WL_SOCKET_WRITEABLE;
    1175             :             }
    1176             : 
    1177      242162 :             if (occurred_events->events != 0)
    1178             :             {
    1179      242162 :                 occurred_events->fd = cur_event->fd;
    1180      242162 :                 occurred_events++;
    1181      242162 :                 returned_events++;
    1182             :             }
    1183             :         }
    1184             :     }
    1185             : 
    1186      899630 :     return returned_events;
    1187             : }
    1188             : 
    1189             : #elif defined(WAIT_USE_POLL)
    1190             : 
    1191             : /*
    1192             :  * Wait using poll(2).
    1193             :  *
    1194             :  * This allows to receive readiness notifications for several events at once,
    1195             :  * but requires iterating through all of set->pollfds.
    1196             :  */
    1197             : static inline int
    1198             : WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
    1199             :                       WaitEvent *occurred_events, int nevents)
    1200             : {
    1201             :     int         returned_events = 0;
    1202             :     int         rc;
    1203             :     WaitEvent  *cur_event;
    1204             :     struct pollfd *cur_pollfd;
    1205             : 
    1206             :     /* Sleep */
    1207             :     rc = poll(set->pollfds, set->nevents, (int) cur_timeout);
    1208             : 
    1209             :     /* Check return code */
    1210             :     if (rc < 0)
    1211             :     {
    1212             :         /* EINTR is okay, otherwise complain */
    1213             :         if (errno != EINTR)
    1214             :         {
    1215             :             waiting = false;
    1216             :             ereport(ERROR,
    1217             :                     (errcode_for_socket_access(),
    1218             :             /* translator: %s is a syscall name, such as "poll()" */
    1219             :                      errmsg("%s failed: %m",
    1220             :                             "poll()")));
    1221             :         }
    1222             :         return 0;
    1223             :     }
    1224             :     else if (rc == 0)
    1225             :     {
    1226             :         /* timeout exceeded */
    1227             :         return -1;
    1228             :     }
    1229             : 
    1230             :     for (cur_event = set->events, cur_pollfd = set->pollfds;
    1231             :          cur_event < (set->events + set->nevents) &&
    1232             :          returned_events < nevents;
    1233             :          cur_event++, cur_pollfd++)
    1234             :     {
    1235             :         /* no activity on this FD, skip */
    1236             :         if (cur_pollfd->revents == 0)
    1237             :             continue;
    1238             : 
    1239             :         occurred_events->pos = cur_event->pos;
    1240             :         occurred_events->user_data = cur_event->user_data;
    1241             :         occurred_events->events = 0;
    1242             : 
    1243             :         if (cur_event->events == WL_LATCH_SET &&
    1244             :             (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
    1245             :         {
    1246             :             /* There's data in the self-pipe, clear it. */
    1247             :             drainSelfPipe();
    1248             : 
    1249             :             if (set->latch->is_set)
    1250             :             {
    1251             :                 occurred_events->fd = PGINVALID_SOCKET;
    1252             :                 occurred_events->events = WL_LATCH_SET;
    1253             :                 occurred_events++;
    1254             :                 returned_events++;
    1255             :             }
    1256             :         }
    1257             :         else if (cur_event->events == WL_POSTMASTER_DEATH &&
    1258             :                  (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
    1259             :         {
    1260             :             /*
    1261             :              * We expect an POLLHUP when the remote end is closed, but because
    1262             :              * we don't expect the pipe to become readable or to have any
    1263             :              * errors either, treat those cases as postmaster death, too.
    1264             :              *
    1265             :              * Be paranoid about a spurious event signalling the postmaster as
    1266             :              * being dead.  There have been reports about that happening with
    1267             :              * older primitives (select(2) to be specific), and a spurious
    1268             :              * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
    1269             :              * cost much.
    1270             :              */
    1271             :             if (!PostmasterIsAliveInternal())
    1272             :             {
    1273             :                 if (set->exit_on_postmaster_death)
    1274             :                     proc_exit(1);
    1275             :                 occurred_events->fd = PGINVALID_SOCKET;
    1276             :                 occurred_events->events = WL_POSTMASTER_DEATH;
    1277             :                 occurred_events++;
    1278             :                 returned_events++;
    1279             :             }
    1280             :         }
    1281             :         else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
    1282             :         {
    1283             :             int         errflags = POLLHUP | POLLERR | POLLNVAL;
    1284             : 
    1285             :             Assert(cur_event->fd >= PGINVALID_SOCKET);
    1286             : 
    1287             :             if ((cur_event->events & WL_SOCKET_READABLE) &&
    1288             :                 (cur_pollfd->revents & (POLLIN | errflags)))
    1289             :             {
    1290             :                 /* data available in socket, or EOF */
    1291             :                 occurred_events->events |= WL_SOCKET_READABLE;
    1292             :             }
    1293             : 
    1294             :             if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
    1295             :                 (cur_pollfd->revents & (POLLOUT | errflags)))
    1296             :             {
    1297             :                 /* writeable, or EOF */
    1298             :                 occurred_events->events |= WL_SOCKET_WRITEABLE;
    1299             :             }
    1300             : 
    1301             :             if (occurred_events->events != 0)
    1302             :             {
    1303             :                 occurred_events->fd = cur_event->fd;
    1304             :                 occurred_events++;
    1305             :                 returned_events++;
    1306             :             }
    1307             :         }
    1308             :     }
    1309             :     return returned_events;
    1310             : }
    1311             : 
    1312             : #elif defined(WAIT_USE_WIN32)
    1313             : 
    1314             : /*
    1315             :  * Wait using Windows' WaitForMultipleObjects().
    1316             :  *
    1317             :  * Unfortunately this will only ever return a single readiness notification at
    1318             :  * a time.  Note that while the official documentation for
    1319             :  * WaitForMultipleObjects is ambiguous about multiple events being "consumed"
    1320             :  * with a single bWaitAll = FALSE call,
    1321             :  * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273 confirms
    1322             :  * that only one event is "consumed".
    1323             :  */
    1324             : static inline int
    1325             : WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
    1326             :                       WaitEvent *occurred_events, int nevents)
    1327             : {
    1328             :     int         returned_events = 0;
    1329             :     DWORD       rc;
    1330             :     WaitEvent  *cur_event;
    1331             : 
    1332             :     /* Reset any wait events that need it */
    1333             :     for (cur_event = set->events;
    1334             :          cur_event < (set->events + set->nevents);
    1335             :          cur_event++)
    1336             :     {
    1337             :         if (cur_event->reset)
    1338             :         {
    1339             :             WaitEventAdjustWin32(set, cur_event);
    1340             :             cur_event->reset = false;
    1341             :         }
    1342             : 
    1343             :         /*
    1344             :          * Windows does not guarantee to log an FD_WRITE network event
    1345             :          * indicating that more data can be sent unless the previous send()
    1346             :          * failed with WSAEWOULDBLOCK.  While our caller might well have made
    1347             :          * such a call, we cannot assume that here.  Therefore, if waiting for
    1348             :          * write-ready, force the issue by doing a dummy send().  If the dummy
    1349             :          * send() succeeds, assume that the socket is in fact write-ready, and
    1350             :          * return immediately.  Also, if it fails with something other than
    1351             :          * WSAEWOULDBLOCK, return a write-ready indication to let our caller
    1352             :          * deal with the error condition.
    1353             :          */
    1354             :         if (cur_event->events & WL_SOCKET_WRITEABLE)
    1355             :         {
    1356             :             char        c;
    1357             :             WSABUF      buf;
    1358             :             DWORD       sent;
    1359             :             int         r;
    1360             : 
    1361             :             buf.buf = &c;
    1362             :             buf.len = 0;
    1363             : 
    1364             :             r = WSASend(cur_event->fd, &buf, 1, &sent, 0, NULL, NULL);
    1365             :             if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
    1366             :             {
    1367             :                 occurred_events->pos = cur_event->pos;
    1368             :                 occurred_events->user_data = cur_event->user_data;
    1369             :                 occurred_events->events = WL_SOCKET_WRITEABLE;
    1370             :                 occurred_events->fd = cur_event->fd;
    1371             :                 return 1;
    1372             :             }
    1373             :         }
    1374             :     }
    1375             : 
    1376             :     /*
    1377             :      * Sleep.
    1378             :      *
    1379             :      * Need to wait for ->nevents + 1, because signal handle is in [0].
    1380             :      */
    1381             :     rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
    1382             :                                 cur_timeout);
    1383             : 
    1384             :     /* Check return code */
    1385             :     if (rc == WAIT_FAILED)
    1386             :         elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",
    1387             :              GetLastError());
    1388             :     else if (rc == WAIT_TIMEOUT)
    1389             :     {
    1390             :         /* timeout exceeded */
    1391             :         return -1;
    1392             :     }
    1393             : 
    1394             :     if (rc == WAIT_OBJECT_0)
    1395             :     {
    1396             :         /* Service newly-arrived signals */
    1397             :         pgwin32_dispatch_queued_signals();
    1398             :         return 0;               /* retry */
    1399             :     }
    1400             : 
    1401             :     /*
    1402             :      * With an offset of one, due to the always present pgwin32_signal_event,
    1403             :      * the handle offset directly corresponds to a wait event.
    1404             :      */
    1405             :     cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1];
    1406             : 
    1407             :     occurred_events->pos = cur_event->pos;
    1408             :     occurred_events->user_data = cur_event->user_data;
    1409             :     occurred_events->events = 0;
    1410             : 
    1411             :     if (cur_event->events == WL_LATCH_SET)
    1412             :     {
    1413             :         if (!ResetEvent(set->latch->event))
    1414             :             elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());
    1415             : 
    1416             :         if (set->latch->is_set)
    1417             :         {
    1418             :             occurred_events->fd = PGINVALID_SOCKET;
    1419             :             occurred_events->events = WL_LATCH_SET;
    1420             :             occurred_events++;
    1421             :             returned_events++;
    1422             :         }
    1423             :     }
    1424             :     else if (cur_event->events == WL_POSTMASTER_DEATH)
    1425             :     {
    1426             :         /*
    1427             :          * Postmaster apparently died.  Since the consequences of falsely
    1428             :          * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we take
    1429             :          * the trouble to positively verify this with PostmasterIsAlive(),
    1430             :          * even though there is no known reason to think that the event could
    1431             :          * be falsely set on Windows.
    1432             :          */
    1433             :         if (!PostmasterIsAliveInternal())
    1434             :         {
    1435             :             if (set->exit_on_postmaster_death)
    1436             :                 proc_exit(1);
    1437             :             occurred_events->fd = PGINVALID_SOCKET;
    1438             :             occurred_events->events = WL_POSTMASTER_DEATH;
    1439             :             occurred_events++;
    1440             :             returned_events++;
    1441             :         }
    1442             :     }
    1443             :     else if (cur_event->events & WL_SOCKET_MASK)
    1444             :     {
    1445             :         WSANETWORKEVENTS resEvents;
    1446             :         HANDLE      handle = set->handles[cur_event->pos + 1];
    1447             : 
    1448             :         Assert(cur_event->fd);
    1449             : 
    1450             :         occurred_events->fd = cur_event->fd;
    1451             : 
    1452             :         ZeroMemory(&resEvents, sizeof(resEvents));
    1453             :         if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0)
    1454             :             elog(ERROR, "failed to enumerate network events: error code %u",
    1455             :                  WSAGetLastError());
    1456             :         if ((cur_event->events & WL_SOCKET_READABLE) &&
    1457             :             (resEvents.lNetworkEvents & FD_READ))
    1458             :         {
    1459             :             /* data available in socket */
    1460             :             occurred_events->events |= WL_SOCKET_READABLE;
    1461             : 
    1462             :             /*------
    1463             :              * WaitForMultipleObjects doesn't guarantee that a read event will
    1464             :              * be returned if the latch is set at the same time.  Even if it
    1465             :              * did, the caller might drop that event expecting it to reoccur
    1466             :              * on next call.  So, we must force the event to be reset if this
    1467             :              * WaitEventSet is used again in order to avoid an indefinite
    1468             :              * hang.  Refer https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx
    1469             :              * for the behavior of socket events.
    1470             :              *------
    1471             :              */
    1472             :             cur_event->reset = true;
    1473             :         }
    1474             :         if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
    1475             :             (resEvents.lNetworkEvents & FD_WRITE))
    1476             :         {
    1477             :             /* writeable */
    1478             :             occurred_events->events |= WL_SOCKET_WRITEABLE;
    1479             :         }
    1480             :         if ((cur_event->events & WL_SOCKET_CONNECTED) &&
    1481             :             (resEvents.lNetworkEvents & FD_CONNECT))
    1482             :         {
    1483             :             /* connected */
    1484             :             occurred_events->events |= WL_SOCKET_CONNECTED;
    1485             :         }
    1486             :         if (resEvents.lNetworkEvents & FD_CLOSE)
    1487             :         {
    1488             :             /* EOF/error, so signal all caller-requested socket flags */
    1489             :             occurred_events->events |= (cur_event->events & WL_SOCKET_MASK);
    1490             :         }
    1491             : 
    1492             :         if (occurred_events->events != 0)
    1493             :         {
    1494             :             occurred_events++;
    1495             :             returned_events++;
    1496             :         }
    1497             :     }
    1498             : 
    1499             :     return returned_events;
    1500             : }
    1501             : #endif
    1502             : 
    1503             : /*
    1504             :  * SetLatch uses SIGUSR1 to wake up the process waiting on the latch.
    1505             :  *
    1506             :  * Wake up WaitLatch, if we're waiting.  (We might not be, since SIGUSR1 is
    1507             :  * overloaded for multiple purposes; or we might not have reached WaitLatch
    1508             :  * yet, in which case we don't need to fill the pipe either.)
    1509             :  *
    1510             :  * NB: when calling this in a signal handler, be sure to save and restore
    1511             :  * errno around it.
    1512             :  */
    1513             : #ifndef WIN32
    1514             : void
    1515      733258 : latch_sigusr1_handler(void)
    1516             : {
    1517      733258 :     if (waiting)
    1518      657260 :         sendSelfPipeByte();
    1519      733258 : }
    1520             : #endif                          /* !WIN32 */
    1521             : 
    1522             : /* Send one byte to the self-pipe, to wake up WaitLatch */
    1523             : #ifndef WIN32
    1524             : static void
    1525      668332 : sendSelfPipeByte(void)
    1526             : {
    1527             :     int         rc;
    1528      668332 :     char        dummy = 0;
    1529             : 
    1530             : retry:
    1531      668332 :     rc = write(selfpipe_writefd, &dummy, 1);
    1532      668332 :     if (rc < 0)
    1533             :     {
    1534             :         /* If interrupted by signal, just retry */
    1535           0 :         if (errno == EINTR)
    1536           0 :             goto retry;
    1537             : 
    1538             :         /*
    1539             :          * If the pipe is full, we don't need to retry, the data that's there
    1540             :          * already is enough to wake up WaitLatch.
    1541             :          */
    1542           0 :         if (errno == EAGAIN || errno == EWOULDBLOCK)
    1543           0 :             return;
    1544             : 
    1545             :         /*
    1546             :          * Oops, the write() failed for some other reason. We might be in a
    1547             :          * signal handler, so it's not safe to elog(). We have no choice but
    1548             :          * silently ignore the error.
    1549             :          */
    1550           0 :         return;
    1551             :     }
    1552             : }
    1553             : #endif                          /* !WIN32 */
    1554             : 
    1555             : /*
    1556             :  * Read all available data from the self-pipe
    1557             :  *
    1558             :  * Note: this is only called when waiting = true.  If it fails and doesn't
    1559             :  * return, it must reset that flag first (though ideally, this will never
    1560             :  * happen).
    1561             :  */
    1562             : #ifndef WIN32
    1563             : static void
    1564      657462 : drainSelfPipe(void)
    1565             : {
    1566             :     /*
    1567             :      * There shouldn't normally be more than one byte in the pipe, or maybe a
    1568             :      * few bytes if multiple processes run SetLatch at the same instant.
    1569             :      */
    1570             :     char        buf[16];
    1571             :     int         rc;
    1572             : 
    1573             :     for (;;)
    1574             :     {
    1575      657462 :         rc = read(selfpipe_readfd, buf, sizeof(buf));
    1576      657462 :         if (rc < 0)
    1577             :         {
    1578           0 :             if (errno == EAGAIN || errno == EWOULDBLOCK)
    1579             :                 break;          /* the pipe is empty */
    1580           0 :             else if (errno == EINTR)
    1581           0 :                 continue;       /* retry */
    1582             :             else
    1583             :             {
    1584           0 :                 waiting = false;
    1585           0 :                 elog(ERROR, "read() on self-pipe failed: %m");
    1586             :             }
    1587             :         }
    1588      657462 :         else if (rc == 0)
    1589             :         {
    1590           0 :             waiting = false;
    1591           0 :             elog(ERROR, "unexpected EOF on self-pipe");
    1592             :         }
    1593      657462 :         else if (rc < sizeof(buf))
    1594             :         {
    1595             :             /* we successfully drained the pipe; no need to read() again */
    1596      657462 :             break;
    1597             :         }
    1598             :         /* else buffer wasn't big enough, so read again */
    1599             :     }
    1600      657462 : }
    1601             : #endif                          /* !WIN32 */

Generated by: LCOV version 1.13