Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * latch.c
4 : * Routines for inter-process latches
5 : *
6 : * The poll() implementation uses the so-called self-pipe trick to overcome the
7 : * race condition involved with poll() and setting a global flag in the signal
8 : * handler. When a latch is set and the current process is waiting for it, the
9 : * signal handler wakes up the poll() in WaitLatch by writing a byte to a pipe.
10 : * A signal by itself doesn't interrupt poll() on all platforms, and even on
11 : * platforms where it does, a signal that arrives just before the poll() call
12 : * does not prevent poll() from entering sleep. An incoming byte on a pipe
13 : * however reliably interrupts the sleep, and causes poll() to return
14 : * immediately even if the signal arrives before poll() begins.
15 : *
16 : * The epoll() implementation overcomes the race with a different technique: it
17 : * keeps SIGURG blocked and consumes from a signalfd() descriptor instead. We
18 : * don't need to register a signal handler or create our own self-pipe. We
19 : * assume that any system that has Linux epoll() also has Linux signalfd().
20 : *
21 : * The kqueue() implementation waits for SIGURG with EVFILT_SIGNAL.
22 : *
23 : * The Windows implementation uses Windows events that are inherited by all
24 : * postmaster child processes. There's no need for the self-pipe trick there.
25 : *
26 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
27 : * Portions Copyright (c) 1994, Regents of the University of California
28 : *
29 : * IDENTIFICATION
30 : * src/backend/storage/ipc/latch.c
31 : *
32 : *-------------------------------------------------------------------------
33 : */
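
/*
 * Editor's sketch (not part of latch.c): a stand-alone demonstration of
 * the self-pipe trick described in the header comment above.  A signal
 * handler writes one byte to a non-blocking pipe; poll() on the read end
 * then wakes reliably even though the signal was raised before poll() was
 * entered.  Demo code with hypothetical names, not a PostgreSQL API.
 */
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static int	wakeup_pipe[2];

static void
demo_sigurg_handler(int signo)
{
	int			save_errno = errno; /* preserve errno, as SetLatch() warns */
	char		dummy = 0;

	(void) signo;
	/* Non-blocking write; if the pipe is full, a wakeup is already queued. */
	(void) write(wakeup_pipe[1], &dummy, 1);
	errno = save_errno;
}

int
main(void)
{
	struct pollfd pfd;
	char		buf[16];

	(void) pipe(wakeup_pipe);
	(void) fcntl(wakeup_pipe[0], F_SETFL, O_NONBLOCK);
	(void) fcntl(wakeup_pipe[1], F_SETFL, O_NONBLOCK);
	signal(SIGURG, demo_sigurg_handler);

	raise(SIGURG);				/* the signal "arrives" before poll() */

	pfd.fd = wakeup_pipe[0];
	pfd.events = POLLIN;
	if (poll(&pfd, 1, 1000) > 0)
	{
		/* Drain until EAGAIN, mirroring what drain() does in this file. */
		while (read(wakeup_pipe[0], buf, sizeof(buf)) > 0)
			;
		printf("woken by self-pipe byte\n");
	}
	return 0;
}
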
34 : #include "postgres.h"
35 :
36 : #include <fcntl.h>
37 : #include <limits.h>
38 : #include <signal.h>
39 : #include <unistd.h>
40 : #ifdef HAVE_SYS_EPOLL_H
41 : #include <sys/epoll.h>
42 : #endif
43 : #ifdef HAVE_SYS_EVENT_H
44 : #include <sys/event.h>
45 : #endif
46 : #ifdef HAVE_SYS_SIGNALFD_H
47 : #include <sys/signalfd.h>
48 : #endif
49 : #ifdef HAVE_POLL_H
50 : #include <poll.h>
51 : #endif
52 :
53 : #include "libpq/pqsignal.h"
54 : #include "miscadmin.h"
55 : #include "pgstat.h"
56 : #include "port/atomics.h"
57 : #include "portability/instr_time.h"
58 : #include "postmaster/postmaster.h"
59 : #include "storage/fd.h"
60 : #include "storage/ipc.h"
61 : #include "storage/latch.h"
62 : #include "storage/pmsignal.h"
63 : #include "utils/memutils.h"
64 : #include "utils/resowner.h"
65 :
66 : /*
67 : * Select the fd readiness primitive to use. Normally the "most modern"
68 : * primitive supported by the OS will be used, but for testing it can be
69 : * useful to manually specify the used primitive. If desired, just add a
70 : * define somewhere before this block.
71 : */
72 : #if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \
73 : defined(WAIT_USE_KQUEUE) || defined(WAIT_USE_WIN32)
74 : /* don't overwrite manual choice */
75 : #elif defined(HAVE_SYS_EPOLL_H)
76 : #define WAIT_USE_EPOLL
77 : #elif defined(HAVE_KQUEUE)
78 : #define WAIT_USE_KQUEUE
79 : #elif defined(HAVE_POLL)
80 : #define WAIT_USE_POLL
81 : #elif WIN32
82 : #define WAIT_USE_WIN32
83 : #else
84 : #error "no wait set implementation available"
85 : #endif
86 :
87 : /*
88 : * By default, we use a self-pipe with poll() and a signalfd with epoll(), if
89 : * available. For testing the choice can also be manually specified.
90 : */
91 : #if defined(WAIT_USE_POLL) || defined(WAIT_USE_EPOLL)
92 : #if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)
93 : /* don't overwrite manual choice */
94 : #elif defined(WAIT_USE_EPOLL) && defined(HAVE_SYS_SIGNALFD_H)
95 : #define WAIT_USE_SIGNALFD
96 : #else
97 : #define WAIT_USE_SELF_PIPE
98 : #endif
99 : #endif
100 :
101 : /* typedef in latch.h */
102 : struct WaitEventSet
103 : {
104 : ResourceOwner owner;
105 :
106 : int nevents; /* number of registered events */
107 : int nevents_space; /* maximum number of events in this set */
108 :
109 : /*
110 : * Array, of nevents_space length, storing the definition of events this
111 : * set is waiting for.
112 : */
113 : WaitEvent *events;
114 :
115 : /*
116 : * If WL_LATCH_SET is specified in any wait event, latch is a pointer to
117 : * said latch, and latch_pos the offset in the ->events array. This is
118 : * useful because we check the state of the latch before performing
119 : * syscalls related to waiting.
120 : */
121 : Latch *latch;
122 : int latch_pos;
123 :
124 : /*
125 : * WL_EXIT_ON_PM_DEATH is converted to WL_POSTMASTER_DEATH, but this flag
126 : * is set so that we'll exit immediately if postmaster death is detected,
127 : * instead of returning.
128 : */
129 : bool exit_on_postmaster_death;
130 :
131 : #if defined(WAIT_USE_EPOLL)
132 : int epoll_fd;
133 : /* epoll_wait returns events in a user-provided array, allocate once */
134 : struct epoll_event *epoll_ret_events;
135 : #elif defined(WAIT_USE_KQUEUE)
136 : int kqueue_fd;
137 : /* kevent returns events in a user-provided array, allocate once */
138 : struct kevent *kqueue_ret_events;
139 : bool report_postmaster_not_running;
140 : #elif defined(WAIT_USE_POLL)
141 : /* poll() expects the full set of events on every call, prepare it once */
142 : struct pollfd *pollfds;
143 : #elif defined(WAIT_USE_WIN32)
144 :
145 : /*
146 : * Array of Windows events. The first element always contains
147 : * pgwin32_signal_event, so the remaining elements are offset by one (i.e.
148 : * event->pos + 1).
149 : */
150 : HANDLE *handles;
151 : #endif
152 : };
153 :
154 : /* A common WaitEventSet used to implement WaitLatch() */
155 : static WaitEventSet *LatchWaitSet;
156 :
157 : /* The position of the latch in LatchWaitSet. */
158 : #define LatchWaitSetLatchPos 0
159 :
160 : #ifndef WIN32
161 : /* Are we currently in WaitLatch? The signal handler would like to know. */
162 : static volatile sig_atomic_t waiting = false;
163 : #endif
164 :
165 : #ifdef WAIT_USE_SIGNALFD
166 : /* On Linux, we'll receive SIGURG via a signalfd file descriptor. */
167 : static int signal_fd = -1;
168 : #endif
169 :
170 : #ifdef WAIT_USE_SELF_PIPE
171 : /* Read and write ends of the self-pipe */
172 : static int selfpipe_readfd = -1;
173 : static int selfpipe_writefd = -1;
174 :
175 : /* Process owning the self-pipe --- needed for checking purposes */
176 : static int selfpipe_owner_pid = 0;
177 :
178 : /* Private function prototypes */
179 : static void latch_sigurg_handler(SIGNAL_ARGS);
180 : static void sendSelfPipeByte(void);
181 : #endif
182 :
183 : #if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)
184 : static void drain(void);
185 : #endif
186 :
187 : #if defined(WAIT_USE_EPOLL)
188 : static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action);
189 : #elif defined(WAIT_USE_KQUEUE)
190 : static void WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events);
191 : #elif defined(WAIT_USE_POLL)
192 : static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event);
193 : #elif defined(WAIT_USE_WIN32)
194 : static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event);
195 : #endif
196 :
197 : static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
198 : WaitEvent *occurred_events, int nevents);
199 :
200 : /* ResourceOwner support to hold WaitEventSets */
201 : static void ResOwnerReleaseWaitEventSet(Datum res);
202 :
203 : static const ResourceOwnerDesc wait_event_set_resowner_desc =
204 : {
205 : .name = "WaitEventSet",
206 : .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
207 : .release_priority = RELEASE_PRIO_WAITEVENTSETS,
208 : .ReleaseResource = ResOwnerReleaseWaitEventSet,
209 : .DebugPrint = NULL
210 : };
211 :
212 : /* Convenience wrappers over ResourceOwnerRemember/Forget */
213 : static inline void
214 79376 : ResourceOwnerRememberWaitEventSet(ResourceOwner owner, WaitEventSet *set)
215 : {
216 79376 : ResourceOwnerRemember(owner, PointerGetDatum(set), &wait_event_set_resowner_desc);
217 79376 : }
218 : static inline void
219 79374 : ResourceOwnerForgetWaitEventSet(ResourceOwner owner, WaitEventSet *set)
220 : {
221 79374 : ResourceOwnerForget(owner, PointerGetDatum(set), &wait_event_set_resowner_desc);
222 79374 : }
223 :
224 :
225 : /*
226 : * Initialize the process-local latch infrastructure.
227 : *
228 : * This must be called once during startup of any process that can wait on
229 : * latches, before it issues any InitLatch() or OwnLatch() calls.
230 : */
231 : void
232 34948 : InitializeLatchSupport(void)
233 : {
234 : #if defined(WAIT_USE_SELF_PIPE)
235 : int pipefd[2];
236 :
237 : if (IsUnderPostmaster)
238 : {
239 : /*
240 : * We might have inherited connections to a self-pipe created by the
241 : * postmaster. It's critical that child processes create their own
242 : * self-pipes, of course, and we really want them to close the
243 : * inherited FDs for safety's sake.
244 : */
245 : if (selfpipe_owner_pid != 0)
246 : {
247 : /* Assert we go through here but once in a child process */
248 : Assert(selfpipe_owner_pid != MyProcPid);
249 : /* Release postmaster's pipe FDs; ignore any error */
250 : (void) close(selfpipe_readfd);
251 : (void) close(selfpipe_writefd);
252 : /* Clean up, just for safety's sake; we'll set these below */
253 : selfpipe_readfd = selfpipe_writefd = -1;
254 : selfpipe_owner_pid = 0;
255 : /* Keep fd.c's accounting straight */
256 : ReleaseExternalFD();
257 : ReleaseExternalFD();
258 : }
259 : else
260 : {
261 : /*
262 : * Postmaster didn't create a self-pipe ... or else we're in an
263 : * EXEC_BACKEND build, in which case it doesn't matter since the
264 : * postmaster's pipe FDs were closed by the action of FD_CLOEXEC.
265 : * fd.c won't have state to clean up, either.
266 : */
267 : Assert(selfpipe_readfd == -1);
268 : }
269 : }
270 : else
271 : {
272 : /* In postmaster or standalone backend, assert we do this but once */
273 : Assert(selfpipe_readfd == -1);
274 : Assert(selfpipe_owner_pid == 0);
275 : }
276 :
277 : /*
278 : * Set up the self-pipe that allows a signal handler to wake up the
279 : * poll()/epoll_wait() in WaitLatch. Make the write-end non-blocking, so
280 : * that SetLatch won't block if the event has already been set many times
281 : * filling the kernel buffer. Make the read-end non-blocking too, so that
282 : * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK.
283 : * Also, make both FDs close-on-exec, since we surely do not want any
284 : * child processes messing with them.
285 : */
286 : if (pipe(pipefd) < 0)
287 : elog(FATAL, "pipe() failed: %m");
288 : if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1)
289 : elog(FATAL, "fcntl(F_SETFL) failed on read-end of self-pipe: %m");
290 : if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1)
291 : elog(FATAL, "fcntl(F_SETFL) failed on write-end of self-pipe: %m");
292 : if (fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) == -1)
293 : elog(FATAL, "fcntl(F_SETFD) failed on read-end of self-pipe: %m");
294 : if (fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) == -1)
295 : elog(FATAL, "fcntl(F_SETFD) failed on write-end of self-pipe: %m");
296 :
297 : selfpipe_readfd = pipefd[0];
298 : selfpipe_writefd = pipefd[1];
299 : selfpipe_owner_pid = MyProcPid;
300 :
301 : /* Tell fd.c about these two long-lived FDs */
302 : ReserveExternalFD();
303 : ReserveExternalFD();
304 :
305 : pqsignal(SIGURG, latch_sigurg_handler);
306 : #endif
307 :
308 : #ifdef WAIT_USE_SIGNALFD
309 : sigset_t signalfd_mask;
310 :
311 34948 : if (IsUnderPostmaster)
312 : {
313 : /*
314 : * It would probably be safe to re-use the inherited signalfd since
315 : * signalfds only see the current process's pending signals, but it
316 : * seems less surprising to close it and create our own.
317 : */
318 32982 : if (signal_fd != -1)
319 : {
320 : /* Release postmaster's signal FD; ignore any error */
321 32982 : (void) close(signal_fd);
322 32982 : signal_fd = -1;
323 32982 : ReleaseExternalFD();
324 : }
325 : }
326 :
327 : /* Block SIGURG, because we'll receive it through a signalfd. */
328 34948 : sigaddset(&UnBlockSig, SIGURG);
329 :
330 : /* Set up the signalfd to receive SIGURG notifications. */
331 34948 : sigemptyset(&signalfd_mask);
332 34948 : sigaddset(&signalfd_mask, SIGURG);
333 34948 : signal_fd = signalfd(-1, &signalfd_mask, SFD_NONBLOCK | SFD_CLOEXEC);
334 34948 : if (signal_fd < 0)
335 0 : elog(FATAL, "signalfd() failed");
336 34948 : ReserveExternalFD();
337 : #endif
338 :
339 : #ifdef WAIT_USE_KQUEUE
340 : /* Ignore SIGURG, because we'll receive it via kqueue. */
341 : pqsignal(SIGURG, SIG_IGN);
342 : #endif
343 34948 : }
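
/*
 * Editor's sketch (not part of latch.c): the signalfd technique used by
 * the epoll implementation above, reduced to a stand-alone Linux program.
 * SIGURG stays blocked and is instead consumed as data from a signalfd,
 * so epoll_wait() observes it like any other readable fd and the wakeup
 * cannot be lost to a signal/poll race.  Hypothetical demo, Linux-only.
 */
#include <signal.h>
#include <stdio.h>
#include <sys/epoll.h>
#include <sys/signalfd.h>
#include <unistd.h>

int
main(void)
{
	sigset_t	mask;
	int			sfd,
				epfd;
	struct epoll_event ev,
				got;
	struct signalfd_siginfo info;

	/* Block SIGURG so it is queued instead of delivered to a handler. */
	sigemptyset(&mask);
	sigaddset(&mask, SIGURG);
	sigprocmask(SIG_BLOCK, &mask, NULL);

	sfd = signalfd(-1, &mask, SFD_NONBLOCK | SFD_CLOEXEC);
	epfd = epoll_create1(EPOLL_CLOEXEC);

	ev.events = EPOLLIN;
	ev.data.fd = sfd;
	epoll_ctl(epfd, EPOLL_CTL_ADD, sfd, &ev);

	raise(SIGURG);				/* the pending signal becomes readable data */

	if (epoll_wait(epfd, &got, 1, 1000) > 0)
	{
		(void) read(sfd, &info, sizeof(info)); /* drain the signalfd */
		printf("SIGURG consumed via signalfd\n");
	}
	return 0;
}
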
344 :
345 : void
346 33414 : InitializeLatchWaitSet(void)
347 : {
348 : int latch_pos PG_USED_FOR_ASSERTS_ONLY;
349 :
350 : Assert(LatchWaitSet == NULL);
351 :
352 : /* Set up the WaitEventSet used by WaitLatch(). */
353 33414 : LatchWaitSet = CreateWaitEventSet(NULL, 2);
354 33414 : latch_pos = AddWaitEventToSet(LatchWaitSet, WL_LATCH_SET, PGINVALID_SOCKET,
355 : MyLatch, NULL);
356 33414 : if (IsUnderPostmaster)
357 32982 : AddWaitEventToSet(LatchWaitSet, WL_EXIT_ON_PM_DEATH,
358 : PGINVALID_SOCKET, NULL, NULL);
359 :
360 : Assert(latch_pos == LatchWaitSetLatchPos);
361 33414 : }
362 :
363 : void
364 0 : ShutdownLatchSupport(void)
365 : {
366 : #if defined(WAIT_USE_POLL)
367 : pqsignal(SIGURG, SIG_IGN);
368 : #endif
369 :
370 0 : if (LatchWaitSet)
371 : {
372 0 : FreeWaitEventSet(LatchWaitSet);
373 0 : LatchWaitSet = NULL;
374 : }
375 :
376 : #if defined(WAIT_USE_SELF_PIPE)
377 : close(selfpipe_readfd);
378 : close(selfpipe_writefd);
379 : selfpipe_readfd = -1;
380 : selfpipe_writefd = -1;
381 : selfpipe_owner_pid = InvalidPid;
382 : #endif
383 :
384 : #if defined(WAIT_USE_SIGNALFD)
385 0 : close(signal_fd);
386 0 : signal_fd = -1;
387 : #endif
388 0 : }
389 :
390 : /*
391 : * Initialize a process-local latch.
392 : */
393 : void
394 34948 : InitLatch(Latch *latch)
395 : {
396 34948 : latch->is_set = false;
397 34948 : latch->maybe_sleeping = false;
398 34948 : latch->owner_pid = MyProcPid;
399 34948 : latch->is_shared = false;
400 :
401 : #if defined(WAIT_USE_SELF_PIPE)
402 : /* Assert InitializeLatchSupport has been called in this process */
403 : Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
404 : #elif defined(WAIT_USE_SIGNALFD)
405 : /* Assert InitializeLatchSupport has been called in this process */
406 : Assert(signal_fd >= 0);
407 : #elif defined(WAIT_USE_WIN32)
408 : latch->event = CreateEvent(NULL, TRUE, FALSE, NULL);
409 : if (latch->event == NULL)
410 : elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
411 : #endif /* WIN32 */
412 34948 : }
413 :
414 : /*
415 : * Initialize a shared latch that can be set from other processes. The latch
416 : * is initially owned by no-one; use OwnLatch to associate it with the
417 : * current process.
418 : *
419 : * InitSharedLatch needs to be called in postmaster before forking child
420 : * processes, usually right after allocating the shared memory block
421 : * containing the latch with ShmemInitStruct. (The Unix implementation
422 : * doesn't actually require that, but the Windows one does.) Because of
423 : * this restriction, we have no concurrency issues to worry about here.
424 : *
425 : * Note that other handles created in this module are never marked as
426 : * inheritable. Thus we do not need to worry about cleaning up child
427 : * process references to postmaster-private latches or WaitEventSets.
428 : */
429 : void
430 171440 : InitSharedLatch(Latch *latch)
431 : {
432 : #ifdef WIN32
433 : SECURITY_ATTRIBUTES sa;
434 :
435 : /*
436 : * Set up security attributes to specify that the events are inherited.
437 : */
438 : ZeroMemory(&sa, sizeof(sa));
439 : sa.nLength = sizeof(sa);
440 : sa.bInheritHandle = TRUE;
441 :
442 : latch->event = CreateEvent(&sa, TRUE, FALSE, NULL);
443 : if (latch->event == NULL)
444 : elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
445 : #endif
446 :
447 171440 : latch->is_set = false;
448 171440 : latch->maybe_sleeping = false;
449 171440 : latch->owner_pid = 0;
450 171440 : latch->is_shared = true;
451 171440 : }
452 :
453 : /*
454 : * Associate a shared latch with the current process, allowing it to
455 : * wait on the latch.
456 : *
457 : * Although there is a sanity check for latch-already-owned, we don't do
458 : * any sort of locking here, meaning that we could fail to detect the error
459 : * if two processes try to own the same latch at about the same time. If
460 : * there is any risk of that, caller must provide an interlock to prevent it.
461 : */
462 : void
463 33060 : OwnLatch(Latch *latch)
464 : {
465 : int owner_pid;
466 :
467 : /* Sanity checks */
468 : Assert(latch->is_shared);
469 :
470 : #if defined(WAIT_USE_SELF_PIPE)
471 : /* Assert InitializeLatchSupport has been called in this process */
472 : Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
473 : #elif defined(WAIT_USE_SIGNALFD)
474 : /* Assert InitializeLatchSupport has been called in this process */
475 : Assert(signal_fd >= 0);
476 : #endif
477 :
478 33060 : owner_pid = latch->owner_pid;
479 33060 : if (owner_pid != 0)
480 0 : elog(PANIC, "latch already owned by PID %d", owner_pid);
481 :
482 33060 : latch->owner_pid = MyProcPid;
483 33060 : }
484 :
485 : /*
486 : * Disown a shared latch currently owned by the current process.
487 : */
488 : void
489 32954 : DisownLatch(Latch *latch)
490 : {
491 : Assert(latch->is_shared);
492 : Assert(latch->owner_pid == MyProcPid);
493 :
494 32954 : latch->owner_pid = 0;
495 32954 : }
496 :
497 : /*
498 : * Wait for a given latch to be set, or for postmaster death, or until timeout
499 : * is exceeded. 'wakeEvents' is a bitmask that specifies which of those events
500 : * to wait for. If the latch is already set (and WL_LATCH_SET is given), the
501 : * function returns immediately.
502 : *
503 : * The "timeout" is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag
504 : * is given. Although it is declared as "long", we don't actually support
505 : * timeouts longer than INT_MAX milliseconds. Note that some extra overhead
506 : * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible.
507 : *
508 : * The latch must be owned by the current process, ie. it must be a
509 : * process-local latch initialized with InitLatch, or a shared latch
510 : * associated with the current process by calling OwnLatch.
511 : *
512 : * Returns bit mask indicating which condition(s) caused the wake-up. Note
513 : * that if multiple wake-up conditions are true, there is no guarantee that
514 : * we return all of them in one call, but we will return at least one.
515 : */
516 : int
517 585662 : WaitLatch(Latch *latch, int wakeEvents, long timeout,
518 : uint32 wait_event_info)
519 : {
520 : WaitEvent event;
521 :
522 : /* Postmaster-managed callers must handle postmaster death somehow. */
523 : Assert(!IsUnderPostmaster ||
524 : (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
525 : (wakeEvents & WL_POSTMASTER_DEATH));
526 :
527 : /*
528 : * Some callers may have a latch other than MyLatch, or no latch at all,
529 : * or want to handle postmaster death differently. It's cheap to assign
530 : * those, so just do it every time.
531 : */
532 585662 : if (!(wakeEvents & WL_LATCH_SET))
533 144 : latch = NULL;
534 585662 : ModifyWaitEvent(LatchWaitSet, LatchWaitSetLatchPos, WL_LATCH_SET, latch);
535 585662 : LatchWaitSet->exit_on_postmaster_death =
536 585662 : ((wakeEvents & WL_EXIT_ON_PM_DEATH) != 0);
537 :
538 585662 : if (WaitEventSetWait(LatchWaitSet,
539 585662 : (wakeEvents & WL_TIMEOUT) ? timeout : -1,
540 : &event, 1,
541 : wait_event_info) == 0)
542 41896 : return WL_TIMEOUT;
543 : else
544 543748 : return event.events;
545 : }
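
/*
 * Editor's sketch (not part of latch.c): the canonical wait loop built on
 * WaitLatch(), for illustration only.  Work is re-checked after
 * ResetLatch(), and the wait sits at the bottom of the loop, which is the
 * coding convention that SetLatch()'s race analysis below relies on.
 * my_work_pending() and do_work() are hypothetical placeholders.
 */
static void
example_latch_loop(void)
{
	for (;;)
	{
		int			rc;

		while (my_work_pending())
			do_work();

		rc = WaitLatch(MyLatch,
					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
					   1000L, WAIT_EVENT_EXTENSION);
		if (rc & WL_LATCH_SET)
		{
			ResetLatch(MyLatch);
			CHECK_FOR_INTERRUPTS();
		}
	}
}
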
546 :
547 : /*
548 : * Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
549 : * conditions.
550 : *
551 : * When waiting on a socket, EOF and error conditions always cause the socket
552 : * to be reported as readable/writable/connected, so that the caller can deal
553 : * with the condition.
554 : *
555 : * wakeEvents must include either WL_EXIT_ON_PM_DEATH for automatic exit
556 : * if the postmaster dies or WL_POSTMASTER_DEATH for a flag set in the
557 : * return value if the postmaster dies. The latter is useful for rare cases
558 : * where some behavior other than immediate exit is needed.
559 : *
560 : * NB: These days this is just a wrapper around the WaitEventSet API. When
561 : * using a latch very frequently, consider creating a longer living
562 : * WaitEventSet instead; that's more efficient.
563 : */
564 : int
565 110782 : WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock,
566 : long timeout, uint32 wait_event_info)
567 : {
568 110782 : int ret = 0;
569 : int rc;
570 : WaitEvent event;
571 110782 : WaitEventSet *set = CreateWaitEventSet(CurrentResourceOwner, 3);
572 :
573 110782 : if (wakeEvents & WL_TIMEOUT)
574 : Assert(timeout >= 0);
575 : else
576 26996 : timeout = -1;
577 :
578 110782 : if (wakeEvents & WL_LATCH_SET)
579 110484 : AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
580 : latch, NULL);
581 :
582 : /* Postmaster-managed callers must handle postmaster death somehow. */
583 : Assert(!IsUnderPostmaster ||
584 : (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
585 : (wakeEvents & WL_POSTMASTER_DEATH));
586 :
587 110782 : if ((wakeEvents & WL_POSTMASTER_DEATH) && IsUnderPostmaster)
588 0 : AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
589 : NULL, NULL);
590 :
591 110782 : if ((wakeEvents & WL_EXIT_ON_PM_DEATH) && IsUnderPostmaster)
592 110782 : AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
593 : NULL, NULL);
594 :
595 110782 : if (wakeEvents & WL_SOCKET_MASK)
596 : {
597 : int ev;
598 :
599 110782 : ev = wakeEvents & WL_SOCKET_MASK;
600 110782 : AddWaitEventToSet(set, ev, sock, NULL, NULL);
601 : }
602 :
603 110782 : rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
604 :
605 110782 : if (rc == 0)
606 242 : ret |= WL_TIMEOUT;
607 : else
608 : {
609 110540 : ret |= event.events & (WL_LATCH_SET |
610 : WL_POSTMASTER_DEATH |
611 : WL_SOCKET_MASK);
612 : }
613 :
614 110782 : FreeWaitEventSet(set);
615 :
616 110782 : return ret;
617 : }
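
/*
 * Editor's sketch (not part of latch.c): the longer-lived WaitEventSet
 * pattern recommended in the note above, avoiding per-wait set-up and
 * tear-down.  'sock' is a hypothetical connected socket owned by the
 * caller.
 */
static void
example_long_lived_wait(pgsocket sock)
{
	WaitEvent	event;
	WaitEventSet *set = CreateWaitEventSet(CurrentResourceOwner, 3);

	AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
	AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, NULL, NULL);
	AddWaitEventToSet(set, WL_SOCKET_READABLE, sock, NULL, NULL);

	for (;;)
	{
		/* Block until the latch is set or the socket becomes readable. */
		(void) WaitEventSetWait(set, -1, &event, 1, WAIT_EVENT_CLIENT_READ);

		if (event.events & WL_LATCH_SET)
		{
			ResetLatch(MyLatch);
			CHECK_FOR_INTERRUPTS();
		}
		if (event.events & WL_SOCKET_READABLE)
			break;				/* the caller now reads from 'sock' */
	}

	FreeWaitEventSet(set);
}
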
618 :
619 : /*
620 : * Sets a latch and wakes up anyone waiting on it.
621 : *
622 : * This is cheap if the latch is already set, otherwise not so much.
623 : *
624 : * NB: when calling this in a signal handler, be sure to save and restore
625 : * errno around it. (That's standard practice in most signal handlers, of
626 : * course, but we used to omit it in handlers that only set a flag.)
627 : *
628 : * NB: this function is called from critical sections and signal handlers so
629 : * throwing an error is not a good idea.
630 : */
631 : void
632 1305456 : SetLatch(Latch *latch)
633 : {
634 : #ifndef WIN32
635 : pid_t owner_pid;
636 : #else
637 : HANDLE handle;
638 : #endif
639 :
640 : /*
641 : * The memory barrier has to be placed here to ensure that any flag
642 : * variables possibly changed by this process have been flushed to main
643 : * memory, before we check/set is_set.
644 : */
645 1305456 : pg_memory_barrier();
646 :
647 : /* Quick exit if already set */
648 1305456 : if (latch->is_set)
649 317470 : return;
650 :
651 987986 : latch->is_set = true;
652 :
653 987986 : pg_memory_barrier();
654 987986 : if (!latch->maybe_sleeping)
655 105886 : return;
656 :
657 : #ifndef WIN32
658 :
659 : /*
660 : * See if anyone's waiting for the latch. It can be the current process if
661 : * we're in a signal handler. We write to the self-pipe or send SIGURG to
662 : * ourselves to wake up WaitEventSetWaitBlock() without races in that
663 : * case. If it's another process, send a signal.
664 : *
665 : * Fetch owner_pid only once, in case the latch is concurrently getting
666 : * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't
667 : * guaranteed to be true! In practice, the effective range of pid_t fits
668 : * in a 32 bit integer, and so should be atomic. In the worst case, we
669 : * might end up signaling the wrong process. Even then, you're very
670 : * unlucky if a process with that bogus pid exists and belongs to
671 : * Postgres; and PG database processes should handle excess SIGUSR1
672 : * interrupts without a problem anyhow.
673 : *
674 : * Another sort of race condition that's possible here is for a new
675 : * process to own the latch immediately after we look, so we don't signal
676 : * it. This is okay so long as all callers of ResetLatch/WaitLatch follow
677 : * the standard coding convention of waiting at the bottom of their loops,
678 : * not the top, so that they'll correctly process latch-setting events
679 : * that happen before they enter the loop.
680 : */
681 882100 : owner_pid = latch->owner_pid;
682 882100 : if (owner_pid == 0)
683 0 : return;
684 882100 : else if (owner_pid == MyProcPid)
685 : {
686 : #if defined(WAIT_USE_SELF_PIPE)
687 : if (waiting)
688 : sendSelfPipeByte();
689 : #else
690 338398 : if (waiting)
691 338398 : kill(MyProcPid, SIGURG);
692 : #endif
693 : }
694 : else
695 543702 : kill(owner_pid, SIGURG);
696 :
697 : #else
698 :
699 : /*
700 : * See if anyone's waiting for the latch. It can be the current process if
701 : * we're in a signal handler.
702 : *
703 : * Use a local variable here just in case somebody changes the event field
704 : * concurrently (which really should not happen).
705 : */
706 : handle = latch->event;
707 : if (handle)
708 : {
709 : SetEvent(handle);
710 :
711 : /*
712 : * Note that we silently ignore any errors. We might be in a signal
713 : * handler or other critical path where it's not safe to call elog().
714 : */
715 : }
716 : #endif
717 : }
718 :
719 : /*
720 : * Clear the latch. Calling WaitLatch after this will sleep, unless
721 : * the latch is set again before the WaitLatch call.
722 : */
723 : void
724 3464530 : ResetLatch(Latch *latch)
725 : {
726 : /* Only the owner should reset the latch */
727 : Assert(latch->owner_pid == MyProcPid);
728 : Assert(latch->maybe_sleeping == false);
729 :
730 3464530 : latch->is_set = false;
731 :
732 : /*
733 : * Ensure that the write to is_set gets flushed to main memory before we
734 : * examine any flag variables. Otherwise a concurrent SetLatch might
735 : * falsely conclude that it needn't signal us, even though we have missed
736 : * seeing some flag updates that SetLatch was supposed to inform us of.
737 : */
738 3464530 : pg_memory_barrier();
739 3464530 : }
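
/*
 * Editor's sketch (not part of latch.c): the is_set / maybe_sleeping
 * handshake used by SetLatch() and WaitEventSetWait(), re-created as a
 * stand-alone demo with C11 atomics and two threads.  Sequentially
 * consistent operations stand in for pg_memory_barrier().  Because each
 * side stores its own flag before loading the other's, both loads cannot
 * read false, so at least one of the two messages below must appear: the
 * waiter never sleeps while a wakeup is owed.  Hypothetical demo code.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool demo_is_set;
static atomic_bool demo_maybe_sleeping;

static void *
demo_setter(void *arg)
{
	(void) arg;
	atomic_store(&demo_is_set, true);	/* store our flag first ... */
	if (atomic_load(&demo_maybe_sleeping))	/* ... then check the other */
		printf("setter: waiter may sleep, sending wakeup\n");
	return NULL;
}

int
main(void)
{
	pthread_t	t;

	atomic_store(&demo_maybe_sleeping, true);	/* store our flag first ... */
	pthread_create(&t, NULL, demo_setter, NULL);
	if (atomic_load(&demo_is_set))	/* ... then re-check before sleeping */
		printf("waiter: latch already set, skipping sleep\n");
	pthread_join(t, NULL);
	return 0;
}
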
740 :
741 : /*
742 : * Create a WaitEventSet with space for nevents different events to wait for.
743 : *
744 : * These events can then be efficiently waited upon together, using
745 : * WaitEventSetWait().
746 : *
747 : * The WaitEventSet is tracked by the given 'resowner'. Use NULL for session
748 : * lifetime.
749 : */
750 : WaitEventSet *
751 169984 : CreateWaitEventSet(ResourceOwner resowner, int nevents)
752 : {
753 : WaitEventSet *set;
754 : char *data;
755 169984 : Size sz = 0;
756 :
757 : /*
758 : * Use MAXALIGN size/alignment to guarantee that later uses of memory are
759 : * aligned correctly. E.g. epoll_event might need 8 byte alignment on some
760 : * platforms, but earlier allocations like WaitEventSet and WaitEvent
761 : * might not be sized to guarantee that when purely using sizeof().
762 : */
763 169984 : sz += MAXALIGN(sizeof(WaitEventSet));
764 169984 : sz += MAXALIGN(sizeof(WaitEvent) * nevents);
765 :
766 : #if defined(WAIT_USE_EPOLL)
767 169984 : sz += MAXALIGN(sizeof(struct epoll_event) * nevents);
768 : #elif defined(WAIT_USE_KQUEUE)
769 : sz += MAXALIGN(sizeof(struct kevent) * nevents);
770 : #elif defined(WAIT_USE_POLL)
771 : sz += MAXALIGN(sizeof(struct pollfd) * nevents);
772 : #elif defined(WAIT_USE_WIN32)
773 : /* need space for the pgwin32_signal_event */
774 : sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1));
775 : #endif
776 :
777 169984 : if (resowner != NULL)
778 79376 : ResourceOwnerEnlarge(resowner);
779 :
780 169984 : data = (char *) MemoryContextAllocZero(TopMemoryContext, sz);
781 :
782 169984 : set = (WaitEventSet *) data;
783 169984 : data += MAXALIGN(sizeof(WaitEventSet));
784 :
785 169984 : set->events = (WaitEvent *) data;
786 169984 : data += MAXALIGN(sizeof(WaitEvent) * nevents);
787 :
788 : #if defined(WAIT_USE_EPOLL)
789 169984 : set->epoll_ret_events = (struct epoll_event *) data;
790 169984 : data += MAXALIGN(sizeof(struct epoll_event) * nevents);
791 : #elif defined(WAIT_USE_KQUEUE)
792 : set->kqueue_ret_events = (struct kevent *) data;
793 : data += MAXALIGN(sizeof(struct kevent) * nevents);
794 : #elif defined(WAIT_USE_POLL)
795 : set->pollfds = (struct pollfd *) data;
796 : data += MAXALIGN(sizeof(struct pollfd) * nevents);
797 : #elif defined(WAIT_USE_WIN32)
798 : set->handles = (HANDLE *) data;
799 : data += MAXALIGN(sizeof(HANDLE) * nevents);
800 : #endif
801 :
802 169984 : set->latch = NULL;
803 169984 : set->nevents_space = nevents;
804 169984 : set->exit_on_postmaster_death = false;
805 :
806 169984 : if (resowner != NULL)
807 : {
808 79376 : ResourceOwnerRememberWaitEventSet(resowner, set);
809 79376 : set->owner = resowner;
810 : }
811 :
812 : #if defined(WAIT_USE_EPOLL)
813 169984 : if (!AcquireExternalFD())
814 : {
815 : /* treat this as though epoll_create1 itself returned EMFILE */
816 0 : elog(ERROR, "epoll_create1 failed: %m");
817 : }
818 169984 : set->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
819 169984 : if (set->epoll_fd < 0)
820 : {
821 0 : ReleaseExternalFD();
822 0 : elog(ERROR, "epoll_create1 failed: %m");
823 : }
824 : #elif defined(WAIT_USE_KQUEUE)
825 : if (!AcquireExternalFD())
826 : {
827 : /* treat this as though kqueue itself returned EMFILE */
828 : elog(ERROR, "kqueue failed: %m");
829 : }
830 : set->kqueue_fd = kqueue();
831 : if (set->kqueue_fd < 0)
832 : {
833 : ReleaseExternalFD();
834 : elog(ERROR, "kqueue failed: %m");
835 : }
836 : if (fcntl(set->kqueue_fd, F_SETFD, FD_CLOEXEC) == -1)
837 : {
838 : int save_errno = errno;
839 :
840 : close(set->kqueue_fd);
841 : ReleaseExternalFD();
842 : errno = save_errno;
843 : elog(ERROR, "fcntl(F_SETFD) failed on kqueue descriptor: %m");
844 : }
845 : set->report_postmaster_not_running = false;
846 : #elif defined(WAIT_USE_WIN32)
847 :
848 : /*
849 : * To handle signals while waiting, we need to add a win32 specific event.
850 : * We accounted for the additional event at the top of this routine. See
851 : * port/win32/signal.c for more details.
852 : *
853 : * Note: pgwin32_signal_event should be first to ensure that it will be
854 : * reported when multiple events are set. We want to guarantee that
855 : * pending signals are serviced.
856 : */
857 : set->handles[0] = pgwin32_signal_event;
858 : StaticAssertStmt(WSA_INVALID_EVENT == NULL, "");
859 : #endif
860 :
861 169984 : return set;
862 : }
863 :
864 : /*
865 : * Free a previously created WaitEventSet.
866 : *
867 : * Note: preferably, this shouldn't have to free any resources that could be
868 : * inherited across an exec(). If it did, we'd likely leak those resources in
869 : * many scenarios. For the epoll case, we ensure that by setting EPOLL_CLOEXEC
870 : * when the FD is created. For the Windows case, we assume that the handles
871 : * involved are non-inheritable.
872 : */
873 : void
874 112516 : FreeWaitEventSet(WaitEventSet *set)
875 : {
876 112516 : if (set->owner)
877 : {
878 79374 : ResourceOwnerForgetWaitEventSet(set->owner, set);
879 79374 : set->owner = NULL;
880 : }
881 :
882 : #if defined(WAIT_USE_EPOLL)
883 112516 : close(set->epoll_fd);
884 112516 : ReleaseExternalFD();
885 : #elif defined(WAIT_USE_KQUEUE)
886 : close(set->kqueue_fd);
887 : ReleaseExternalFD();
888 : #elif defined(WAIT_USE_WIN32)
889 : for (WaitEvent *cur_event = set->events;
890 : cur_event < (set->events + set->nevents);
891 : cur_event++)
892 : {
893 : if (cur_event->events & WL_LATCH_SET)
894 : {
895 : /* uses the latch's HANDLE */
896 : }
897 : else if (cur_event->events & WL_POSTMASTER_DEATH)
898 : {
899 : /* uses PostmasterHandle */
900 : }
901 : else
902 : {
903 : /* Clean up the event object we created for the socket */
904 : WSAEventSelect(cur_event->fd, NULL, 0);
905 : WSACloseEvent(set->handles[cur_event->pos + 1]);
906 : }
907 : }
908 : #endif
909 :
910 112516 : pfree(set);
911 112516 : }
912 :
913 : /*
914 : * Free a previously created WaitEventSet in a child process after a fork().
915 : */
916 : void
917 29774 : FreeWaitEventSetAfterFork(WaitEventSet *set)
918 : {
919 : #if defined(WAIT_USE_EPOLL)
920 29774 : close(set->epoll_fd);
921 29774 : ReleaseExternalFD();
922 : #elif defined(WAIT_USE_KQUEUE)
923 : /* kqueues are not normally inherited by child processes */
924 : ReleaseExternalFD();
925 : #endif
926 :
927 29774 : pfree(set);
928 29774 : }
929 :
930 : /* ---
931 : * Add an event to the set. Possible events are:
932 : * - WL_LATCH_SET: Wait for the latch to be set
933 : * - WL_POSTMASTER_DEATH: Wait for postmaster to die
934 : * - WL_SOCKET_READABLE: Wait for socket to become readable,
935 : * can be combined in one event with other WL_SOCKET_* events
936 : * - WL_SOCKET_WRITEABLE: Wait for socket to become writeable,
937 : * can be combined with other WL_SOCKET_* events
938 : * - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
939 : * can be combined with other WL_SOCKET_* events (on non-Windows
940 : * platforms, this is the same as WL_SOCKET_WRITEABLE)
941 : * - WL_SOCKET_ACCEPT: Wait for new connection to a server socket,
942 : * can be combined with other WL_SOCKET_* events (on non-Windows
943 : * platforms, this is the same as WL_SOCKET_READABLE)
944 : * - WL_SOCKET_CLOSED: Wait for socket to be closed by remote peer.
945 : * - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies
946 : *
947 : * Returns the offset in WaitEventSet->events (starting from 0), which can be
948 : * used to modify previously added wait events using ModifyWaitEvent().
949 : *
950 : * In the WL_LATCH_SET case the latch must be owned by the current process,
951 : * i.e. it must be a process-local latch initialized with InitLatch, or a
952 : * shared latch associated with the current process by calling OwnLatch.
953 : *
954 : * In the WL_SOCKET_READABLE/WRITEABLE/CONNECTED/ACCEPT cases, EOF and error
955 : * conditions cause the socket to be reported as readable/writable/connected,
956 : * so that the caller can deal with the condition.
957 : *
958 : * The user_data pointer specified here will be set for the events returned
959 : * by WaitEventSetWait(), making it easy to associate additional data with
960 : * events.
961 : */
962 : int
963 471222 : AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
964 : void *user_data)
965 : {
966 : WaitEvent *event;
967 :
968 : /* not enough space */
969 : Assert(set->nevents < set->nevents_space);
970 :
971 471222 : if (events == WL_EXIT_ON_PM_DEATH)
972 : {
973 143970 : events = WL_POSTMASTER_DEATH;
974 143970 : set->exit_on_postmaster_death = true;
975 : }
976 :
977 471222 : if (latch)
978 : {
979 169480 : if (latch->owner_pid != MyProcPid)
980 0 : elog(ERROR, "cannot wait on a latch owned by another process");
981 169480 : if (set->latch)
982 0 : elog(ERROR, "cannot wait on more than one latch");
983 169480 : if ((events & WL_LATCH_SET) != WL_LATCH_SET)
984 0 : elog(ERROR, "latch events only support being set");
985 : }
986 : else
987 : {
988 301742 : if (events & WL_LATCH_SET)
989 0 : elog(ERROR, "cannot wait on latch without a specified latch");
990 : }
991 :
992 : /* waiting for socket readiness without a socket indicates a bug */
993 471222 : if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK))
994 0 : elog(ERROR, "cannot wait on socket event without a socket");
995 :
996 471222 : event = &set->events[set->nevents];
997 471222 : event->pos = set->nevents++;
998 471222 : event->fd = fd;
999 471222 : event->events = events;
1000 471222 : event->user_data = user_data;
1001 : #ifdef WIN32
1002 : event->reset = false;
1003 : #endif
1004 :
1005 471222 : if (events == WL_LATCH_SET)
1006 : {
1007 169480 : set->latch = latch;
1008 169480 : set->latch_pos = event->pos;
1009 : #if defined(WAIT_USE_SELF_PIPE)
1010 : event->fd = selfpipe_readfd;
1011 : #elif defined(WAIT_USE_SIGNALFD)
1012 169480 : event->fd = signal_fd;
1013 : #else
1014 : event->fd = PGINVALID_SOCKET;
1015 : #ifdef WAIT_USE_EPOLL
1016 : return event->pos;
1017 : #endif
1018 : #endif
1019 : }
1020 301742 : else if (events == WL_POSTMASTER_DEATH)
1021 : {
1022 : #ifndef WIN32
1023 166508 : event->fd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
1024 : #endif
1025 : }
1026 :
1027 : /* perform wait primitive specific initialization, if needed */
1028 : #if defined(WAIT_USE_EPOLL)
1029 471222 : WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
1030 : #elif defined(WAIT_USE_KQUEUE)
1031 : WaitEventAdjustKqueue(set, event, 0);
1032 : #elif defined(WAIT_USE_POLL)
1033 : WaitEventAdjustPoll(set, event);
1034 : #elif defined(WAIT_USE_WIN32)
1035 : WaitEventAdjustWin32(set, event);
1036 : #endif
1037 :
1038 471222 : return event->pos;
1039 : }
1040 :
1041 : /*
1042 : * Change the event mask and, in the WL_LATCH_SET case, the latch associated
1043 : * with the WaitEvent. The latch may be changed to NULL to disable the latch
1044 : * temporarily, and then set back to a latch later.
1045 : *
1046 : * 'pos' is the id returned by AddWaitEventToSet.
1047 : */
1048 : void
1049 1044332 : ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
1050 : {
1051 : WaitEvent *event;
1052 : #if defined(WAIT_USE_KQUEUE)
1053 : int old_events;
1054 : #endif
1055 :
1056 : Assert(pos < set->nevents);
1057 :
1058 1044332 : event = &set->events[pos];
1059 : #if defined(WAIT_USE_KQUEUE)
1060 : old_events = event->events;
1061 : #endif
1062 :
1063 : /*
1064 : * If neither the event mask nor the associated latch changes, return
1065 : * early. That's an important optimization for some sockets, where
1066 : * ModifyWaitEvent is frequently used to switch from waiting for reads to
1067 : * waiting on writes.
1068 : */
1069 1044332 : if (events == event->events &&
1070 1024444 : (!(event->events & WL_LATCH_SET) || set->latch == latch))
1071 973028 : return;
1072 :
1073 71304 : if (event->events & WL_LATCH_SET &&
1074 51416 : events != event->events)
1075 : {
1076 0 : elog(ERROR, "cannot modify latch event");
1077 : }
1078 :
1079 71304 : if (event->events & WL_POSTMASTER_DEATH)
1080 : {
1081 0 : elog(ERROR, "cannot modify postmaster death event");
1082 : }
1083 :
1084 : /* FIXME: validate event mask */
1085 71304 : event->events = events;
1086 :
1087 71304 : if (events == WL_LATCH_SET)
1088 : {
1089 51416 : if (latch && latch->owner_pid != MyProcPid)
1090 0 : elog(ERROR, "cannot wait on a latch owned by another process");
1091 51416 : set->latch = latch;
1092 :
1093 : /*
1094 : * On Unix, we don't need to modify the kernel object because the
1095 : * underlying pipe (if there is one) is the same for all latches so we
1096 : * can return immediately. On Windows, we need to update our array of
1097 : * handles, but we leave the old one in place and tolerate spurious
1098 : * wakeups if the latch is disabled.
1099 : */
1100 : #if defined(WAIT_USE_WIN32)
1101 : if (!latch)
1102 : return;
1103 : #else
1104 51416 : return;
1105 : #endif
1106 : }
1107 :
1108 : #if defined(WAIT_USE_EPOLL)
1109 19888 : WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
1110 : #elif defined(WAIT_USE_KQUEUE)
1111 : WaitEventAdjustKqueue(set, event, old_events);
1112 : #elif defined(WAIT_USE_POLL)
1113 : WaitEventAdjustPoll(set, event);
1114 : #elif defined(WAIT_USE_WIN32)
1115 : WaitEventAdjustWin32(set, event);
1116 : #endif
1117 : }
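
/*
 * Editor's sketch (not part of latch.c): the read/write direction
 * switching that the early-exit above is optimized for.  'set' and
 * 'sock_pos' are hypothetical: a caller's set and the position returned
 * by AddWaitEventToSet() for a socket event.
 */
static void
example_switch_direction(WaitEventSet *set, int sock_pos)
{
	WaitEvent	event;

	/* Wait until we may send without blocking. */
	ModifyWaitEvent(set, sock_pos, WL_SOCKET_WRITEABLE, NULL);
	(void) WaitEventSetWait(set, -1, &event, 1, WAIT_EVENT_CLIENT_WRITE);

	/*
	 * Switch back to waiting for input.  A later identical call returns
	 * via the early exit without touching the kernel object.
	 */
	ModifyWaitEvent(set, sock_pos, WL_SOCKET_READABLE, NULL);
	(void) WaitEventSetWait(set, -1, &event, 1, WAIT_EVENT_CLIENT_READ);
}
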
1118 :
1119 : #if defined(WAIT_USE_EPOLL)
1120 : /*
1121 : * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
1122 : */
1123 : static void
1124 491110 : WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
1125 : {
1126 : struct epoll_event epoll_ev;
1127 : int rc;
1128 :
1129 : /* pointer to our event, returned by epoll_wait */
1130 491110 : epoll_ev.data.ptr = event;
1131 : /* always wait for errors */
1132 491110 : epoll_ev.events = EPOLLERR | EPOLLHUP;
1133 :
1134 : /* prepare epoll_event entry once */
1135 491110 : if (event->events == WL_LATCH_SET)
1136 : {
1137 : Assert(set->latch != NULL);
1138 169480 : epoll_ev.events |= EPOLLIN;
1139 : }
1140 321630 : else if (event->events == WL_POSTMASTER_DEATH)
1141 : {
1142 166508 : epoll_ev.events |= EPOLLIN;
1143 : }
1144 : else
1145 : {
1146 : Assert(event->fd != PGINVALID_SOCKET);
1147 : Assert(event->events & (WL_SOCKET_READABLE |
1148 : WL_SOCKET_WRITEABLE |
1149 : WL_SOCKET_CLOSED));
1150 :
1151 155122 : if (event->events & WL_SOCKET_READABLE)
1152 129742 : epoll_ev.events |= EPOLLIN;
1153 155122 : if (event->events & WL_SOCKET_WRITEABLE)
1154 25868 : epoll_ev.events |= EPOLLOUT;
1155 155122 : if (event->events & WL_SOCKET_CLOSED)
1156 0 : epoll_ev.events |= EPOLLRDHUP;
1157 : }
1158 :
1159 : /*
1160 : * Even though unused, we also pass epoll_ev as the data argument if
1161 : * EPOLL_CTL_DEL is passed as action. There used to be an epoll bug
1162 : * requiring that, and actually it makes the code simpler...
1163 : */
1164 491110 : rc = epoll_ctl(set->epoll_fd, action, event->fd, &epoll_ev);
1165 :
1166 491110 : if (rc < 0)
1167 0 : ereport(ERROR,
1168 : (errcode_for_socket_access(),
1169 : errmsg("%s() failed: %m",
1170 : "epoll_ctl")));
1171 491110 : }
1172 : #endif
1173 :
1174 : #if defined(WAIT_USE_POLL)
1175 : static void
1176 : WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
1177 : {
1178 : struct pollfd *pollfd = &set->pollfds[event->pos];
1179 :
1180 : pollfd->revents = 0;
1181 : pollfd->fd = event->fd;
1182 :
1183 : /* prepare pollfd entry once */
1184 : if (event->events == WL_LATCH_SET)
1185 : {
1186 : Assert(set->latch != NULL);
1187 : pollfd->events = POLLIN;
1188 : }
1189 : else if (event->events == WL_POSTMASTER_DEATH)
1190 : {
1191 : pollfd->events = POLLIN;
1192 : }
1193 : else
1194 : {
1195 : Assert(event->events & (WL_SOCKET_READABLE |
1196 : WL_SOCKET_WRITEABLE |
1197 : WL_SOCKET_CLOSED));
1198 : pollfd->events = 0;
1199 : if (event->events & WL_SOCKET_READABLE)
1200 : pollfd->events |= POLLIN;
1201 : if (event->events & WL_SOCKET_WRITEABLE)
1202 : pollfd->events |= POLLOUT;
1203 : #ifdef POLLRDHUP
1204 : if (event->events & WL_SOCKET_CLOSED)
1205 : pollfd->events |= POLLRDHUP;
1206 : #endif
1207 : }
1208 :
1209 : Assert(event->fd != PGINVALID_SOCKET);
1210 : }
1211 : #endif
1212 :
1213 : #if defined(WAIT_USE_KQUEUE)
1214 :
1215 : /*
1216 : * On most BSD family systems, the udata member of struct kevent is of type
1217 : * void *, so we could directly convert to/from WaitEvent *. Unfortunately,
1218 : * NetBSD has it as intptr_t, so here we wallpaper over that difference with
1219 : * an lvalue cast.
1220 : */
1221 : #define AccessWaitEvent(k_ev) (*((WaitEvent **)(&(k_ev)->udata)))
1222 :
1223 : static inline void
1224 : WaitEventAdjustKqueueAdd(struct kevent *k_ev, int filter, int action,
1225 : WaitEvent *event)
1226 : {
1227 : k_ev->ident = event->fd;
1228 : k_ev->filter = filter;
1229 : k_ev->flags = action;
1230 : k_ev->fflags = 0;
1231 : k_ev->data = 0;
1232 : AccessWaitEvent(k_ev) = event;
1233 : }
1234 :
1235 : static inline void
1236 : WaitEventAdjustKqueueAddPostmaster(struct kevent *k_ev, WaitEvent *event)
1237 : {
1238 : /* For now postmaster death can only be added, not removed. */
1239 : k_ev->ident = PostmasterPid;
1240 : k_ev->filter = EVFILT_PROC;
1241 : k_ev->flags = EV_ADD;
1242 : k_ev->fflags = NOTE_EXIT;
1243 : k_ev->data = 0;
1244 : AccessWaitEvent(k_ev) = event;
1245 : }
1246 :
1247 : static inline void
1248 : WaitEventAdjustKqueueAddLatch(struct kevent *k_ev, WaitEvent *event)
1249 : {
1250 : /* For now latch can only be added, not removed. */
1251 : k_ev->ident = SIGURG;
1252 : k_ev->filter = EVFILT_SIGNAL;
1253 : k_ev->flags = EV_ADD;
1254 : k_ev->fflags = 0;
1255 : k_ev->data = 0;
1256 : AccessWaitEvent(k_ev) = event;
1257 : }
1258 :
1259 : /*
1260 : * old_events is the previous event mask, used to compute what has changed.
1261 : */
1262 : static void
1263 : WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
1264 : {
1265 : int rc;
1266 : struct kevent k_ev[2];
1267 : int count = 0;
1268 : bool new_filt_read = false;
1269 : bool old_filt_read = false;
1270 : bool new_filt_write = false;
1271 : bool old_filt_write = false;
1272 :
1273 : if (old_events == event->events)
1274 : return;
1275 :
1276 : Assert(event->events != WL_LATCH_SET || set->latch != NULL);
1277 : Assert(event->events == WL_LATCH_SET ||
1278 : event->events == WL_POSTMASTER_DEATH ||
1279 : (event->events & (WL_SOCKET_READABLE |
1280 : WL_SOCKET_WRITEABLE |
1281 : WL_SOCKET_CLOSED)));
1282 :
1283 : if (event->events == WL_POSTMASTER_DEATH)
1284 : {
1285 : /*
1286 : * Unlike all the other implementations, we detect postmaster death
1287 : * using process notification instead of waiting on the postmaster
1288 : * alive pipe.
1289 : */
1290 : WaitEventAdjustKqueueAddPostmaster(&k_ev[count++], event);
1291 : }
1292 : else if (event->events == WL_LATCH_SET)
1293 : {
1294 : /* We detect latch wakeup using a signal event. */
1295 : WaitEventAdjustKqueueAddLatch(&k_ev[count++], event);
1296 : }
1297 : else
1298 : {
1299 : /*
1300 : * We need to compute the adds and deletes required to get from the
1301 : * old event mask to the new event mask, since kevent treats readable
1302 : * and writable as separate events.
1303 : */
1304 : if (old_events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
1305 : old_filt_read = true;
1306 : if (event->events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
1307 : new_filt_read = true;
1308 : if (old_events & WL_SOCKET_WRITEABLE)
1309 : old_filt_write = true;
1310 : if (event->events & WL_SOCKET_WRITEABLE)
1311 : new_filt_write = true;
1312 : if (old_filt_read && !new_filt_read)
1313 : WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_DELETE,
1314 : event);
1315 : else if (!old_filt_read && new_filt_read)
1316 : WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_ADD,
1317 : event);
1318 : if (old_filt_write && !new_filt_write)
1319 : WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_DELETE,
1320 : event);
1321 : else if (!old_filt_write && new_filt_write)
1322 : WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_ADD,
1323 : event);
1324 : }
1325 :
1326 : /* For WL_SOCKET_READ -> WL_SOCKET_CLOSED, no change needed. */
1327 : if (count == 0)
1328 : return;
1329 :
1330 : Assert(count <= 2);
1331 :
1332 : rc = kevent(set->kqueue_fd, &k_ev[0], count, NULL, 0, NULL);
1333 :
1334 : /*
1335 : * When adding the postmaster's pid, we have to consider that it might
1336 : * already have exited and perhaps even been replaced by another process
1337 : * with the same pid. If so, we have to defer reporting this as an event
1338 : * until the next call to WaitEventSetWaitBlock().
1339 : */
1340 :
1341 : if (rc < 0)
1342 : {
1343 : if (event->events == WL_POSTMASTER_DEATH &&
1344 : (errno == ESRCH || errno == EACCES))
1345 : set->report_postmaster_not_running = true;
1346 : else
1347 : ereport(ERROR,
1348 : (errcode_for_socket_access(),
1349 : errmsg("%s() failed: %m",
1350 : "kevent")));
1351 : }
1352 : else if (event->events == WL_POSTMASTER_DEATH &&
1353 : PostmasterPid != getppid() &&
1354 : !PostmasterIsAlive())
1355 : {
1356 : /*
1357 : * The extra PostmasterIsAlive() check prevents false alarms
1358 : * on systems that give a different value for getppid() while being
1359 : * traced by a debugger.
1360 : */
1361 : set->report_postmaster_not_running = true;
1362 : }
1363 : }
1364 :
1365 : #endif
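
/*
 * Editor's sketch (not part of latch.c): a stand-alone demonstration of
 * waiting for SIGURG with kqueue's EVFILT_SIGNAL, as the implementation
 * above does.  EVFILT_SIGNAL fires even though the disposition is SIG_IGN,
 * which is why InitializeLatchSupport() can simply ignore the signal.
 * BSD/macOS-only hypothetical demo code.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <signal.h>
#include <stdio.h>

int
main(void)
{
	int			kq = kqueue();
	struct kevent kev;

	signal(SIGURG, SIG_IGN);	/* delivery ignored; kqueue still counts it */
	EV_SET(&kev, SIGURG, EVFILT_SIGNAL, EV_ADD, 0, 0, NULL);
	kevent(kq, &kev, 1, NULL, 0, NULL);

	raise(SIGURG);

	if (kevent(kq, NULL, 0, &kev, 1, NULL) == 1)
		printf("SIGURG observed %d time(s) via kqueue\n", (int) kev.data);
	return 0;
}
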
1366 :
1367 : #if defined(WAIT_USE_WIN32)
1368 : static void
1369 : WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
1370 : {
1371 : HANDLE *handle = &set->handles[event->pos + 1];
1372 :
1373 : if (event->events == WL_LATCH_SET)
1374 : {
1375 : Assert(set->latch != NULL);
1376 : *handle = set->latch->event;
1377 : }
1378 : else if (event->events == WL_POSTMASTER_DEATH)
1379 : {
1380 : *handle = PostmasterHandle;
1381 : }
1382 : else
1383 : {
1384 : int flags = FD_CLOSE; /* always check for errors/EOF */
1385 :
1386 : if (event->events & WL_SOCKET_READABLE)
1387 : flags |= FD_READ;
1388 : if (event->events & WL_SOCKET_WRITEABLE)
1389 : flags |= FD_WRITE;
1390 : if (event->events & WL_SOCKET_CONNECTED)
1391 : flags |= FD_CONNECT;
1392 : if (event->events & WL_SOCKET_ACCEPT)
1393 : flags |= FD_ACCEPT;
1394 :
1395 : if (*handle == WSA_INVALID_EVENT)
1396 : {
1397 : *handle = WSACreateEvent();
1398 : if (*handle == WSA_INVALID_EVENT)
1399 : elog(ERROR, "failed to create event for socket: error code %d",
1400 : WSAGetLastError());
1401 : }
1402 : if (WSAEventSelect(event->fd, *handle, flags) != 0)
1403 : elog(ERROR, "failed to set up event for socket: error code %d",
1404 : WSAGetLastError());
1405 :
1406 : Assert(event->fd != PGINVALID_SOCKET);
1407 : }
1408 : }
1409 : #endif
1410 :
1411 : /*
1412 : * Wait for events added to the set to happen, or until the timeout is
1413 : * reached. At most nevents occurred events are returned.
1414 : *
1415 : * If timeout = -1, block until an event occurs; if 0, check sockets for
1416 : * readiness, but don't block; if > 0, block for at most timeout milliseconds.
1417 : *
1418 : * Returns the number of events that occurred, or 0 if the timeout was reached.
1419 : *
1420 : * Returned events will have the fd, pos, user_data fields set to the
1421 : * values associated with the registered event.
1422 : */
1423 : int
1424 1462064 : WaitEventSetWait(WaitEventSet *set, long timeout,
1425 : WaitEvent *occurred_events, int nevents,
1426 : uint32 wait_event_info)
1427 : {
1428 1462064 : int returned_events = 0;
1429 : instr_time start_time;
1430 : instr_time cur_time;
1431 1462064 : long cur_timeout = -1;
1432 :
1433 : Assert(nevents > 0);
1434 :
1435 : /*
1436 : * Initialize timeout if requested. We must record the current time so
1437 : * that we can determine the remaining timeout if interrupted.
1438 : */
1439 1462064 : if (timeout >= 0)
1440 : {
1441 591046 : INSTR_TIME_SET_CURRENT(start_time);
1442 : Assert(timeout >= 0 && timeout <= INT_MAX);
1443 591046 : cur_timeout = timeout;
1444 : }
1445 : else
1446 871018 : INSTR_TIME_SET_ZERO(start_time);
1447 :
1448 1462064 : pgstat_report_wait_start(wait_event_info);
1449 :
1450 : #ifndef WIN32
1451 1462064 : waiting = true;
1452 : #else
1453 : /* Ensure that signals are serviced even if latch is already set */
1454 : pgwin32_dispatch_queued_signals();
1455 : #endif
1456 3152320 : while (returned_events == 0)
1457 : {
1458 : int rc;
1459 :
1460 : /*
1461 : * Check if the latch is set already. If so, leave the loop
1462 : * immediately to avoid blocking again. We don't attempt to report any
1463 : * other events that might also be satisfied.
1464 : *
1465 : * If someone sets the latch between this and the
1466 : * WaitEventSetWaitBlock() below, the setter will write a byte to the
1467 : * pipe (or signal us and the signal handler will do that), and the
1468 : * readiness routine will return immediately.
1469 : *
1470 : * On Unix, if there's a pending byte in the self pipe, we'll notice
1471 : * whenever blocking. Only clearing the pipe in that case avoids
1472 : * having to drain it every time WaitLatchOrSocket() is used. Should
1473 : * the pipe-buffer fill up we're still ok, because the pipe is in
1474 : * nonblocking mode. It's unlikely for that to happen, because the
1475 : * self pipe isn't filled unless we're blocking (waiting = true), or
1476 : * from inside a signal handler in latch_sigurg_handler().
1477 : *
1478 : * On Windows, we'll also notice if there's a pending event for the
1479 : * latch when blocking, but there's no danger of anything filling up,
1480 : * as "Setting an event that is already set has no effect".
1481 : *
1482 : * Note: we assume that the kernel calls involved in latch management
1483 : * will provide adequate synchronization on machines with weak memory
1484 : * ordering, so that we cannot miss seeing is_set if a notification
1485 : * has already been queued.
1486 : */
1487 2131772 : if (set->latch && !set->latch->is_set)
1488 : {
1489 : /* about to sleep on a latch */
1490 1731832 : set->latch->maybe_sleeping = true;
1491 1731832 : pg_memory_barrier();
1492 : /* and recheck */
1493 : }
1494 :
1495 2131772 : if (set->latch && set->latch->is_set)
1496 : {
1497 399358 : occurred_events->fd = PGINVALID_SOCKET;
1498 399358 : occurred_events->pos = set->latch_pos;
1499 399358 : occurred_events->user_data =
1500 399358 : set->events[set->latch_pos].user_data;
1501 399358 : occurred_events->events = WL_LATCH_SET;
1502 399358 : occurred_events++;
1503 399358 : returned_events++;
1504 :
1505 : /* could have been set above */
1506 399358 : set->latch->maybe_sleeping = false;
1507 :
1508 399358 : break;
1509 : }
1510 :
1511 : /*
1512 : * Wait for events using the readiness primitive chosen at the top of
1513 : * this file. If -1 is returned, a timeout has occurred; if 0, we have
1514 : * to retry; everything >= 1 is the number of returned events.
1515 : */
1516 1732414 : rc = WaitEventSetWaitBlock(set, cur_timeout,
1517 : occurred_events, nevents);
1518 :
1519 1732396 : if (set->latch)
1520 : {
1521 : Assert(set->latch->maybe_sleeping);
1522 1731750 : set->latch->maybe_sleeping = false;
1523 : }
1524 :
1525 1732396 : if (rc == -1)
1526 42134 : break; /* timeout occurred */
1527 : else
1528 1690262 : returned_events = rc;
1529 :
1530 : /* If we're not done, update cur_timeout for next iteration */
1531 1690262 : if (returned_events == 0 && timeout >= 0)
1532 : {
1533 655902 : INSTR_TIME_SET_CURRENT(cur_time);
1534 655902 : INSTR_TIME_SUBTRACT(cur_time, start_time);
1535 655902 : cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
1536 655902 : if (cur_timeout <= 0)
1537 6 : break;
1538 : }
1539 : }
1540 : #ifndef WIN32
1541 1462046 : waiting = false;
1542 : #endif
1543 :
1544 1462046 : pgstat_report_wait_end();
1545 :
1546 1462046 : return returned_events;
1547 : }
1548 :
1549 :
1550 : #if defined(WAIT_USE_EPOLL)
1551 :
1552 : /*
1553 : * Wait using linux's epoll_wait(2).
1554 : *
1555 : * This is the preferable wait method, as several readiness notifications are
1556 : * delivered, without having to iterate through all of set->events. The
1557 : * returned epoll_event structs contain a pointer to our events, making
1558 : * association easy.
1559 : */
1560 : static inline int
1561 1732414 : WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1562 : WaitEvent *occurred_events, int nevents)
1563 : {
1564 1732414 : int returned_events = 0;
1565 : int rc;
1566 : WaitEvent *cur_event;
1567 : struct epoll_event *cur_epoll_event;
1568 :
1569 : /* Sleep */
1570 1732414 : rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,
1571 1732414 : Min(nevents, set->nevents_space), cur_timeout);
1572 :
1573 : /* Check return code */
1574 1732414 : if (rc < 0)
1575 : {
1576 : /* EINTR is okay, otherwise complain */
1577 338238 : if (errno != EINTR)
1578 : {
1579 0 : waiting = false;
1580 0 : ereport(ERROR,
1581 : (errcode_for_socket_access(),
1582 : errmsg("%s() failed: %m",
1583 : "epoll_wait")));
1584 : }
1585 338238 : return 0;
1586 : }
1587 1394176 : else if (rc == 0)
1588 : {
1589 : /* timeout exceeded */
1590 42134 : return -1;
1591 : }
1592 :
1593 : /*
1594 : * At least one event occurred, iterate over the returned epoll events
1595 : * until they're either all processed, or we've returned all the events
1596 : * the caller desired.
1597 : */
1598 1352042 : for (cur_epoll_event = set->epoll_ret_events;
1599 2705224 : cur_epoll_event < (set->epoll_ret_events + rc) &&
1600 : returned_events < nevents;
1601 1353182 : cur_epoll_event++)
1602 : {
1603 : /* epoll's data pointer is set to the associated WaitEvent */
1604 1353200 : cur_event = (WaitEvent *) cur_epoll_event->data.ptr;
1605 :
1606 1353200 : occurred_events->pos = cur_event->pos;
1607 1353200 : occurred_events->user_data = cur_event->user_data;
1608 1353200 : occurred_events->events = 0;
1609 :
1610 1353200 : if (cur_event->events == WL_LATCH_SET &&
1611 873428 : cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
1612 : {
1613 : /* Drain the signalfd. */
1614 873428 : drain();
1615 :
1616 873428 : if (set->latch && set->latch->is_set)
1617 : {
1618 540890 : occurred_events->fd = PGINVALID_SOCKET;
1619 540890 : occurred_events->events = WL_LATCH_SET;
1620 540890 : occurred_events++;
1621 540890 : returned_events++;
1622 : }
1623 : }
1624 479772 : else if (cur_event->events == WL_POSTMASTER_DEATH &&
1625 18 : cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
1626 : {
1627 : /*
1628 : * We expect an EPOLLHUP when the remote end is closed, but
1629 : * because we don't expect the pipe to become readable or to have
1630 : * any errors either, treat those cases as postmaster death, too.
1631 : *
1632 : * Be paranoid about a spurious event signaling the postmaster as
1633 : * being dead. There have been reports about that happening with
1634 : * older primitives (select(2) to be specific), and a spurious
1635 : * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
1636 : * cost much.
1637 : */
1638 18 : if (!PostmasterIsAliveInternal())
1639 : {
1640 18 : if (set->exit_on_postmaster_death)
1641 18 : proc_exit(1);
1642 0 : occurred_events->fd = PGINVALID_SOCKET;
1643 0 : occurred_events->events = WL_POSTMASTER_DEATH;
1644 0 : occurred_events++;
1645 0 : returned_events++;
1646 : }
1647 : }
1648 479754 : else if (cur_event->events & (WL_SOCKET_READABLE |
1649 : WL_SOCKET_WRITEABLE |
1650 : WL_SOCKET_CLOSED))
1651 : {
1652 : Assert(cur_event->fd != PGINVALID_SOCKET);
1653 :
1654 479754 : if ((cur_event->events & WL_SOCKET_READABLE) &&
1655 459104 : (cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)))
1656 : {
1657 : /* data available in socket, or EOF */
1658 428568 : occurred_events->events |= WL_SOCKET_READABLE;
1659 : }
1660 :
1661 479754 : if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1662 53870 : (cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)))
1663 : {
1664 : /* writable, or EOF */
1665 52872 : occurred_events->events |= WL_SOCKET_WRITEABLE;
1666 : }
1667 :
1668 479754 : if ((cur_event->events & WL_SOCKET_CLOSED) &&
1669 0 : (cur_epoll_event->events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)))
1670 : {
1671 : /* remote peer shut down, or error */
1672 0 : occurred_events->events |= WL_SOCKET_CLOSED;
1673 : }
1674 :
1675 479754 : if (occurred_events->events != 0)
1676 : {
1677 479754 : occurred_events->fd = cur_event->fd;
1678 479754 : occurred_events++;
1679 479754 : returned_events++;
1680 : }
1681 : }
1682 : }
1683 :
1684 1352024 : return returned_events;
1685 : }
1686 :
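/*
 * Illustrative sketch (not part of latch.c): how the data.ptr association
 * used above works in general.  When registering an fd, the caller stores
 * an arbitrary pointer in epoll_event.data.ptr; epoll_wait() hands the
 * same pointer back in its output array, so no search through the
 * registered events is needed.  my_register_fd() and struct my_event are
 * hypothetical names.
 */
#if 0
struct my_event
{
	int			fd;
	void	   *user_data;
};

static void
my_register_fd(int epoll_fd, struct my_event *ev)
{
	struct epoll_event epoll_ev = {0};

	epoll_ev.events = EPOLLIN;
	epoll_ev.data.ptr = ev;		/* remembered by the kernel ... */

	if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, ev->fd, &epoll_ev) < 0)
		elog(ERROR, "epoll_ctl() failed: %m");

	/* ... and returned verbatim in epoll_wait()'s output array */
}
#endif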
1687 : #elif defined(WAIT_USE_KQUEUE)
1688 :
1689 : /*
1690 : * Wait using kevent(2) on BSD-family systems and macOS.
1691 : *
1692 : * For now this mirrors the epoll code, but in the future it could modify
1693 : * the fd set in the same kevent() call it uses for waiting, instead of
1694 : * doing that with separate system calls.
1695 : */
1696 : static int
1697 : WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1698 : WaitEvent *occurred_events, int nevents)
1699 : {
1700 : int returned_events = 0;
1701 : int rc;
1702 : WaitEvent *cur_event;
1703 : struct kevent *cur_kqueue_event;
1704 : struct timespec timeout;
1705 : struct timespec *timeout_p;
1706 :
1707 : if (cur_timeout < 0)
1708 : timeout_p = NULL;
1709 : else
1710 : {
1711 : timeout.tv_sec = cur_timeout / 1000;
1712 : timeout.tv_nsec = (cur_timeout % 1000) * 1000000;
1713 : timeout_p = &timeout;
1714 : }
1715 :
1716 : /*
1717 : * Report postmaster events discovered by WaitEventAdjustKqueue() or an
1718 : * earlier call to WaitEventSetWait().
1719 : */
1720 : if (unlikely(set->report_postmaster_not_running))
1721 : {
1722 : if (set->exit_on_postmaster_death)
1723 : proc_exit(1);
1724 : occurred_events->fd = PGINVALID_SOCKET;
1725 : occurred_events->events = WL_POSTMASTER_DEATH;
1726 : return 1;
1727 : }
1728 :
1729 : /* Sleep */
1730 : rc = kevent(set->kqueue_fd, NULL, 0,
1731 : set->kqueue_ret_events,
1732 : Min(nevents, set->nevents_space),
1733 : timeout_p);
1734 :
1735 : /* Check return code */
1736 : if (rc < 0)
1737 : {
1738 : /* EINTR is okay, otherwise complain */
1739 : if (errno != EINTR)
1740 : {
1741 : waiting = false;
1742 : ereport(ERROR,
1743 : (errcode_for_socket_access(),
1744 : errmsg("%s() failed: %m",
1745 : "kevent")));
1746 : }
1747 : return 0;
1748 : }
1749 : else if (rc == 0)
1750 : {
1751 : /* timeout exceeded */
1752 : return -1;
1753 : }
1754 :
1755 : /*
1756 : * At least one event occurred, iterate over the returned kqueue events
1757 : * until they're either all processed, or we've returned all the events
1758 : * the caller desired.
1759 : */
1760 : for (cur_kqueue_event = set->kqueue_ret_events;
1761 : cur_kqueue_event < (set->kqueue_ret_events + rc) &&
1762 : returned_events < nevents;
1763 : cur_kqueue_event++)
1764 : {
1765 : /* kevent's udata points to the associated WaitEvent */
1766 : cur_event = AccessWaitEvent(cur_kqueue_event);
1767 :
1768 : occurred_events->pos = cur_event->pos;
1769 : occurred_events->user_data = cur_event->user_data;
1770 : occurred_events->events = 0;
1771 :
1772 : if (cur_event->events == WL_LATCH_SET &&
1773 : cur_kqueue_event->filter == EVFILT_SIGNAL)
1774 : {
1775 : if (set->latch && set->latch->is_set)
1776 : {
1777 : occurred_events->fd = PGINVALID_SOCKET;
1778 : occurred_events->events = WL_LATCH_SET;
1779 : occurred_events++;
1780 : returned_events++;
1781 : }
1782 : }
1783 : else if (cur_event->events == WL_POSTMASTER_DEATH &&
1784 : cur_kqueue_event->filter == EVFILT_PROC &&
1785 : (cur_kqueue_event->fflags & NOTE_EXIT) != 0)
1786 : {
1787 : /*
1788 : * The kernel will tell this kqueue object only once about the
1789 : * exit of the postmaster, so let's remember that for next time so
1790 : * that we provide level-triggered semantics.
1791 : */
1792 : set->report_postmaster_not_running = true;
1793 :
1794 : if (set->exit_on_postmaster_death)
1795 : proc_exit(1);
1796 : occurred_events->fd = PGINVALID_SOCKET;
1797 : occurred_events->events = WL_POSTMASTER_DEATH;
1798 : occurred_events++;
1799 : returned_events++;
1800 : }
1801 : else if (cur_event->events & (WL_SOCKET_READABLE |
1802 : WL_SOCKET_WRITEABLE |
1803 : WL_SOCKET_CLOSED))
1804 : {
1805 : Assert(cur_event->fd >= 0);
1806 :
1807 : if ((cur_event->events & WL_SOCKET_READABLE) &&
1808 : (cur_kqueue_event->filter == EVFILT_READ))
1809 : {
1810 : /* readable, or EOF */
1811 : occurred_events->events |= WL_SOCKET_READABLE;
1812 : }
1813 :
1814 : if ((cur_event->events & WL_SOCKET_CLOSED) &&
1815 : (cur_kqueue_event->filter == EVFILT_READ) &&
1816 : (cur_kqueue_event->flags & EV_EOF))
1817 : {
1818 : /* the remote peer has shut down */
1819 : occurred_events->events |= WL_SOCKET_CLOSED;
1820 : }
1821 :
1822 : if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1823 : (cur_kqueue_event->filter == EVFILT_WRITE))
1824 : {
1825 : /* writable, or EOF */
1826 : occurred_events->events |= WL_SOCKET_WRITEABLE;
1827 : }
1828 :
1829 : if (occurred_events->events != 0)
1830 : {
1831 : occurred_events->fd = cur_event->fd;
1832 : occurred_events++;
1833 : returned_events++;
1834 : }
1835 : }
1836 : }
1837 :
1838 : return returned_events;
1839 : }
1840 :
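/*
 * Illustrative sketch (not part of latch.c): the remember-and-replay
 * pattern used above to turn kqueue's one-shot NOTE_EXIT notification
 * into level-triggered semantics.  Once the flag is set, every later
 * wait reports postmaster death without consulting the kernel again.
 * my_wait_reports_death() and struct my_wait_state are hypothetical
 * names.
 */
#if 0
struct my_wait_state
{
	bool		postmaster_gone;	/* sticky: set once, never cleared */
};

/* Returns true when the watched process is known to have exited. */
static bool
my_wait_reports_death(struct my_wait_state *state, int kq)
{
	struct kevent ev;

	if (state->postmaster_gone)
		return true;			/* replay the one-shot kernel event */

	if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1 &&
		ev.filter == EVFILT_PROC && (ev.fflags & NOTE_EXIT) != 0)
		state->postmaster_gone = true;	/* remember it for next time */

	return state->postmaster_gone;
}
#endif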
1841 : #elif defined(WAIT_USE_POLL)
1842 :
1843 : /*
1844 : * Wait using poll(2).
1845 : *
1846 : * This allows receiving readiness notifications for several events at once,
1847 : * but requires iterating through all of set->pollfds.
1848 : */
1849 : static inline int
1850 : WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1851 : WaitEvent *occurred_events, int nevents)
1852 : {
1853 : int returned_events = 0;
1854 : int rc;
1855 : WaitEvent *cur_event;
1856 : struct pollfd *cur_pollfd;
1857 :
1858 : /* Sleep */
1859 : rc = poll(set->pollfds, set->nevents, (int) cur_timeout);
1860 :
1861 : /* Check return code */
1862 : if (rc < 0)
1863 : {
1864 : /* EINTR is okay, otherwise complain */
1865 : if (errno != EINTR)
1866 : {
1867 : waiting = false;
1868 : ereport(ERROR,
1869 : (errcode_for_socket_access(),
1870 : errmsg("%s() failed: %m",
1871 : "poll")));
1872 : }
1873 : return 0;
1874 : }
1875 : else if (rc == 0)
1876 : {
1877 : /* timeout exceeded */
1878 : return -1;
1879 : }
1880 :
1881 : for (cur_event = set->events, cur_pollfd = set->pollfds;
1882 : cur_event < (set->events + set->nevents) &&
1883 : returned_events < nevents;
1884 : cur_event++, cur_pollfd++)
1885 : {
1886 : /* no activity on this FD, skip */
1887 : if (cur_pollfd->revents == 0)
1888 : continue;
1889 :
1890 : occurred_events->pos = cur_event->pos;
1891 : occurred_events->user_data = cur_event->user_data;
1892 : occurred_events->events = 0;
1893 :
1894 : if (cur_event->events == WL_LATCH_SET &&
1895 : (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
1896 : {
1897 : /* There's data in the self-pipe, clear it. */
1898 : drain();
1899 :
1900 : if (set->latch && set->latch->is_set)
1901 : {
1902 : occurred_events->fd = PGINVALID_SOCKET;
1903 : occurred_events->events = WL_LATCH_SET;
1904 : occurred_events++;
1905 : returned_events++;
1906 : }
1907 : }
1908 : else if (cur_event->events == WL_POSTMASTER_DEATH &&
1909 : (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
1910 : {
1911 : /*
1912 : * We expect an POLLHUP when the remote end is closed, but because
1913 : * we don't expect the pipe to become readable or to have any
1914 : * errors either, treat those cases as postmaster death, too.
1915 : *
1916 : * Be paranoid about a spurious event signaling the postmaster as
1917 : * being dead. There have been reports about that happening with
1918 : * older primitives (select(2) to be specific), and a spurious
1919 : * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
1920 : * cost much.
1921 : */
1922 : if (!PostmasterIsAliveInternal())
1923 : {
1924 : if (set->exit_on_postmaster_death)
1925 : proc_exit(1);
1926 : occurred_events->fd = PGINVALID_SOCKET;
1927 : occurred_events->events = WL_POSTMASTER_DEATH;
1928 : occurred_events++;
1929 : returned_events++;
1930 : }
1931 : }
1932 : else if (cur_event->events & (WL_SOCKET_READABLE |
1933 : WL_SOCKET_WRITEABLE |
1934 : WL_SOCKET_CLOSED))
1935 : {
1936 : int errflags = POLLHUP | POLLERR | POLLNVAL;
1937 :
1938 : Assert(cur_event->fd >= PGINVALID_SOCKET);
1939 :
1940 : if ((cur_event->events & WL_SOCKET_READABLE) &&
1941 : (cur_pollfd->revents & (POLLIN | errflags)))
1942 : {
1943 : /* data available in socket, or EOF */
1944 : occurred_events->events |= WL_SOCKET_READABLE;
1945 : }
1946 :
1947 : if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1948 : (cur_pollfd->revents & (POLLOUT | errflags)))
1949 : {
1950 : /* writeable, or EOF */
1951 : occurred_events->events |= WL_SOCKET_WRITEABLE;
1952 : }
1953 :
1954 : #ifdef POLLRDHUP
1955 : if ((cur_event->events & WL_SOCKET_CLOSED) &&
1956 : (cur_pollfd->revents & (POLLRDHUP | errflags)))
1957 : {
1958 : /* remote peer closed, or error */
1959 : occurred_events->events |= WL_SOCKET_CLOSED;
1960 : }
1961 : #endif
1962 :
1963 : if (occurred_events->events != 0)
1964 : {
1965 : occurred_events->fd = cur_event->fd;
1966 : occurred_events++;
1967 : returned_events++;
1968 : }
1969 : }
1970 : }
1971 : return returned_events;
1972 : }
1973 :
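/*
 * Illustrative sketch (not part of latch.c): detecting a remote shutdown
 * with POLLRDHUP, as the WL_SOCKET_CLOSED branch above does.  POLLRDHUP
 * is a Linux extension (hence the #ifdef above); on platforms without it,
 * poll() cannot distinguish "peer shut down" from ordinary readability.
 * peer_has_shut_down() is a hypothetical helper; sock is assumed to be a
 * connected stream socket.
 */
#if 0
#ifdef POLLRDHUP
static bool
peer_has_shut_down(int sock)
{
	struct pollfd pfd;

	pfd.fd = sock;
	pfd.events = POLLRDHUP;
	pfd.revents = 0;

	/* zero timeout: just sample the current state, don't sleep */
	if (poll(&pfd, 1, 0) < 0)
		return false;			/* treat errors as "don't know" */

	return (pfd.revents & (POLLRDHUP | POLLHUP | POLLERR)) != 0;
}
#endif
#endif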
1974 : #elif defined(WAIT_USE_WIN32)
1975 :
1976 : /*
1977 : * Wait using Windows' WaitForMultipleObjects(). Each call only "consumes" one
1978 : * event, so we keep calling until we've filled up our output buffer to match
1979 : * the behavior of the other implementations.
1980 : *
1981 : * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273
1982 : */
1983 : static inline int
1984 : WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1985 : WaitEvent *occurred_events, int nevents)
1986 : {
1987 : int returned_events = 0;
1988 : DWORD rc;
1989 : WaitEvent *cur_event;
1990 :
1991 : /* Reset any wait events that need it */
1992 : for (cur_event = set->events;
1993 : cur_event < (set->events + set->nevents);
1994 : cur_event++)
1995 : {
1996 : if (cur_event->reset)
1997 : {
1998 : WaitEventAdjustWin32(set, cur_event);
1999 : cur_event->reset = false;
2000 : }
2001 :
2002 : /*
2003 : * We associate the socket with a new event handle for each
2004 : * WaitEventSet. FD_CLOSE is only generated once if the other end
2005 : * closes gracefully. Therefore we might miss the FD_CLOSE
2006 : * notification, if it was delivered to another event after we stopped
2007 : * waiting for it. Close that race by peeking for EOF after setting
2008 : * up this handle to receive notifications, and before entering the
2009 : * sleep.
2010 : *
2011 : * XXX If we had one event handle for the lifetime of a socket, we
2012 : * wouldn't need this.
2013 : */
2014 : if (cur_event->events & WL_SOCKET_READABLE)
2015 : {
2016 : char c;
2017 : WSABUF buf;
2018 : DWORD received;
2019 : DWORD flags;
2020 :
2021 : buf.buf = &c;
2022 : buf.len = 1;
2023 : flags = MSG_PEEK;
2024 : if (WSARecv(cur_event->fd, &buf, 1, &received, &flags, NULL, NULL) == 0)
2025 : {
2026 : occurred_events->pos = cur_event->pos;
2027 : occurred_events->user_data = cur_event->user_data;
2028 : occurred_events->events = WL_SOCKET_READABLE;
2029 : occurred_events->fd = cur_event->fd;
2030 : return 1;
2031 : }
2032 : }
2033 :
2034 : /*
2035 : * Windows does not guarantee to log an FD_WRITE network event
2036 : * indicating that more data can be sent unless the previous send()
2037 : * failed with WSAEWOULDBLOCK. While our caller might well have made
2038 : * such a call, we cannot assume that here. Therefore, if waiting for
2039 : * write-ready, force the issue by doing a dummy send(). If the dummy
2040 : * send() succeeds, assume that the socket is in fact write-ready, and
2041 : * return immediately. Also, if it fails with something other than
2042 : * WSAEWOULDBLOCK, return a write-ready indication to let our caller
2043 : * deal with the error condition.
2044 : */
2045 : if (cur_event->events & WL_SOCKET_WRITEABLE)
2046 : {
2047 : char c;
2048 : WSABUF buf;
2049 : DWORD sent;
2050 : int r;
2051 :
2052 : buf.buf = &c;
2053 : buf.len = 0;
2054 :
2055 : r = WSASend(cur_event->fd, &buf, 1, &sent, 0, NULL, NULL);
2056 : if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
2057 : {
2058 : occurred_events->pos = cur_event->pos;
2059 : occurred_events->user_data = cur_event->user_data;
2060 : occurred_events->events = WL_SOCKET_WRITEABLE;
2061 : occurred_events->fd = cur_event->fd;
2062 : return 1;
2063 : }
2064 : }
2065 : }
2066 :
2067 : /*
2068 : * Sleep.
2069 : *
2070 : * Need to wait on ->nevents + 1 handles, because the signal handle is in [0].
2071 : */
2072 : rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
2073 : cur_timeout);
2074 :
2075 : /* Check return code */
2076 : if (rc == WAIT_FAILED)
2077 : elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",
2078 : GetLastError());
2079 : else if (rc == WAIT_TIMEOUT)
2080 : {
2081 : /* timeout exceeded */
2082 : return -1;
2083 : }
2084 :
2085 : if (rc == WAIT_OBJECT_0)
2086 : {
2087 : /* Service newly-arrived signals */
2088 : pgwin32_dispatch_queued_signals();
2089 : return 0; /* retry */
2090 : }
2091 :
2092 : /*
2093 : * With an offset of one, due to the always-present pgwin32_signal_event,
2094 : * the handle offset directly corresponds to a wait event.
2095 : */
2096 : cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1];
2097 :
2098 : for (;;)
2099 : {
2100 : int next_pos;
2101 : int count;
2102 :
2103 : occurred_events->pos = cur_event->pos;
2104 : occurred_events->user_data = cur_event->user_data;
2105 : occurred_events->events = 0;
2106 :
2107 : if (cur_event->events == WL_LATCH_SET)
2108 : {
2109 : /*
2110 : * We cannot use set->latch->event to reset the fired event if we
2111 : * aren't waiting on this latch now.
2112 : */
2113 : if (!ResetEvent(set->handles[cur_event->pos + 1]))
2114 : elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());
2115 :
2116 : if (set->latch && set->latch->is_set)
2117 : {
2118 : occurred_events->fd = PGINVALID_SOCKET;
2119 : occurred_events->events = WL_LATCH_SET;
2120 : occurred_events++;
2121 : returned_events++;
2122 : }
2123 : }
2124 : else if (cur_event->events == WL_POSTMASTER_DEATH)
2125 : {
2126 : /*
2127 : * Postmaster apparently died. Since the consequences of falsely
2128 : * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we
2129 : * take the trouble to positively verify this with
2130 : * PostmasterIsAlive(), even though there is no known reason to
2131 : * think that the event could be falsely set on Windows.
2132 : */
2133 : if (!PostmasterIsAliveInternal())
2134 : {
2135 : if (set->exit_on_postmaster_death)
2136 : proc_exit(1);
2137 : occurred_events->fd = PGINVALID_SOCKET;
2138 : occurred_events->events = WL_POSTMASTER_DEATH;
2139 : occurred_events++;
2140 : returned_events++;
2141 : }
2142 : }
2143 : else if (cur_event->events & WL_SOCKET_MASK)
2144 : {
2145 : WSANETWORKEVENTS resEvents;
2146 : HANDLE handle = set->handles[cur_event->pos + 1];
2147 :
2148 : Assert(cur_event->fd);
2149 :
2150 : occurred_events->fd = cur_event->fd;
2151 :
2152 : ZeroMemory(&resEvents, sizeof(resEvents));
2153 : if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0)
2154 : elog(ERROR, "failed to enumerate network events: error code %d",
2155 : WSAGetLastError());
2156 : if ((cur_event->events & WL_SOCKET_READABLE) &&
2157 : (resEvents.lNetworkEvents & FD_READ))
2158 : {
2159 : /* data available in socket */
2160 : occurred_events->events |= WL_SOCKET_READABLE;
2161 :
2162 : /*------
2163 : * WaitForMultipleObjects doesn't guarantee that a read event
2164 : * will be returned if the latch is set at the same time. Even
2165 : * if it did, the caller might drop that event expecting it to
2166 : * reoccur on the next call. So, we must force the event to be
2167 : * reset if this WaitEventSet is used again in order to avoid
2168 : * an indefinite hang.
2169 : *
2170 : * Refer to
2171 : * https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx
2172 : * for the behavior of socket events.
2173 : *------
2174 : */
2175 : cur_event->reset = true;
2176 : }
2177 : if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
2178 : (resEvents.lNetworkEvents & FD_WRITE))
2179 : {
2180 : /* writeable */
2181 : occurred_events->events |= WL_SOCKET_WRITEABLE;
2182 : }
2183 : if ((cur_event->events & WL_SOCKET_CONNECTED) &&
2184 : (resEvents.lNetworkEvents & FD_CONNECT))
2185 : {
2186 : /* connected */
2187 : occurred_events->events |= WL_SOCKET_CONNECTED;
2188 : }
2189 : if ((cur_event->events & WL_SOCKET_ACCEPT) &&
2190 : (resEvents.lNetworkEvents & FD_ACCEPT))
2191 : {
2192 : /* incoming connection could be accepted */
2193 : occurred_events->events |= WL_SOCKET_ACCEPT;
2194 : }
2195 : if (resEvents.lNetworkEvents & FD_CLOSE)
2196 : {
2197 : /* EOF/error, so signal all caller-requested socket flags */
2198 : occurred_events->events |= (cur_event->events & WL_SOCKET_MASK);
2199 : }
2200 :
2201 : if (occurred_events->events != 0)
2202 : {
2203 : occurred_events++;
2204 : returned_events++;
2205 : }
2206 : }
2207 :
2208 : /* Is the output buffer full? */
2209 : if (returned_events == nevents)
2210 : break;
2211 :
2212 : /* Have we run out of possible events? */
2213 : next_pos = cur_event->pos + 1;
2214 : if (next_pos == set->nevents)
2215 : break;
2216 :
2217 : /*
2218 : * Poll the rest of the event handles in the array starting at
2219 : * next_pos being careful to skip over the initial signal handle too.
2220 : * This time we use a zero timeout.
2221 : */
2222 : count = set->nevents - next_pos;
2223 : rc = WaitForMultipleObjects(count,
2224 : set->handles + 1 + next_pos,
2225 : false,
2226 : 0);
2227 :
2228 : /*
2229 : * We don't distinguish between errors and WAIT_TIMEOUT here because
2230 : * we already have events to report.
2231 : */
2232 : if (rc < WAIT_OBJECT_0 || rc >= WAIT_OBJECT_0 + count)
2233 : break;
2234 :
2235 : /* We have another event to decode. */
2236 : cur_event = &set->events[next_pos + (rc - WAIT_OBJECT_0)];
2237 : }
2238 :
2239 : return returned_events;
2240 : }
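
/*
 * Illustrative sketch (not part of latch.c): the harvesting pattern
 * described above.  WaitForMultipleObjects() reports only the
 * lowest-numbered signaled handle per call, so after decoding one event
 * we re-poll the handles after it with a zero timeout, collecting every
 * handle that is already signaled without sleeping again.
 * record_event(), handles, nhandles and timeout_ms are hypothetical
 * names.
 */
#if 0
	DWORD		rc = WaitForMultipleObjects(nhandles, handles, FALSE,
											timeout_ms);

	while (rc != WAIT_TIMEOUT && rc != WAIT_FAILED)
	{
		int			pos = rc - WAIT_OBJECT_0;

		record_event(pos);		/* decode handles[pos] */

		if (pos + 1 == nhandles)
			break;				/* no handles left to poll */

		/* zero timeout: report only handles that are already signaled */
		rc = WaitForMultipleObjects(nhandles - (pos + 1),
									handles + pos + 1, FALSE, 0);
		if (rc == WAIT_TIMEOUT || rc == WAIT_FAILED)
			break;

		/* translate the sub-array result back to an absolute position */
		rc = WAIT_OBJECT_0 + (pos + 1) + (rc - WAIT_OBJECT_0);
	}
#endif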
2241 : #endif
2242 :
2243 : /*
2244 : * Return whether the current build options can report WL_SOCKET_CLOSED.
2245 : */
2246 : bool
2247 1966 : WaitEventSetCanReportClosed(void)
2248 : {
2249 : #if (defined(WAIT_USE_POLL) && defined(POLLRDHUP)) || \
2250 : defined(WAIT_USE_EPOLL) || \
2251 : defined(WAIT_USE_KQUEUE)
2252 1966 : return true;
2253 : #else
2254 : return false;
2255 : #endif
2256 : }
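
/*
 * Illustrative sketch (not part of latch.c): a caller can use
 * WaitEventSetCanReportClosed() to decide whether it may ask for
 * WL_SOCKET_CLOSED, falling back to plain readability otherwise.  The
 * wait set "set" and socket "sock" are assumed to exist already.
 */
#if 0
	uint32		events = WL_SOCKET_READABLE;

	if (WaitEventSetCanReportClosed())
		events |= WL_SOCKET_CLOSED; /* also detect remote shutdown */

	AddWaitEventToSet(set, events, sock, NULL, NULL);
#endif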
2257 :
2258 : /*
2259 : * Get the number of wait events registered in a given WaitEventSet.
2260 : */
2261 : int
2262 220 : GetNumRegisteredWaitEvents(WaitEventSet *set)
2263 : {
2264 220 : return set->nevents;
2265 : }
2266 :
2267 : #if defined(WAIT_USE_SELF_PIPE)
2268 :
2269 : /*
2270 : * SetLatch uses SIGURG to wake up the process waiting on the latch.
2271 : *
2272 : * Wake up WaitLatch, if we're waiting.
2273 : */
2274 : static void
2275 : latch_sigurg_handler(SIGNAL_ARGS)
2276 : {
2277 : if (waiting)
2278 : sendSelfPipeByte();
2279 : }
2280 :
2281 : /* Send one byte to the self-pipe, to wake up WaitLatch */
2282 : static void
2283 : sendSelfPipeByte(void)
2284 : {
2285 : int rc;
2286 : char dummy = 0;
2287 :
2288 : retry:
2289 : rc = write(selfpipe_writefd, &dummy, 1);
2290 : if (rc < 0)
2291 : {
2292 : /* If interrupted by signal, just retry */
2293 : if (errno == EINTR)
2294 : goto retry;
2295 :
2296 : /*
2297 : * If the pipe is full, we don't need to retry, the data that's there
2298 : * already is enough to wake up WaitLatch.
2299 : */
2300 : if (errno == EAGAIN || errno == EWOULDBLOCK)
2301 : return;
2302 :
2303 : /*
2304 : * Oops, the write() failed for some other reason. We might be in a
2305 : * signal handler, so it's not safe to elog(). We have no choice but to
2306 : * silently ignore the error.
2307 : */
2308 : return;
2309 : }
2310 : }
2311 :
2312 : #endif
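
/*
 * Illustrative sketch (not part of latch.c): creating the kind of
 * non-blocking self-pipe that sendSelfPipeByte() writes to and drain()
 * empties.  The real setup lives in InitializeLatchSupport() earlier in
 * this file; this stripped-down version only shows why both ends must be
 * non-blocking: the signal handler's write() must not block when the pipe
 * is full, and drain()'s read() must stop once the pipe is empty.
 * make_self_pipe() is a hypothetical name.
 */
#if 0
static void
make_self_pipe(int *read_fd, int *write_fd)
{
	int			fds[2];
	int			flags;

	if (pipe(fds) < 0)
		elog(FATAL, "pipe() failed: %m");

	flags = fcntl(fds[0], F_GETFL);
	if (flags < 0 || fcntl(fds[0], F_SETFL, flags | O_NONBLOCK) == -1)
		elog(FATAL, "fcntl() failed on read end of self-pipe: %m");

	flags = fcntl(fds[1], F_GETFL);
	if (flags < 0 || fcntl(fds[1], F_SETFL, flags | O_NONBLOCK) == -1)
		elog(FATAL, "fcntl() failed on write end of self-pipe: %m");

	*read_fd = fds[0];
	*write_fd = fds[1];
}
#endif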
2313 :
2314 : #if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)
2315 :
2316 : /*
2317 : * Read all available data from self-pipe or signalfd.
2318 : *
2319 : * Note: this is only called when waiting = true. If it fails and doesn't
2320 : * return normally (i.e., it throws ERROR), it must reset that flag first
2321 : * (though ideally, this will never happen).
2322 : */
2323 : static void
2324 873428 : drain(void)
2325 : {
2326 : char buf[1024];
2327 : int rc;
2328 : int fd;
2329 :
2330 : #ifdef WAIT_USE_SELF_PIPE
2331 : fd = selfpipe_readfd;
2332 : #else
2333 873428 : fd = signal_fd;
2334 : #endif
2335 :
2336 : for (;;)
2337 : {
2338 873428 : rc = read(fd, buf, sizeof(buf));
2339 873428 : if (rc < 0)
2340 : {
2341 0 : if (errno == EAGAIN || errno == EWOULDBLOCK)
2342 : break; /* the descriptor is empty */
2343 0 : else if (errno == EINTR)
2344 0 : continue; /* retry */
2345 : else
2346 : {
2347 0 : waiting = false;
2348 : #ifdef WAIT_USE_SELF_PIPE
2349 : elog(ERROR, "read() on self-pipe failed: %m");
2350 : #else
2351 0 : elog(ERROR, "read() on signalfd failed: %m");
2352 : #endif
2353 : }
2354 : }
2355 873428 : else if (rc == 0)
2356 : {
2357 0 : waiting = false;
2358 : #ifdef WAIT_USE_SELF_PIPE
2359 : elog(ERROR, "unexpected EOF on self-pipe");
2360 : #else
2361 0 : elog(ERROR, "unexpected EOF on signalfd");
2362 : #endif
2363 : }
2364 873428 : else if (rc < sizeof(buf))
2365 : {
2366 : /* we successfully drained the pipe; no need to read() again */
2367 873428 : break;
2368 : }
2369 : /* else buffer wasn't big enough, so read again */
2370 : }
2371 873428 : }
2372 :
2373 : #endif
2374 :
2375 : static void
2376 2 : ResOwnerReleaseWaitEventSet(Datum res)
2377 : {
2378 2 : WaitEventSet *set = (WaitEventSet *) DatumGetPointer(res);
2379 :
2380 : Assert(set->owner != NULL);
2381 2 : set->owner = NULL;
2382 2 : FreeWaitEventSet(set);
2383 2 : }
|