1 : /*-------------------------------------------------------------------------
2 : *
3 : * latch.c
4 : * Routines for inter-process latches
5 : *
6 : * The poll() implementation uses the so-called self-pipe trick to overcome the
7 : * race condition involved with poll() and setting a global flag in the signal
8 : * handler. When a latch is set and the current process is waiting for it, the
9 : * signal handler wakes up the poll() in WaitLatch by writing a byte to a pipe.
10 : * A signal by itself doesn't interrupt poll() on all platforms, and even on
11 : * platforms where it does, a signal that arrives just before the poll() call
12 : * does not prevent poll() from entering sleep. An incoming byte on a pipe,
13 : * however, reliably interrupts the sleep and causes poll() to return
14 : * immediately even if the signal arrives before poll() begins.
15 : *
16 : * The epoll() implementation overcomes the race with a different technique: it
17 : * keeps SIGURG blocked and consumes from a signalfd() descriptor instead. We
18 : * don't need to register a signal handler or create our own self-pipe. We
19 : * assume that any system that has Linux epoll() also has Linux signalfd().
20 : *
21 : * The kqueue() implementation waits for SIGURG with EVFILT_SIGNAL.
22 : *
23 : * The Windows implementation uses Windows events that are inherited by all
24 : * postmaster child processes. There's no need for the self-pipe trick there.
25 : *
26 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
27 : * Portions Copyright (c) 1994, Regents of the University of California
28 : *
29 : * IDENTIFICATION
30 : * src/backend/storage/ipc/latch.c
31 : *
32 : *-------------------------------------------------------------------------
33 : */
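/*
 * A minimal, self-contained sketch of the self-pipe trick described above,
 * independent of the rest of this file.  It assumes pipefds[] was created
 * with pipe(), both ends were set O_NONBLOCK, and handler() was installed
 * with sigaction() for the relevant signal.
 */
#include <errno.h>
#include <poll.h>
#include <signal.h>
#include <unistd.h>

static int pipefds[2];
static volatile sig_atomic_t flag = 0;

static void
handler(int signo)
{
    int         save_errno = errno;

    (void) signo;
    flag = 1;
    /* Makes poll() return even if the signal beat us into the syscall. */
    (void) write(pipefds[1], "x", 1);
    errno = save_errno;
}

static void
wait_for_flag(void)
{
    while (!flag)
    {
        struct pollfd pfd = {.fd = pipefds[0], .events = POLLIN};
        char        buf[16];

        /*
         * If the signal arrived after the flag check above, the handler's
         * byte is already in the pipe, so poll() won't sleep.
         */
        (void) poll(&pfd, 1, -1 /* no timeout */ );
        while (read(pipefds[0], buf, sizeof(buf)) > 0)
            ;                   /* drain the non-blocking read end */
    }
}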
34 : #include "postgres.h"
35 :
36 : #include <fcntl.h>
37 : #include <limits.h>
38 : #include <signal.h>
39 : #include <unistd.h>
40 : #ifdef HAVE_SYS_EPOLL_H
41 : #include <sys/epoll.h>
42 : #endif
43 : #ifdef HAVE_SYS_EVENT_H
44 : #include <sys/event.h>
45 : #endif
46 : #ifdef HAVE_SYS_SIGNALFD_H
47 : #include <sys/signalfd.h>
48 : #endif
49 : #ifdef HAVE_POLL_H
50 : #include <poll.h>
51 : #endif
52 :
53 : #include "libpq/pqsignal.h"
54 : #include "miscadmin.h"
55 : #include "pgstat.h"
56 : #include "port/atomics.h"
57 : #include "portability/instr_time.h"
58 : #include "postmaster/postmaster.h"
59 : #include "storage/fd.h"
60 : #include "storage/ipc.h"
61 : #include "storage/latch.h"
62 : #include "storage/pmsignal.h"
63 : #include "utils/memutils.h"
64 : #include "utils/resowner.h"
65 :
66 : /*
67 : * Select the fd readiness primitive to use. Normally the "most modern"
68 : * primitive supported by the OS will be used, but for testing it can be
69 : * useful to specify the primitive manually. If desired, just add a
70 : * define somewhere before this block.
71 : */
72 : #if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \
73 : defined(WAIT_USE_KQUEUE) || defined(WAIT_USE_WIN32)
74 : /* don't overwrite manual choice */
75 : #elif defined(HAVE_SYS_EPOLL_H)
76 : #define WAIT_USE_EPOLL
77 : #elif defined(HAVE_KQUEUE)
78 : #define WAIT_USE_KQUEUE
79 : #elif defined(HAVE_POLL)
80 : #define WAIT_USE_POLL
81 : #elif WIN32
82 : #define WAIT_USE_WIN32
83 : #else
84 : #error "no wait set implementation available"
85 : #endif
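/*
 * For example, to exercise the poll() implementation on a system that would
 * otherwise select epoll, a manual define can be placed above this block:
 *
 *     #define WAIT_USE_POLL
 *
 * The first branch of the #if chain then leaves that manual choice alone.
 */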
86 :
87 : /*
88 : * By default, we use a self-pipe with poll() and a signalfd with epoll(), if
89 : * available. For testing the choice can also be manually specified.
90 : */
91 : #if defined(WAIT_USE_POLL) || defined(WAIT_USE_EPOLL)
92 : #if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)
93 : /* don't overwrite manual choice */
94 : #elif defined(WAIT_USE_EPOLL) && defined(HAVE_SYS_SIGNALFD_H)
95 : #define WAIT_USE_SIGNALFD
96 : #else
97 : #define WAIT_USE_SELF_PIPE
98 : #endif
99 : #endif
100 :
101 : /* typedef in latch.h */
102 : struct WaitEventSet
103 : {
104 : ResourceOwner owner;
105 :
106 : int nevents; /* number of registered events */
107 : int nevents_space; /* maximum number of events in this set */
108 :
109 : /*
110 : * Array, of nevents_space length, storing the definition of events this
111 : * set is waiting for.
112 : */
113 : WaitEvent *events;
114 :
115 : /*
116 : * If WL_LATCH_SET is specified in any wait event, latch is a pointer to
117 : * said latch, and latch_pos the offset in the ->events array. This is
118 : * useful because we check the state of the latch before performing
119 : * syscalls related to waiting.
120 : */
121 : Latch *latch;
122 : int latch_pos;
123 :
124 : /*
125 : * WL_EXIT_ON_PM_DEATH is converted to WL_POSTMASTER_DEATH, but this flag
126 : * is set so that we'll exit immediately if postmaster death is detected,
127 : * instead of returning.
128 : */
129 : bool exit_on_postmaster_death;
130 :
131 : #if defined(WAIT_USE_EPOLL)
132 : int epoll_fd;
133 : /* epoll_wait returns events in a user-provided array; allocate it once */
134 : struct epoll_event *epoll_ret_events;
135 : #elif defined(WAIT_USE_KQUEUE)
136 : int kqueue_fd;
137 : /* kevent returns events in a user-provided array; allocate it once */
138 : struct kevent *kqueue_ret_events;
139 : bool report_postmaster_not_running;
140 : #elif defined(WAIT_USE_POLL)
141 : /* poll() is passed the full pollfd array on each call; prepare it once */
142 : struct pollfd *pollfds;
143 : #elif defined(WAIT_USE_WIN32)
144 :
145 : /*
146 : * Array of windows events. The first element always contains
147 : * pgwin32_signal_event, so the remaining elements are offset by one (i.e.
148 : * event->pos + 1).
149 : */
150 : HANDLE *handles;
151 : #endif
152 : };
153 :
154 : /* A common WaitEventSet used to implement WaitLatch() */
155 : static WaitEventSet *LatchWaitSet;
156 :
157 : /* The position of the latch in LatchWaitSet. */
158 : #define LatchWaitSetLatchPos 0
159 :
160 : #ifndef WIN32
161 : /* Are we currently in WaitLatch? The signal handler would like to know. */
162 : static volatile sig_atomic_t waiting = false;
163 : #endif
164 :
165 : #ifdef WAIT_USE_SIGNALFD
166 : /* On Linux, we'll receive SIGURG via a signalfd file descriptor. */
167 : static int signal_fd = -1;
168 : #endif
169 :
170 : #ifdef WAIT_USE_SELF_PIPE
171 : /* Read and write ends of the self-pipe */
172 : static int selfpipe_readfd = -1;
173 : static int selfpipe_writefd = -1;
174 :
175 : /* Process owning the self-pipe --- needed for checking purposes */
176 : static int selfpipe_owner_pid = 0;
177 :
178 : /* Private function prototypes */
179 : static void latch_sigurg_handler(SIGNAL_ARGS);
180 : static void sendSelfPipeByte(void);
181 : #endif
182 :
183 : #if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)
184 : static void drain(void);
185 : #endif
186 :
187 : #if defined(WAIT_USE_EPOLL)
188 : static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action);
189 : #elif defined(WAIT_USE_KQUEUE)
190 : static void WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events);
191 : #elif defined(WAIT_USE_POLL)
192 : static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event);
193 : #elif defined(WAIT_USE_WIN32)
194 : static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event);
195 : #endif
196 :
197 : static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
198 : WaitEvent *occurred_events, int nevents);
199 :
200 : /* ResourceOwner support to hold WaitEventSets */
201 : static void ResOwnerReleaseWaitEventSet(Datum res);
202 :
203 : static const ResourceOwnerDesc wait_event_set_resowner_desc =
204 : {
205 : .name = "WaitEventSet",
206 : .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
207 : .release_priority = RELEASE_PRIO_WAITEVENTSETS,
208 : .ReleaseResource = ResOwnerReleaseWaitEventSet,
209 : .DebugPrint = NULL
210 : };
211 :
212 : /* Convenience wrappers over ResourceOwnerRemember/Forget */
213 : static inline void
214 87398 : ResourceOwnerRememberWaitEventSet(ResourceOwner owner, WaitEventSet *set)
215 : {
216 87398 : ResourceOwnerRemember(owner, PointerGetDatum(set), &wait_event_set_resowner_desc);
217 87398 : }
218 : static inline void
219 87396 : ResourceOwnerForgetWaitEventSet(ResourceOwner owner, WaitEventSet *set)
220 : {
221 87396 : ResourceOwnerForget(owner, PointerGetDatum(set), &wait_event_set_resowner_desc);
222 87396 : }
223 :
224 :
225 : /*
226 : * Initialize the process-local latch infrastructure.
227 : *
228 : * This must be called once during startup of any process that can wait on
229 : * latches, before it issues any InitLatch() or OwnLatch() calls.
230 : */
231 : void
232 39388 : InitializeLatchSupport(void)
233 : {
234 : #if defined(WAIT_USE_SELF_PIPE)
235 : int pipefd[2];
236 :
237 : if (IsUnderPostmaster)
238 : {
239 : /*
240 : * We might have inherited connections to a self-pipe created by the
241 : * postmaster. It's critical that child processes create their own
242 : * self-pipes, of course, and we really want them to close the
243 : * inherited FDs for safety's sake.
244 : */
245 : if (selfpipe_owner_pid != 0)
246 : {
247 : /* Assert we go through here but once in a child process */
248 : Assert(selfpipe_owner_pid != MyProcPid);
249 : /* Release postmaster's pipe FDs; ignore any error */
250 : (void) close(selfpipe_readfd);
251 : (void) close(selfpipe_writefd);
252 : /* Clean up, just for safety's sake; we'll set these below */
253 : selfpipe_readfd = selfpipe_writefd = -1;
254 : selfpipe_owner_pid = 0;
255 : /* Keep fd.c's accounting straight */
256 : ReleaseExternalFD();
257 : ReleaseExternalFD();
258 : }
259 : else
260 : {
261 : /*
262 : * Postmaster didn't create a self-pipe ... or else we're in an
263 : * EXEC_BACKEND build, in which case it doesn't matter since the
264 : * postmaster's pipe FDs were closed by the action of FD_CLOEXEC.
265 : * fd.c won't have state to clean up, either.
266 : */
267 : Assert(selfpipe_readfd == -1);
268 : }
269 : }
270 : else
271 : {
272 : /* In postmaster or standalone backend, assert we do this but once */
273 : Assert(selfpipe_readfd == -1);
274 : Assert(selfpipe_owner_pid == 0);
275 : }
276 :
277 : /*
278 : * Set up the self-pipe that allows a signal handler to wake up the
279 : * poll()/epoll_wait() in WaitLatch. Make the write-end non-blocking, so
280 : * that SetLatch won't block if the event has already been set many times,
281 : * filling the kernel buffer. Make the read-end non-blocking too, so that
282 : * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK.
283 : * Also, make both FDs close-on-exec, since we surely do not want any
284 : * child processes messing with them.
285 : */
286 : if (pipe(pipefd) < 0)
287 : elog(FATAL, "pipe() failed: %m");
288 : if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1)
289 : elog(FATAL, "fcntl(F_SETFL) failed on read-end of self-pipe: %m");
290 : if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1)
291 : elog(FATAL, "fcntl(F_SETFL) failed on write-end of self-pipe: %m");
292 : if (fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) == -1)
293 : elog(FATAL, "fcntl(F_SETFD) failed on read-end of self-pipe: %m");
294 : if (fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) == -1)
295 : elog(FATAL, "fcntl(F_SETFD) failed on write-end of self-pipe: %m");
296 :
297 : selfpipe_readfd = pipefd[0];
298 : selfpipe_writefd = pipefd[1];
299 : selfpipe_owner_pid = MyProcPid;
300 :
301 : /* Tell fd.c about these two long-lived FDs */
302 : ReserveExternalFD();
303 : ReserveExternalFD();
304 :
305 : pqsignal(SIGURG, latch_sigurg_handler);
306 : #endif
307 :
308 : #ifdef WAIT_USE_SIGNALFD
309 : sigset_t signalfd_mask;
310 :
311 39388 : if (IsUnderPostmaster)
312 : {
313 : /*
314 : * It would probably be safe to re-use the inherited signalfd since
315 : * signalfds only see the current process's pending signals, but it
316 : * seems less surprising to close it and create our own.
317 : */
318 37406 : if (signal_fd != -1)
319 : {
320 : /* Release postmaster's signal FD; ignore any error */
321 37406 : (void) close(signal_fd);
322 37406 : signal_fd = -1;
323 37406 : ReleaseExternalFD();
324 : }
325 : }
326 :
327 : /* Block SIGURG, because we'll receive it through a signalfd. */
328 39388 : sigaddset(&UnBlockSig, SIGURG);
329 :
330 : /* Set up the signalfd to receive SIGURG notifications. */
331 39388 : sigemptyset(&signalfd_mask);
332 39388 : sigaddset(&signalfd_mask, SIGURG);
333 39388 : signal_fd = signalfd(-1, &signalfd_mask, SFD_NONBLOCK | SFD_CLOEXEC);
334 39388 : if (signal_fd < 0)
335 0 : elog(FATAL, "signalfd() failed");
336 39388 : ReserveExternalFD();
337 : #endif
338 :
339 : #ifdef WAIT_USE_KQUEUE
340 : /* Ignore SIGURG, because we'll receive it via kqueue. */
341 : pqsignal(SIGURG, SIG_IGN);
342 : #endif
343 39388 : }
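/*
 * The drain() helper declared above is defined later in latch.c.  A minimal
 * sketch of the shape such a loop takes: read from the wakeup fd (self-pipe
 * read end or signalfd) until the kernel reports it empty, which the
 * non-blocking setup above makes possible.  Illustrative only.
 */
static void
drain_sketch(int wakeup_fd)
{
    char        buf[1024];

    for (;;)
    {
        ssize_t     rc = read(wakeup_fd, buf, sizeof(buf));

        if (rc < 0)
        {
            if (errno == EAGAIN || errno == EWOULDBLOCK)
                break;          /* fd is drained */
            if (errno == EINTR)
                continue;       /* interrupted; retry */
            break;              /* real error; the real code would elog() */
        }
        if (rc == 0)
            break;              /* EOF: shouldn't happen, but be defensive */
    }
}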
344 :
345 : void
346 37838 : InitializeLatchWaitSet(void)
347 : {
348 : int latch_pos PG_USED_FOR_ASSERTS_ONLY;
349 :
350 : Assert(LatchWaitSet == NULL);
351 :
352 : /* Set up the WaitEventSet used by WaitLatch(). */
353 37838 : LatchWaitSet = CreateWaitEventSet(NULL, 2);
354 37838 : latch_pos = AddWaitEventToSet(LatchWaitSet, WL_LATCH_SET, PGINVALID_SOCKET,
355 : MyLatch, NULL);
356 37838 : if (IsUnderPostmaster)
357 37406 : AddWaitEventToSet(LatchWaitSet, WL_EXIT_ON_PM_DEATH,
358 : PGINVALID_SOCKET, NULL, NULL);
359 :
360 : Assert(latch_pos == LatchWaitSetLatchPos);
361 37838 : }
362 :
363 : void
364 0 : ShutdownLatchSupport(void)
365 : {
366 : #if defined(WAIT_USE_POLL)
367 : pqsignal(SIGURG, SIG_IGN);
368 : #endif
369 :
370 0 : if (LatchWaitSet)
371 : {
372 0 : FreeWaitEventSet(LatchWaitSet);
373 0 : LatchWaitSet = NULL;
374 : }
375 :
376 : #if defined(WAIT_USE_SELF_PIPE)
377 : close(selfpipe_readfd);
378 : close(selfpipe_writefd);
379 : selfpipe_readfd = -1;
380 : selfpipe_writefd = -1;
381 : selfpipe_owner_pid = InvalidPid;
382 : #endif
383 :
384 : #if defined(WAIT_USE_SIGNALFD)
385 0 : close(signal_fd);
386 0 : signal_fd = -1;
387 : #endif
388 0 : }
389 :
390 : /*
391 : * Initialize a process-local latch.
392 : */
393 : void
394 39388 : InitLatch(Latch *latch)
395 : {
396 39388 : latch->is_set = false;
397 39388 : latch->maybe_sleeping = false;
398 39388 : latch->owner_pid = MyProcPid;
399 39388 : latch->is_shared = false;
400 :
401 : #if defined(WAIT_USE_SELF_PIPE)
402 : /* Assert InitializeLatchSupport has been called in this process */
403 : Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
404 : #elif defined(WAIT_USE_SIGNALFD)
405 : /* Assert InitializeLatchSupport has been called in this process */
406 : Assert(signal_fd >= 0);
407 : #elif defined(WAIT_USE_WIN32)
408 : latch->event = CreateEvent(NULL, TRUE, FALSE, NULL);
409 : if (latch->event == NULL)
410 : elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
411 : #endif /* WIN32 */
412 39388 : }
413 :
414 : /*
415 : * Initialize a shared latch that can be set from other processes. The latch
416 : * is initially owned by no-one; use OwnLatch to associate it with the
417 : * current process.
418 : *
419 : * InitSharedLatch needs to be called in postmaster before forking child
420 : * processes, usually right after allocating the shared memory block
421 : * containing the latch with ShmemInitStruct. (The Unix implementation
422 : * doesn't actually require that, but the Windows one does.) Because of
423 : * this restriction, we have no concurrency issues to worry about here.
424 : *
425 : * Note that other handles created in this module are never marked as
426 : * inheritable. Thus we do not need to worry about cleaning up child
427 : * process references to postmaster-private latches or WaitEventSets.
428 : */
429 : void
430 190142 : InitSharedLatch(Latch *latch)
431 : {
432 : #ifdef WIN32
433 : SECURITY_ATTRIBUTES sa;
434 :
435 : /*
436 : * Set up security attributes to specify that the events are inherited.
437 : */
438 : ZeroMemory(&sa, sizeof(sa));
439 : sa.nLength = sizeof(sa);
440 : sa.bInheritHandle = TRUE;
441 :
442 : latch->event = CreateEvent(&sa, TRUE, FALSE, NULL);
443 : if (latch->event == NULL)
444 : elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
445 : #endif
446 :
447 190142 : latch->is_set = false;
448 190142 : latch->maybe_sleeping = false;
449 190142 : latch->owner_pid = 0;
450 190142 : latch->is_shared = true;
451 190142 : }
452 :
453 : /*
454 : * Associate a shared latch with the current process, allowing it to
455 : * wait on the latch.
456 : *
457 : * Although there is a sanity check for latch-already-owned, we don't do
458 : * any sort of locking here, meaning that we could fail to detect the error
459 : * if two processes try to own the same latch at about the same time. If
460 : * there is any risk of that, caller must provide an interlock to prevent it.
461 : */
462 : void
463 37488 : OwnLatch(Latch *latch)
464 : {
465 : int owner_pid;
466 :
467 : /* Sanity checks */
468 : Assert(latch->is_shared);
469 :
470 : #if defined(WAIT_USE_SELF_PIPE)
471 : /* Assert InitializeLatchSupport has been called in this process */
472 : Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
473 : #elif defined(WAIT_USE_SIGNALFD)
474 : /* Assert InitializeLatchSupport has been called in this process */
475 : Assert(signal_fd >= 0);
476 : #endif
477 :
478 37488 : owner_pid = latch->owner_pid;
479 37488 : if (owner_pid != 0)
480 0 : elog(PANIC, "latch already owned by PID %d", owner_pid);
481 :
482 37488 : latch->owner_pid = MyProcPid;
483 37488 : }
484 :
485 : /*
486 : * Disown a shared latch currently owned by the current process.
487 : */
488 : void
489 37382 : DisownLatch(Latch *latch)
490 : {
491 : Assert(latch->is_shared);
492 : Assert(latch->owner_pid == MyProcPid);
493 :
494 37382 : latch->owner_pid = 0;
495 37382 : }
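/*
 * A condensed sketch of the shared-latch lifecycle, with a hypothetical
 * shmem_state pointer standing in for a real shared-memory struct:
 *
 *   In the postmaster, before forking:
 *       InitSharedLatch(&shmem_state->latch);
 *
 *   In the child process that will sleep on it:
 *       InitializeLatchSupport();
 *       OwnLatch(&shmem_state->latch);
 *       ... WaitLatch() / ResetLatch() loop ...
 *       DisownLatch(&shmem_state->latch);  (e.g. from an exit callback)
 *
 *   In any process that wants to wake the owner:
 *       SetLatch(&shmem_state->latch);
 */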
496 :
497 : /*
498 : * Wait for a given latch to be set, or for postmaster death, or until timeout
499 : * is exceeded. 'wakeEvents' is a bitmask that specifies which of those events
500 : * to wait for. If the latch is already set (and WL_LATCH_SET is given), the
501 : * function returns immediately.
502 : *
503 : * The "timeout" is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag
504 : * is given. Although it is declared as "long", we don't actually support
505 : * timeouts longer than INT_MAX milliseconds. Note that some extra overhead
506 : * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible.
507 : *
508 : * The latch must be owned by the current process, ie. it must be a
509 : * process-local latch initialized with InitLatch, or a shared latch
510 : * associated with the current process by calling OwnLatch.
511 : *
512 : * Returns bit mask indicating which condition(s) caused the wake-up. Note
513 : * that if multiple wake-up conditions are true, there is no guarantee that
514 : * we return all of them in one call, but we will return at least one.
515 : */
516 : int
517 606598 : WaitLatch(Latch *latch, int wakeEvents, long timeout,
518 : uint32 wait_event_info)
519 : {
520 : WaitEvent event;
521 :
522 : /* Postmaster-managed callers must handle postmaster death somehow. */
523 : Assert(!IsUnderPostmaster ||
524 : (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
525 : (wakeEvents & WL_POSTMASTER_DEATH));
526 :
527 : /*
528 : * Some callers may have a latch other than MyLatch, or no latch at all,
529 : * or want to handle postmaster death differently. It's cheap to assign
530 : * those, so just do it every time.
531 : */
532 606598 : if (!(wakeEvents & WL_LATCH_SET))
533 274 : latch = NULL;
534 606598 : ModifyWaitEvent(LatchWaitSet, LatchWaitSetLatchPos, WL_LATCH_SET, latch);
535 606598 : LatchWaitSet->exit_on_postmaster_death =
536 606598 : ((wakeEvents & WL_EXIT_ON_PM_DEATH) != 0);
537 :
538 606598 : if (WaitEventSetWait(LatchWaitSet,
539 606598 : (wakeEvents & WL_TIMEOUT) ? timeout : -1,
540 : &event, 1,
541 : wait_event_info) == 0)
542 42784 : return WL_TIMEOUT;
543 : else
544 563784 : return event.events;
545 : }
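/*
 * A sketch of the typical caller structure, following the convention noted
 * in SetLatch()'s comments: reset at the top of the loop, wait at the
 * bottom.  work_to_do(), do_work() and WAIT_EVENT_EXAMPLE are hypothetical
 * stand-ins for a real caller's logic and wait_event_info.
 */
static void
latch_loop_sketch(void)
{
    for (;;)
    {
        ResetLatch(MyLatch);

        if (work_to_do())       /* hypothetical */
            do_work();          /* hypothetical */

        (void) WaitLatch(MyLatch,
                         WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
                         -1,    /* block without timeout */
                         WAIT_EVENT_EXAMPLE);
    }
}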
546 :
547 : /*
548 : * Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
549 : * conditions.
550 : *
551 : * When waiting on a socket, EOF and error conditions always cause the socket
552 : * to be reported as readable/writable/connected, so that the caller can deal
553 : * with the condition.
554 : *
555 : * wakeEvents must include either WL_EXIT_ON_PM_DEATH for automatic exit
556 : * if the postmaster dies or WL_POSTMASTER_DEATH for a flag set in the
557 : * return value if the postmaster dies. The latter is useful for rare cases
558 : * where some behavior other than immediate exit is needed.
559 : *
560 : * NB: These days this is just a wrapper around the WaitEventSet API. When
561 : * using a latch very frequently, consider creating a longer living
562 : * WaitEventSet instead; that's more efficient.
563 : */
564 : int
565 121932 : WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock,
566 : long timeout, uint32 wait_event_info)
567 : {
568 121932 : int ret = 0;
569 : int rc;
570 : WaitEvent event;
571 121932 : WaitEventSet *set = CreateWaitEventSet(CurrentResourceOwner, 3);
572 :
573 121932 : if (wakeEvents & WL_TIMEOUT)
574 : Assert(timeout >= 0);
575 : else
576 27738 : timeout = -1;
577 :
578 121932 : if (wakeEvents & WL_LATCH_SET)
579 121530 : AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
580 : latch, NULL);
581 :
582 : /* Postmaster-managed callers must handle postmaster death somehow. */
583 : Assert(!IsUnderPostmaster ||
584 : (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
585 : (wakeEvents & WL_POSTMASTER_DEATH));
586 :
587 121932 : if ((wakeEvents & WL_POSTMASTER_DEATH) && IsUnderPostmaster)
588 0 : AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
589 : NULL, NULL);
590 :
591 121932 : if ((wakeEvents & WL_EXIT_ON_PM_DEATH) && IsUnderPostmaster)
592 121932 : AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
593 : NULL, NULL);
594 :
595 121932 : if (wakeEvents & WL_SOCKET_MASK)
596 : {
597 : int ev;
598 :
599 121932 : ev = wakeEvents & WL_SOCKET_MASK;
600 121932 : AddWaitEventToSet(set, ev, sock, NULL, NULL);
601 : }
602 :
603 121932 : rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
604 :
605 121932 : if (rc == 0)
606 614 : ret |= WL_TIMEOUT;
607 : else
608 : {
609 121318 : ret |= event.events & (WL_LATCH_SET |
610 : WL_POSTMASTER_DEATH |
611 : WL_SOCKET_MASK);
612 : }
613 :
614 121932 : FreeWaitEventSet(set);
615 :
616 121932 : return ret;
617 : }
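/*
 * Per the NB in the comment above, a caller that waits on the same
 * latch/socket combination very frequently can amortize the setup cost with
 * a long-lived set.  A sketch, where my_sock and WAIT_EVENT_EXAMPLE are
 * hypothetical stand-ins:
 */
static void
long_lived_set_sketch(pgsocket my_sock)
{
    WaitEventSet *set = CreateWaitEventSet(NULL, 3);    /* session lifetime */
    WaitEvent   event;

    AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
    AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, NULL, NULL);
    AddWaitEventToSet(set, WL_SOCKET_READABLE, my_sock, NULL, NULL);

    for (;;)
    {
        (void) WaitEventSetWait(set, -1, &event, 1, WAIT_EVENT_EXAMPLE);

        if (event.events & WL_LATCH_SET)
            ResetLatch(MyLatch);
        else if (event.events & WL_SOCKET_READABLE)
        {
            /* ... read from my_sock, then loop ... */
        }
    }
}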
618 :
619 : /*
620 : * Sets a latch and wakes up anyone waiting on it.
621 : *
622 : * This is cheap if the latch is already set, otherwise not so much.
623 : *
624 : * NB: when calling this in a signal handler, be sure to save and restore
625 : * errno around it. (That's standard practice in most signal handlers, of
626 : * course, but we used to omit it in handlers that only set a flag.)
627 : *
628 : * NB: this function is called from critical sections and signal handlers so
629 : * throwing an error is not a good idea.
630 : */
631 : void
632 1263372 : SetLatch(Latch *latch)
633 : {
634 : #ifndef WIN32
635 : pid_t owner_pid;
636 : #else
637 : HANDLE handle;
638 : #endif
639 :
640 : /*
641 : * The memory barrier has to be placed here to ensure that any flag
642 : * variables possibly changed by this process have been flushed to main
643 : * memory, before we check/set is_set.
644 : */
645 1263372 : pg_memory_barrier();
646 :
647 : /* Quick exit if already set */
648 1263372 : if (latch->is_set)
649 326344 : return;
650 :
651 937028 : latch->is_set = true;
652 :
653 937028 : pg_memory_barrier();
654 937028 : if (!latch->maybe_sleeping)
655 216288 : return;
656 :
657 : #ifndef WIN32
658 :
659 : /*
660 : * See if anyone's waiting for the latch. It can be the current process if
661 : * we're in a signal handler. In that case we write to the self-pipe, or
662 : * send ourselves SIGURG, to wake up WaitEventSetWaitBlock() without
663 : * races. If it's another process, send a signal.
664 : *
665 : * Fetch owner_pid only once, in case the latch is concurrently getting
666 : * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't
667 : * guaranteed to be true! In practice, the effective range of pid_t fits
668 : * in a 32 bit integer, and so should be atomic. In the worst case, we
669 : * might end up signaling the wrong process. Even then, you're very
670 : * unlucky if a process with that bogus pid exists and belongs to
671 : * Postgres; and PG database processes should handle excess SIGURG
672 : * interrupts without a problem anyhow.
673 : *
674 : * Another sort of race condition that's possible here is for a new
675 : * process to own the latch immediately after we look, so we don't signal
676 : * it. This is okay so long as all callers of ResetLatch/WaitLatch follow
677 : * the standard coding convention of waiting at the bottom of their loops,
678 : * not the top, so that they'll correctly process latch-setting events
679 : * that happen before they enter the loop.
680 : */
681 720740 : owner_pid = latch->owner_pid;
682 720740 : if (owner_pid == 0)
683 0 : return;
684 720740 : else if (owner_pid == MyProcPid)
685 : {
686 : #if defined(WAIT_USE_SELF_PIPE)
687 : if (waiting)
688 : sendSelfPipeByte();
689 : #else
690 233058 : if (waiting)
691 233058 : kill(MyProcPid, SIGURG);
692 : #endif
693 : }
694 : else
695 487682 : kill(owner_pid, SIGURG);
696 :
697 : #else
698 :
699 : /*
700 : * See if anyone's waiting for the latch. It can be the current process if
701 : * we're in a signal handler.
702 : *
703 : * Use a local variable here just in case somebody changes the event field
704 : * concurrently (which really should not happen).
705 : */
706 : handle = latch->event;
707 : if (handle)
708 : {
709 : SetEvent(handle);
710 :
711 : /*
712 : * Note that we silently ignore any errors. We might be in a signal
713 : * handler or other critical path where it's not safe to call elog().
714 : */
715 : }
716 : #endif
717 : }
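/*
 * The errno discipline from the NB above, in a sketch of a typical signal
 * handler; got_SIGHUP is a hypothetical flag the main loop checks after
 * waking:
 */
static volatile sig_atomic_t got_SIGHUP = false;

static void
sighup_handler_sketch(SIGNAL_ARGS)
{
    int         save_errno = errno;

    got_SIGHUP = true;
    SetLatch(MyLatch);          /* wake up WaitLatch() in the main loop */

    errno = save_errno;
}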
718 :
719 : /*
720 : * Clear the latch. Calling WaitLatch after this will sleep, unless
721 : * the latch is set again before the WaitLatch call.
722 : */
723 : void
724 2899732 : ResetLatch(Latch *latch)
725 : {
726 : /* Only the owner should reset the latch */
727 : Assert(latch->owner_pid == MyProcPid);
728 : Assert(latch->maybe_sleeping == false);
729 :
730 2899732 : latch->is_set = false;
731 :
732 : /*
733 : * Ensure that the write to is_set gets flushed to main memory before we
734 : * examine any flag variables. Otherwise a concurrent SetLatch might
735 : * falsely conclude that it needn't signal us, even though we have missed
736 : * seeing some flag updates that SetLatch was supposed to inform us of.
737 : */
738 2899732 : pg_memory_barrier();
739 2899732 : }
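/*
 * How this barrier pairs with the ones in SetLatch(), sketched:
 *
 *   waiter (ResetLatch + loop)           setter (SetLatch)
 *   --------------------------           -----------------
 *   is_set = false                       <update shared flag variables>
 *   pg_memory_barrier()                  pg_memory_barrier()
 *   <read shared flag variables>         if (!is_set) { is_set = true; ... }
 *
 * If the waiter misses a flag update, its read happened before the setter's
 * write; by the barriers, its is_set = false store was then visible to the
 * setter's is_set check, so the setter sets the latch (and wakes us if
 * needed).  The update therefore cannot be lost on both sides at once.
 */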
740 :
741 : /*
742 : * Create a WaitEventSet with space for nevents different events to wait for.
743 : *
744 : * These events can then be efficiently waited upon together, using
745 : * WaitEventSetWait().
746 : *
747 : * The WaitEventSet is tracked by the given 'resowner'. Use NULL for session
748 : * lifetime.
749 : */
750 : WaitEventSet *
751 188716 : CreateWaitEventSet(ResourceOwner resowner, int nevents)
752 : {
753 : WaitEventSet *set;
754 : char *data;
755 188716 : Size sz = 0;
756 :
757 : /*
758 : * Use MAXALIGN size/alignment to guarantee that later uses of memory are
759 : * aligned correctly. E.g. epoll_event might need 8 byte alignment on some
760 : * platforms, but earlier allocations like WaitEventSet and WaitEvent
761 : * might not be sized to guarantee that when purely using sizeof().
762 : */
763 188716 : sz += MAXALIGN(sizeof(WaitEventSet));
764 188716 : sz += MAXALIGN(sizeof(WaitEvent) * nevents);
765 :
766 : #if defined(WAIT_USE_EPOLL)
767 188716 : sz += MAXALIGN(sizeof(struct epoll_event) * nevents);
768 : #elif defined(WAIT_USE_KQUEUE)
769 : sz += MAXALIGN(sizeof(struct kevent) * nevents);
770 : #elif defined(WAIT_USE_POLL)
771 : sz += MAXALIGN(sizeof(struct pollfd) * nevents);
772 : #elif defined(WAIT_USE_WIN32)
773 : /* need space for the pgwin32_signal_event */
774 : sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1));
775 : #endif
776 :
777 188716 : if (resowner != NULL)
778 87398 : ResourceOwnerEnlarge(resowner);
779 :
780 188716 : data = (char *) MemoryContextAllocZero(TopMemoryContext, sz);
781 :
782 188716 : set = (WaitEventSet *) data;
783 188716 : data += MAXALIGN(sizeof(WaitEventSet));
784 :
785 188716 : set->events = (WaitEvent *) data;
786 188716 : data += MAXALIGN(sizeof(WaitEvent) * nevents);
787 :
788 : #if defined(WAIT_USE_EPOLL)
789 188716 : set->epoll_ret_events = (struct epoll_event *) data;
790 188716 : data += MAXALIGN(sizeof(struct epoll_event) * nevents);
791 : #elif defined(WAIT_USE_KQUEUE)
792 : set->kqueue_ret_events = (struct kevent *) data;
793 : data += MAXALIGN(sizeof(struct kevent) * nevents);
794 : #elif defined(WAIT_USE_POLL)
795 : set->pollfds = (struct pollfd *) data;
796 : data += MAXALIGN(sizeof(struct pollfd) * nevents);
797 : #elif defined(WAIT_USE_WIN32)
798 : set->handles = (HANDLE *) data;
799 : data += MAXALIGN(sizeof(HANDLE) * nevents);
800 : #endif
801 :
802 188716 : set->latch = NULL;
803 188716 : set->nevents_space = nevents;
804 188716 : set->exit_on_postmaster_death = false;
805 :
806 188716 : if (resowner != NULL)
807 : {
808 87398 : ResourceOwnerRememberWaitEventSet(resowner, set);
809 87398 : set->owner = resowner;
810 : }
811 :
812 : #if defined(WAIT_USE_EPOLL)
813 188716 : if (!AcquireExternalFD())
814 0 : elog(ERROR, "AcquireExternalFD, for epoll_create1, failed: %m");
815 188716 : set->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
816 188716 : if (set->epoll_fd < 0)
817 : {
818 0 : ReleaseExternalFD();
819 0 : elog(ERROR, "epoll_create1 failed: %m");
820 : }
821 : #elif defined(WAIT_USE_KQUEUE)
822 : if (!AcquireExternalFD())
823 : elog(ERROR, "AcquireExternalFD, for kqueue, failed: %m");
824 : set->kqueue_fd = kqueue();
825 : if (set->kqueue_fd < 0)
826 : {
827 : ReleaseExternalFD();
828 : elog(ERROR, "kqueue failed: %m");
829 : }
830 : if (fcntl(set->kqueue_fd, F_SETFD, FD_CLOEXEC) == -1)
831 : {
832 : int save_errno = errno;
833 :
834 : close(set->kqueue_fd);
835 : ReleaseExternalFD();
836 : errno = save_errno;
837 : elog(ERROR, "fcntl(F_SETFD) failed on kqueue descriptor: %m");
838 : }
839 : set->report_postmaster_not_running = false;
840 : #elif defined(WAIT_USE_WIN32)
841 :
842 : /*
843 : * To handle signals while waiting, we need to add a win32 specific event.
844 : * We accounted for the additional event at the top of this routine. See
845 : * port/win32/signal.c for more details.
846 : *
847 : * Note: pgwin32_signal_event should be first to ensure that it will be
848 : * reported when multiple events are set. We want to guarantee that
849 : * pending signals are serviced.
850 : */
851 : set->handles[0] = pgwin32_signal_event;
852 : StaticAssertStmt(WSA_INVALID_EVENT == NULL, "");
853 : #endif
854 :
855 188716 : return set;
856 : }
857 :
858 : /*
859 : * Free a previously created WaitEventSet.
860 : *
861 : * Note: preferably, this shouldn't have to free any resources that could be
862 : * inherited across an exec(). If it did, we'd likely leak those resources in
863 : * many scenarios. For the epoll case, we ensure that by setting EPOLL_CLOEXEC
864 : * when the FD is created. For the Windows case, we assume that the handles
865 : * involved are non-inheritable.
866 : */
867 : void
868 123682 : FreeWaitEventSet(WaitEventSet *set)
869 : {
870 123682 : if (set->owner)
871 : {
872 87396 : ResourceOwnerForgetWaitEventSet(set->owner, set);
873 87396 : set->owner = NULL;
874 : }
875 :
876 : #if defined(WAIT_USE_EPOLL)
877 123682 : close(set->epoll_fd);
878 123682 : ReleaseExternalFD();
879 : #elif defined(WAIT_USE_KQUEUE)
880 : close(set->kqueue_fd);
881 : ReleaseExternalFD();
882 : #elif defined(WAIT_USE_WIN32)
883 : for (WaitEvent *cur_event = set->events;
884 : cur_event < (set->events + set->nevents);
885 : cur_event++)
886 : {
887 : if (cur_event->events & WL_LATCH_SET)
888 : {
889 : /* uses the latch's HANDLE */
890 : }
891 : else if (cur_event->events & WL_POSTMASTER_DEATH)
892 : {
893 : /* uses PostmasterHandle */
894 : }
895 : else
896 : {
897 : /* Clean up the event object we created for the socket */
898 : WSAEventSelect(cur_event->fd, NULL, 0);
899 : WSACloseEvent(set->handles[cur_event->pos + 1]);
900 : }
901 : }
902 : #endif
903 :
904 123682 : pfree(set);
905 123682 : }
906 :
907 : /*
908 : * Free a previously created WaitEventSet in a child process after a fork().
909 : */
910 : void
911 34170 : FreeWaitEventSetAfterFork(WaitEventSet *set)
912 : {
913 : #if defined(WAIT_USE_EPOLL)
914 34170 : close(set->epoll_fd);
915 34170 : ReleaseExternalFD();
916 : #elif defined(WAIT_USE_KQUEUE)
917 : /* kqueues are not normally inherited by child processes */
918 : ReleaseExternalFD();
919 : #endif
920 :
921 34170 : pfree(set);
922 34170 : }
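/*
 * Sketch of the intended division of labor: after fork(), the child
 * inherits the parent's epoll fd (kqueue descriptors are not inherited), so
 * a child that won't use an inherited set calls FreeWaitEventSetAfterFork()
 * to close the duplicated descriptor and keep fd.c's accounting straight,
 * while the parent keeps using, and eventually FreeWaitEventSet()s, its own
 * copy.
 */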
923 :
924 : /* ---
925 : * Add an event to the set. Possible events are:
926 : * - WL_LATCH_SET: Wait for the latch to be set
927 : * - WL_POSTMASTER_DEATH: Wait for postmaster to die
928 : * - WL_SOCKET_READABLE: Wait for socket to become readable,
929 : * can be combined in one event with other WL_SOCKET_* events
930 : * - WL_SOCKET_WRITEABLE: Wait for socket to become writeable,
931 : * can be combined with other WL_SOCKET_* events
932 : * - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
933 : * can be combined with other WL_SOCKET_* events (on non-Windows
934 : * platforms, this is the same as WL_SOCKET_WRITEABLE)
935 : * - WL_SOCKET_ACCEPT: Wait for new connection to a server socket,
936 : * can be combined with other WL_SOCKET_* events (on non-Windows
937 : * platforms, this is the same as WL_SOCKET_READABLE)
938 : * - WL_SOCKET_CLOSED: Wait for socket to be closed by remote peer.
939 : * - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies
940 : *
941 : * Returns the offset in WaitEventSet->events (starting from 0), which can be
942 : * used to modify previously added wait events using ModifyWaitEvent().
943 : *
944 : * In the WL_LATCH_SET case the latch must be owned by the current process,
945 : * i.e. it must be a process-local latch initialized with InitLatch, or a
946 : * shared latch associated with the current process by calling OwnLatch.
947 : *
948 : * In the WL_SOCKET_READABLE/WRITEABLE/CONNECTED/ACCEPT cases, EOF and error
949 : * conditions cause the socket to be reported as readable/writable/connected,
950 : * so that the caller can deal with the condition.
951 : *
952 : * The user_data pointer specified here will be set for the events returned
953 : * by WaitEventSetWait(), making it easy to associate additional data with
954 : * events.
955 : */
956 : int
957 522844 : AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
958 : void *user_data)
959 : {
960 : WaitEvent *event;
961 :
962 : /* not enough space */
963 : Assert(set->nevents < set->nevents_space);
964 :
965 522844 : if (events == WL_EXIT_ON_PM_DEATH)
966 : {
967 159544 : events = WL_POSTMASTER_DEATH;
968 159544 : set->exit_on_postmaster_death = true;
969 : }
970 :
971 522844 : if (latch)
972 : {
973 188108 : if (latch->owner_pid != MyProcPid)
974 0 : elog(ERROR, "cannot wait on a latch owned by another process");
975 188108 : if (set->latch)
976 0 : elog(ERROR, "cannot wait on more than one latch");
977 188108 : if ((events & WL_LATCH_SET) != WL_LATCH_SET)
978 0 : elog(ERROR, "latch events only support being set");
979 : }
980 : else
981 : {
982 334736 : if (events & WL_LATCH_SET)
983 0 : elog(ERROR, "cannot wait on latch without a specified latch");
984 : }
985 :
986 : /* waiting for socket readiness without a socket indicates a bug */
987 522844 : if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK))
988 0 : elog(ERROR, "cannot wait on socket event without a socket");
989 :
990 522844 : event = &set->events[set->nevents];
991 522844 : event->pos = set->nevents++;
992 522844 : event->fd = fd;
993 522844 : event->events = events;
994 522844 : event->user_data = user_data;
995 : #ifdef WIN32
996 : event->reset = false;
997 : #endif
998 :
999 522844 : if (events == WL_LATCH_SET)
1000 : {
1001 188108 : set->latch = latch;
1002 188108 : set->latch_pos = event->pos;
1003 : #if defined(WAIT_USE_SELF_PIPE)
1004 : event->fd = selfpipe_readfd;
1005 : #elif defined(WAIT_USE_SIGNALFD)
1006 188108 : event->fd = signal_fd;
1007 : #else
1008 : event->fd = PGINVALID_SOCKET;
1009 : #ifdef WAIT_USE_EPOLL
1010 : return event->pos;
1011 : #endif
1012 : #endif
1013 : }
1014 334736 : else if (events == WL_POSTMASTER_DEATH)
1015 : {
1016 : #ifndef WIN32
1017 185208 : event->fd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
1018 : #endif
1019 : }
1020 :
1021 : /* perform wait primitive specific initialization, if needed */
1022 : #if defined(WAIT_USE_EPOLL)
1023 522844 : WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
1024 : #elif defined(WAIT_USE_KQUEUE)
1025 : WaitEventAdjustKqueue(set, event, 0);
1026 : #elif defined(WAIT_USE_POLL)
1027 : WaitEventAdjustPoll(set, event);
1028 : #elif defined(WAIT_USE_WIN32)
1029 : WaitEventAdjustWin32(set, event);
1030 : #endif
1031 :
1032 522844 : return event->pos;
1033 : }
1034 :
1035 : /*
1036 : * Change the event mask and, in the WL_LATCH_SET case, the latch associated
1037 : * with the WaitEvent. The latch may be changed to NULL to disable the latch
1038 : * temporarily, and then set back to a latch later.
1039 : *
1040 : * 'pos' is the id returned by AddWaitEventToSet.
1041 : */
1042 : void
1043 1088224 : ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
1044 : {
1045 : WaitEvent *event;
1046 : #if defined(WAIT_USE_KQUEUE)
1047 : int old_events;
1048 : #endif
1049 :
1050 : Assert(pos < set->nevents);
1051 :
1052 1088224 : event = &set->events[pos];
1053 : #if defined(WAIT_USE_KQUEUE)
1054 : old_events = event->events;
1055 : #endif
1056 :
1057 : /*
1058 : * If neither the event mask nor the associated latch changes, return
1059 : * early. That's an important optimization for some sockets, where
1060 : * ModifyWaitEvent is frequently used to switch from waiting for reads to
1061 : * waiting on writes.
1062 : */
1063 1088224 : if (events == event->events &&
1064 1063728 : (!(event->events & WL_LATCH_SET) || set->latch == latch))
1065 1005940 : return;
1066 :
1067 82284 : if (event->events & WL_LATCH_SET &&
1068 57788 : events != event->events)
1069 : {
1070 0 : elog(ERROR, "cannot modify latch event");
1071 : }
1072 :
1073 82284 : if (event->events & WL_POSTMASTER_DEATH)
1074 : {
1075 0 : elog(ERROR, "cannot modify postmaster death event");
1076 : }
1077 :
1078 : /* FIXME: validate event mask */
1079 82284 : event->events = events;
1080 :
1081 82284 : if (events == WL_LATCH_SET)
1082 : {
1083 57788 : if (latch && latch->owner_pid != MyProcPid)
1084 0 : elog(ERROR, "cannot wait on a latch owned by another process");
1085 57788 : set->latch = latch;
1086 :
1087 : /*
1088 : * On Unix, we don't need to modify the kernel object because the
1089 : * underlying pipe (if there is one) is the same for all latches so we
1090 : * can return immediately. On Windows, we need to update our array of
1091 : * handles, but we leave the old one in place and tolerate spurious
1092 : * wakeups if the latch is disabled.
1093 : */
1094 : #if defined(WAIT_USE_WIN32)
1095 : if (!latch)
1096 : return;
1097 : #else
1098 57788 : return;
1099 : #endif
1100 : }
1101 :
1102 : #if defined(WAIT_USE_EPOLL)
1103 24496 : WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
1104 : #elif defined(WAIT_USE_KQUEUE)
1105 : WaitEventAdjustKqueue(set, event, old_events);
1106 : #elif defined(WAIT_USE_POLL)
1107 : WaitEventAdjustPoll(set, event);
1108 : #elif defined(WAIT_USE_WIN32)
1109 : WaitEventAdjustWin32(set, event);
1110 : #endif
1111 : }
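/*
 * The early return above is what makes flipping a socket between read- and
 * write-wait cheap.  A sketch using the position returned earlier by
 * AddWaitEventToSet(); set and sock_pos are assumed to exist already:
 */
static void
flip_socket_wait_sketch(WaitEventSet *set, int sock_pos, bool want_write)
{
    /* A no-op down to the kernel level whenever the mask is unchanged. */
    ModifyWaitEvent(set, sock_pos,
                    want_write ? WL_SOCKET_WRITEABLE : WL_SOCKET_READABLE,
                    NULL);
}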
1112 :
1113 : #if defined(WAIT_USE_EPOLL)
1114 : /*
1115 : * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
1116 : */
1117 : static void
1118 547340 : WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
1119 : {
1120 : struct epoll_event epoll_ev;
1121 : int rc;
1122 :
1123 : /* pointer to our event, returned by epoll_wait */
1124 547340 : epoll_ev.data.ptr = event;
1125 : /* always wait for errors */
1126 547340 : epoll_ev.events = EPOLLERR | EPOLLHUP;
1127 :
1128 : /* prepare the epoll_event entry once */
1129 547340 : if (event->events == WL_LATCH_SET)
1130 : {
1131 : Assert(set->latch != NULL);
1132 188108 : epoll_ev.events |= EPOLLIN;
1133 : }
1134 359232 : else if (event->events == WL_POSTMASTER_DEATH)
1135 : {
1136 185208 : epoll_ev.events |= EPOLLIN;
1137 : }
1138 : else
1139 : {
1140 : Assert(event->fd != PGINVALID_SOCKET);
1141 : Assert(event->events & (WL_SOCKET_READABLE |
1142 : WL_SOCKET_WRITEABLE |
1143 : WL_SOCKET_CLOSED));
1144 :
1145 174024 : if (event->events & WL_SOCKET_READABLE)
1146 145458 : epoll_ev.events |= EPOLLIN;
1147 174024 : if (event->events & WL_SOCKET_WRITEABLE)
1148 29704 : epoll_ev.events |= EPOLLOUT;
1149 174024 : if (event->events & WL_SOCKET_CLOSED)
1150 0 : epoll_ev.events |= EPOLLRDHUP;
1151 : }
1152 :
1153 : /*
1154 : * Even though unused, we also pass epoll_ev as the data argument if
1155 : * EPOLL_CTL_DEL is passed as action. There used to be an epoll bug
1156 : * requiring that, and actually it makes the code simpler...
1157 : */
1158 547340 : rc = epoll_ctl(set->epoll_fd, action, event->fd, &epoll_ev);
1159 :
1160 547340 : if (rc < 0)
1161 0 : ereport(ERROR,
1162 : (errcode_for_socket_access(),
1163 : errmsg("%s() failed: %m",
1164 : "epoll_ctl")));
1165 547340 : }
1166 : #endif
1167 :
1168 : #if defined(WAIT_USE_POLL)
1169 : static void
1170 : WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
1171 : {
1172 : struct pollfd *pollfd = &set->pollfds[event->pos];
1173 :
1174 : pollfd->revents = 0;
1175 : pollfd->fd = event->fd;
1176 :
1177 : /* prepare pollfd entry once */
1178 : if (event->events == WL_LATCH_SET)
1179 : {
1180 : Assert(set->latch != NULL);
1181 : pollfd->events = POLLIN;
1182 : }
1183 : else if (event->events == WL_POSTMASTER_DEATH)
1184 : {
1185 : pollfd->events = POLLIN;
1186 : }
1187 : else
1188 : {
1189 : Assert(event->events & (WL_SOCKET_READABLE |
1190 : WL_SOCKET_WRITEABLE |
1191 : WL_SOCKET_CLOSED));
1192 : pollfd->events = 0;
1193 : if (event->events & WL_SOCKET_READABLE)
1194 : pollfd->events |= POLLIN;
1195 : if (event->events & WL_SOCKET_WRITEABLE)
1196 : pollfd->events |= POLLOUT;
1197 : #ifdef POLLRDHUP
1198 : if (event->events & WL_SOCKET_CLOSED)
1199 : pollfd->events |= POLLRDHUP;
1200 : #endif
1201 : }
1202 :
1203 : Assert(event->fd != PGINVALID_SOCKET);
1204 : }
1205 : #endif
1206 :
1207 : #if defined(WAIT_USE_KQUEUE)
1208 :
1209 : /*
1210 : * On most BSD family systems, the udata member of struct kevent is of type
1211 : * void *, so we could directly convert to/from WaitEvent *. Unfortunately,
1212 : * NetBSD has it as intptr_t, so here we wallpaper over that difference with
1213 : * an lvalue cast.
1214 : */
1215 : #define AccessWaitEvent(k_ev) (*((WaitEvent **)(&(k_ev)->udata)))
1216 :
1217 : static inline void
1218 : WaitEventAdjustKqueueAdd(struct kevent *k_ev, int filter, int action,
1219 : WaitEvent *event)
1220 : {
1221 : k_ev->ident = event->fd;
1222 : k_ev->filter = filter;
1223 : k_ev->flags = action;
1224 : k_ev->fflags = 0;
1225 : k_ev->data = 0;
1226 : AccessWaitEvent(k_ev) = event;
1227 : }
1228 :
1229 : static inline void
1230 : WaitEventAdjustKqueueAddPostmaster(struct kevent *k_ev, WaitEvent *event)
1231 : {
1232 : /* For now postmaster death can only be added, not removed. */
1233 : k_ev->ident = PostmasterPid;
1234 : k_ev->filter = EVFILT_PROC;
1235 : k_ev->flags = EV_ADD;
1236 : k_ev->fflags = NOTE_EXIT;
1237 : k_ev->data = 0;
1238 : AccessWaitEvent(k_ev) = event;
1239 : }
1240 :
1241 : static inline void
1242 : WaitEventAdjustKqueueAddLatch(struct kevent *k_ev, WaitEvent *event)
1243 : {
1244 : /* For now latch can only be added, not removed. */
1245 : k_ev->ident = SIGURG;
1246 : k_ev->filter = EVFILT_SIGNAL;
1247 : k_ev->flags = EV_ADD;
1248 : k_ev->fflags = 0;
1249 : k_ev->data = 0;
1250 : AccessWaitEvent(k_ev) = event;
1251 : }
1252 :
1253 : /*
1254 : * old_events is the previous event mask, used to compute what has changed.
1255 : */
1256 : static void
1257 : WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
1258 : {
1259 : int rc;
1260 : struct kevent k_ev[2];
1261 : int count = 0;
1262 : bool new_filt_read = false;
1263 : bool old_filt_read = false;
1264 : bool new_filt_write = false;
1265 : bool old_filt_write = false;
1266 :
1267 : if (old_events == event->events)
1268 : return;
1269 :
1270 : Assert(event->events != WL_LATCH_SET || set->latch != NULL);
1271 : Assert(event->events == WL_LATCH_SET ||
1272 : event->events == WL_POSTMASTER_DEATH ||
1273 : (event->events & (WL_SOCKET_READABLE |
1274 : WL_SOCKET_WRITEABLE |
1275 : WL_SOCKET_CLOSED)));
1276 :
1277 : if (event->events == WL_POSTMASTER_DEATH)
1278 : {
1279 : /*
1280 : * Unlike all the other implementations, we detect postmaster death
1281 : * using process notification instead of waiting on the postmaster
1282 : * alive pipe.
1283 : */
1284 : WaitEventAdjustKqueueAddPostmaster(&k_ev[count++], event);
1285 : }
1286 : else if (event->events == WL_LATCH_SET)
1287 : {
1288 : /* We detect latch wakeup using a signal event. */
1289 : WaitEventAdjustKqueueAddLatch(&k_ev[count++], event);
1290 : }
1291 : else
1292 : {
1293 : /*
1294 : * We need to compute the adds and deletes required to get from the
1295 : * old event mask to the new event mask, since kevent treats readable
1296 : * and writable as separate events.
1297 : */
1298 : if (old_events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
1299 : old_filt_read = true;
1300 : if (event->events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
1301 : new_filt_read = true;
1302 : if (old_events & WL_SOCKET_WRITEABLE)
1303 : old_filt_write = true;
1304 : if (event->events & WL_SOCKET_WRITEABLE)
1305 : new_filt_write = true;
1306 : if (old_filt_read && !new_filt_read)
1307 : WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_DELETE,
1308 : event);
1309 : else if (!old_filt_read && new_filt_read)
1310 : WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_ADD,
1311 : event);
1312 : if (old_filt_write && !new_filt_write)
1313 : WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_DELETE,
1314 : event);
1315 : else if (!old_filt_write && new_filt_write)
1316 : WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_ADD,
1317 : event);
1318 : }
1319 :
1320 : /* For WL_SOCKET_READABLE -> WL_SOCKET_CLOSED, no change needed. */
1321 : if (count == 0)
1322 : return;
1323 :
1324 : Assert(count <= 2);
1325 :
1326 : rc = kevent(set->kqueue_fd, &k_ev[0], count, NULL, 0, NULL);
1327 :
1328 : /*
1329 : * When adding the postmaster's pid, we have to consider that it might
1330 : * already have exited and perhaps even been replaced by another process
1331 : * with the same pid. If so, we have to defer reporting this as an event
1332 : * until the next call to WaitEventSetWaitBlock().
1333 : */
1334 :
1335 : if (rc < 0)
1336 : {
1337 : if (event->events == WL_POSTMASTER_DEATH &&
1338 : (errno == ESRCH || errno == EACCES))
1339 : set->report_postmaster_not_running = true;
1340 : else
1341 : ereport(ERROR,
1342 : (errcode_for_socket_access(),
1343 : errmsg("%s() failed: %m",
1344 : "kevent")));
1345 : }
1346 : else if (event->events == WL_POSTMASTER_DEATH &&
1347 : PostmasterPid != getppid() &&
1348 : !PostmasterIsAlive())
1349 : {
1350 : /*
1351 : * The extra PostmasterIsAlive() check prevents false alarms
1352 : * on systems that give a different value for getppid() while being
1353 : * traced by a debugger.
1354 : */
1355 : set->report_postmaster_not_running = true;
1356 : }
1357 : }
1358 :
1359 : #endif
1360 :
1361 : #if defined(WAIT_USE_WIN32)
1362 : static void
1363 : WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
1364 : {
1365 : HANDLE *handle = &set->handles[event->pos + 1];
1366 :
1367 : if (event->events == WL_LATCH_SET)
1368 : {
1369 : Assert(set->latch != NULL);
1370 : *handle = set->latch->event;
1371 : }
1372 : else if (event->events == WL_POSTMASTER_DEATH)
1373 : {
1374 : *handle = PostmasterHandle;
1375 : }
1376 : else
1377 : {
1378 : int flags = FD_CLOSE; /* always check for errors/EOF */
1379 :
1380 : if (event->events & WL_SOCKET_READABLE)
1381 : flags |= FD_READ;
1382 : if (event->events & WL_SOCKET_WRITEABLE)
1383 : flags |= FD_WRITE;
1384 : if (event->events & WL_SOCKET_CONNECTED)
1385 : flags |= FD_CONNECT;
1386 : if (event->events & WL_SOCKET_ACCEPT)
1387 : flags |= FD_ACCEPT;
1388 :
1389 : if (*handle == WSA_INVALID_EVENT)
1390 : {
1391 : *handle = WSACreateEvent();
1392 : if (*handle == WSA_INVALID_EVENT)
1393 : elog(ERROR, "failed to create event for socket: error code %d",
1394 : WSAGetLastError());
1395 : }
1396 : if (WSAEventSelect(event->fd, *handle, flags) != 0)
1397 : elog(ERROR, "failed to set up event for socket: error code %d",
1398 : WSAGetLastError());
1399 :
1400 : Assert(event->fd != PGINVALID_SOCKET);
1401 : }
1402 : }
1403 : #endif
1404 :
1405 : /*
1406 : * Wait for events added to the set to happen, or until the timeout is
1407 : * reached. At most nevents events are returned.
1408 : *
1409 : * If timeout = -1, block until an event occurs; if 0, check sockets for
1410 : * readiness, but don't block; if > 0, block for at most timeout milliseconds.
1411 : *
1412 : * Returns the number of events that occurred, or 0 if the timeout was reached.
1413 : *
1414 : * Returned events will have the fd, pos, user_data fields set to the
1415 : * values associated with the registered event.
1416 : */
1417 : int
1418 1406794 : WaitEventSetWait(WaitEventSet *set, long timeout,
1419 : WaitEvent *occurred_events, int nevents,
1420 : uint32 wait_event_info)
1421 : {
1422 1406794 : int returned_events = 0;
1423 : instr_time start_time;
1424 : instr_time cur_time;
1425 1406794 : long cur_timeout = -1;
1426 :
1427 : Assert(nevents > 0);
1428 :
1429 : /*
1430 : * Initialize timeout if requested. We must record the current time so
1431 : * that we can determine the remaining timeout if interrupted.
1432 : */
1433 1406794 : if (timeout >= 0)
1434 : {
1435 518278 : INSTR_TIME_SET_CURRENT(start_time);
1436 : Assert(timeout >= 0 && timeout <= INT_MAX);
1437 518278 : cur_timeout = timeout;
1438 : }
1439 : else
1440 888516 : INSTR_TIME_SET_ZERO(start_time);
1441 :
1442 1406794 : pgstat_report_wait_start(wait_event_info);
1443 :
1444 : #ifndef WIN32
1445 1406794 : waiting = true;
1446 : #else
1447 : /* Ensure that signals are serviced even if latch is already set */
1448 : pgwin32_dispatch_queued_signals();
1449 : #endif
1450 2849678 : while (returned_events == 0)
1451 : {
1452 : int rc;
1453 :
1454 : /*
1455 : * Check if the latch is set already. If so, leave the loop
1456 : * immediately without blocking again. We don't attempt to report any
1457 : * other events that might also be satisfied.
1458 : *
1459 : * If someone sets the latch between this and the
1460 : * WaitEventSetWaitBlock() below, the setter will write a byte to the
1461 : * pipe (or signal us and the signal handler will do that), and the
1462 : * readiness routine will return immediately.
1463 : *
1464 : * On Unix, if there's a pending byte in the self-pipe, we'll notice it
1465 : * whenever we block. Clearing the pipe only in that case avoids
1466 : * having to drain it every time WaitLatchOrSocket() is used. Should
1467 : * the pipe buffer fill up, we're still OK, because the pipe is in
1468 : * nonblocking mode. It's unlikely for that to happen, because the
1469 : * self-pipe isn't filled unless we're blocking (waiting = true), or
1470 : * from inside a signal handler in latch_sigurg_handler().
1471 : *
1472 : * On Windows, we'll also notice if there's a pending event for the
1473 : * latch when blocking, but there's no danger of anything filling up,
1474 : * since "Setting an event that is already set has no effect".
1475 : *
1476 : * Note: we assume that the kernel calls involved in latch management
1477 : * will provide adequate synchronization on machines with weak memory
1478 : * ordering, so that we cannot miss seeing is_set if a notification
1479 : * has already been queued.
1480 : */
1481 1865912 : if (set->latch && !set->latch->is_set)
1482 : {
1483 : /* about to sleep on a latch */
1484 1485520 : set->latch->maybe_sleeping = true;
1485 1485520 : pg_memory_barrier();
1486 : /* and recheck */
1487 : }
1488 :
1489 1865912 : if (set->latch && set->latch->is_set)
1490 : {
1491 379598 : occurred_events->fd = PGINVALID_SOCKET;
1492 379598 : occurred_events->pos = set->latch_pos;
1493 379598 : occurred_events->user_data =
1494 379598 : set->events[set->latch_pos].user_data;
1495 379598 : occurred_events->events = WL_LATCH_SET;
1496 379598 : occurred_events++;
1497 379598 : returned_events++;
1498 :
1499 : /* could have been set above */
1500 379598 : set->latch->maybe_sleeping = false;
1501 :
1502 379598 : break;
1503 : }
1504 :
1505 : /*
1506 : * Wait for events using the readiness primitive chosen at the top of
1507 : * this file. If -1 is returned, a timeout has occurred; if 0, we have
1508 : * to retry; anything >= 1 is the number of returned events.
1509 : */
1510 1486314 : rc = WaitEventSetWaitBlock(set, cur_timeout,
1511 : occurred_events, nevents);
1512 :
1513 1486284 : if (set->latch)
1514 : {
1515 : Assert(set->latch->maybe_sleeping);
1516 1485406 : set->latch->maybe_sleeping = false;
1517 : }
1518 :
1519 1486284 : if (rc == -1)
1520 43396 : break; /* timeout occurred */
1521 : else
1522 1442888 : returned_events = rc;
1523 :
1524 : /* If we're not done, update cur_timeout for next iteration */
1525 1442888 : if (returned_events == 0 && timeout >= 0)
1526 : {
1527 444284 : INSTR_TIME_SET_CURRENT(cur_time);
1528 444284 : INSTR_TIME_SUBTRACT(cur_time, start_time);
1529 444284 : cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
1530 444284 : if (cur_timeout <= 0)
1531 4 : break;
1532 : }
1533 : }
1534 : #ifndef WIN32
1535 1406764 : waiting = false;
1536 : #endif
1537 :
1538 1406764 : pgstat_report_wait_end();
1539 :
1540 1406764 : return returned_events;
1541 : }
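/*
 * A sketch of harvesting multiple events per call, which WaitEventSetWait()
 * supports when nevents > 1.  Passing timeout = 0 polls without blocking;
 * WAIT_EVENT_EXAMPLE is a hypothetical wait_event_info.
 */
static void
multi_event_sketch(WaitEventSet *set)
{
    WaitEvent   events[8];
    int         n;

    n = WaitEventSetWait(set, 0, events, lengthof(events),
                         WAIT_EVENT_EXAMPLE);

    for (int i = 0; i < n; i++)
    {
        if (events[i].events & WL_LATCH_SET)
            ResetLatch(MyLatch);
        else if (events[i].events & WL_SOCKET_READABLE)
        {
            /* events[i].fd and events[i].user_data identify the socket */
        }
    }
}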
1542 :
1543 :
1544 : #if defined(WAIT_USE_EPOLL)
1545 :
1546 : /*
1547 : * Wait using Linux's epoll_wait(2).
1548 : *
1549 : * This is the preferable wait method, as several readiness notifications are
1550 : * delivered, without having to iterate through all of set->events. The
1551 : * returned epoll_event structs contain a pointer to our events, making
1552 : * association easy.
1553 : */
1554 : static inline int
1555 1486314 : WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1556 : WaitEvent *occurred_events, int nevents)
1557 : {
1558 1486314 : int returned_events = 0;
1559 : int rc;
1560 : WaitEvent *cur_event;
1561 : struct epoll_event *cur_epoll_event;
1562 :
1563 : /* Sleep */
1564 1486314 : rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,
1565 1486314 : Min(nevents, set->nevents_space), cur_timeout);
1566 :
1567 : /* Check return code */
1568 1486314 : if (rc < 0)
1569 : {
1570 : /* EINTR is okay, otherwise complain */
1571 232890 : if (errno != EINTR)
1572 : {
1573 0 : waiting = false;
1574 0 : ereport(ERROR,
1575 : (errcode_for_socket_access(),
1576 : errmsg("%s() failed: %m",
1577 : "epoll_wait")));
1578 : }
1579 232890 : return 0;
1580 : }
1581 1253424 : else if (rc == 0)
1582 : {
1583 : /* timeout exceeded */
1584 43396 : return -1;
1585 : }
1586 :
1587 : /*
1588 : * At least one event occurred, iterate over the returned epoll events
1589 : * until they're either all processed, or we've returned all the events
1590 : * the caller desired.
1591 : */
1592 1210028 : for (cur_epoll_event = set->epoll_ret_events;
1593 2421280 : cur_epoll_event < (set->epoll_ret_events + rc) &&
1594 : returned_events < nevents;
1595 1211252 : cur_epoll_event++)
1596 : {
1597 : /* epoll's data pointer is set to the associated WaitEvent */
1598 1211282 : cur_event = (WaitEvent *) cur_epoll_event->data.ptr;
1599 :
1600 1211282 : occurred_events->pos = cur_event->pos;
1601 1211282 : occurred_events->user_data = cur_event->user_data;
1602 1211282 : occurred_events->events = 0;
1603 :
1604 1211282 : if (cur_event->events == WL_LATCH_SET &&
1605 712060 : cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
1606 : {
1607 : /* Drain the signalfd. */
1608 712060 : drain();
1609 :
1610 712060 : if (set->latch && set->latch->is_set)
1611 : {
1612 484668 : occurred_events->fd = PGINVALID_SOCKET;
1613 484668 : occurred_events->events = WL_LATCH_SET;
1614 484668 : occurred_events++;
1615 484668 : returned_events++;
1616 : }
1617 : }
1618 499222 : else if (cur_event->events == WL_POSTMASTER_DEATH &&
1619 30 : cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
1620 : {
1621 : /*
1622 : * We expect an EPOLLHUP when the remote end is closed, but
1623 : * because we don't expect the pipe to become readable or to have
1624 : * any errors either, treat those cases as postmaster death, too.
1625 : *
1626 : * Be paranoid about a spurious event signaling the postmaster as
1627 : * being dead. There have been reports about that happening with
1628 : * older primitives (select(2) to be specific), and a spurious
1629 : * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
1630 : * cost much.
1631 : */
1632 30 : if (!PostmasterIsAliveInternal())
1633 : {
1634 30 : if (set->exit_on_postmaster_death)
1635 30 : proc_exit(1);
1636 0 : occurred_events->fd = PGINVALID_SOCKET;
1637 0 : occurred_events->events = WL_POSTMASTER_DEATH;
1638 0 : occurred_events++;
1639 0 : returned_events++;
1640 : }
1641 : }
1642 499192 : else if (cur_event->events & (WL_SOCKET_READABLE |
1643 : WL_SOCKET_WRITEABLE |
1644 : WL_SOCKET_CLOSED))
1645 : {
1646 : Assert(cur_event->fd != PGINVALID_SOCKET);
1647 :
1648 499192 : if ((cur_event->events & WL_SOCKET_READABLE) &&
1649 479564 : (cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)))
1650 : {
1651 : /* data available in socket, or EOF */
1652 448682 : occurred_events->events |= WL_SOCKET_READABLE;
1653 : }
1654 :
1655 499192 : if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1656 55294 : (cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)))
1657 : {
1658 : /* writable, or EOF */
1659 54276 : occurred_events->events |= WL_SOCKET_WRITEABLE;
1660 : }
1661 :
1662 499192 : if ((cur_event->events & WL_SOCKET_CLOSED) &&
1663 0 : (cur_epoll_event->events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)))
1664 : {
1665 : /* remote peer shut down, or error */
1666 0 : occurred_events->events |= WL_SOCKET_CLOSED;
1667 : }
1668 :
1669 499192 : if (occurred_events->events != 0)
1670 : {
1671 499192 : occurred_events->fd = cur_event->fd;
1672 499192 : occurred_events++;
1673 499192 : returned_events++;
1674 : }
1675 : }
1676 : }
1677 :
1678 1209998 : return returned_events;
1679 : }
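/*
 * A minimal standalone sketch (not part of latch.c) of how the data.ptr
 * association used above is established on the registration side.  The
 * names register_fd() and my_event are hypothetical; latch.c's real
 * counterpart is WaitEventAdjustEpoll().
 */
#include <sys/epoll.h>

struct my_event
{
	int			pos;
	void	   *user_data;
};

static int
register_fd(int epfd, int fd, struct my_event *ev_data)
{
	struct epoll_event ev = {0};

	ev.events = EPOLLIN;		/* level-triggered readability */
	ev.data.ptr = ev_data;		/* handed back verbatim by epoll_wait() */
	return epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
}

/*
 * Every epoll_event later returned by epoll_wait() carries that pointer in
 * data.ptr, so it can be cast straight back to the registered struct, just
 * as the loop above casts to WaitEvent.
 */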
1680 :
1681 : #elif defined(WAIT_USE_KQUEUE)
1682 :
1683 : /*
1684 : * Wait using kevent(2) on BSD-family systems and macOS.
1685 : *
1686 : * For now this mirrors the epoll code, but in the future it could modify the
1687 : * fd set in the same kevent() call it uses for waiting, instead of doing that
1688 : * with separate system calls.
1689 : */
1690 : static int
1691 : WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1692 : WaitEvent *occurred_events, int nevents)
1693 : {
1694 : int returned_events = 0;
1695 : int rc;
1696 : WaitEvent *cur_event;
1697 : struct kevent *cur_kqueue_event;
1698 : struct timespec timeout;
1699 : struct timespec *timeout_p;
1700 :
1701 : if (cur_timeout < 0)
1702 : timeout_p = NULL;
1703 : else
1704 : {
1705 : timeout.tv_sec = cur_timeout / 1000;
1706 : timeout.tv_nsec = (cur_timeout % 1000) * 1000000;
1707 : timeout_p = &timeout;
1708 : }
1709 :
1710 : /*
1711 : * Report postmaster events discovered by WaitEventAdjustKqueue() or an
1712 : * earlier call to WaitEventSetWait().
1713 : */
1714 : if (unlikely(set->report_postmaster_not_running))
1715 : {
1716 : if (set->exit_on_postmaster_death)
1717 : proc_exit(1);
1718 : occurred_events->fd = PGINVALID_SOCKET;
1719 : occurred_events->events = WL_POSTMASTER_DEATH;
1720 : return 1;
1721 : }
1722 :
1723 : /* Sleep */
1724 : rc = kevent(set->kqueue_fd, NULL, 0,
1725 : set->kqueue_ret_events,
1726 : Min(nevents, set->nevents_space),
1727 : timeout_p);
1728 :
1729 : /* Check return code */
1730 : if (rc < 0)
1731 : {
1732 : /* EINTR is okay, otherwise complain */
1733 : if (errno != EINTR)
1734 : {
1735 : waiting = false;
1736 : ereport(ERROR,
1737 : (errcode_for_socket_access(),
1738 : errmsg("%s() failed: %m",
1739 : "kevent")));
1740 : }
1741 : return 0;
1742 : }
1743 : else if (rc == 0)
1744 : {
1745 : /* timeout exceeded */
1746 : return -1;
1747 : }
1748 :
1749 : /*
1750 : * At least one event occurred, iterate over the returned kqueue events
1751 : * until they're either all processed, or we've returned all the events
1752 : * the caller desired.
1753 : */
1754 : for (cur_kqueue_event = set->kqueue_ret_events;
1755 : cur_kqueue_event < (set->kqueue_ret_events + rc) &&
1756 : returned_events < nevents;
1757 : cur_kqueue_event++)
1758 : {
1759 : /* kevent's udata points to the associated WaitEvent */
1760 : cur_event = AccessWaitEvent(cur_kqueue_event);
1761 :
1762 : occurred_events->pos = cur_event->pos;
1763 : occurred_events->user_data = cur_event->user_data;
1764 : occurred_events->events = 0;
1765 :
1766 : if (cur_event->events == WL_LATCH_SET &&
1767 : cur_kqueue_event->filter == EVFILT_SIGNAL)
1768 : {
1769 : if (set->latch && set->latch->is_set)
1770 : {
1771 : occurred_events->fd = PGINVALID_SOCKET;
1772 : occurred_events->events = WL_LATCH_SET;
1773 : occurred_events++;
1774 : returned_events++;
1775 : }
1776 : }
1777 : else if (cur_event->events == WL_POSTMASTER_DEATH &&
1778 : cur_kqueue_event->filter == EVFILT_PROC &&
1779 : (cur_kqueue_event->fflags & NOTE_EXIT) != 0)
1780 : {
1781 : /*
1782 : * The kernel will tell this kqueue object only once about the
1783 : * exit of the postmaster, so remember that for future calls to
1784 : * preserve level-triggered semantics.
1785 : */
1786 : set->report_postmaster_not_running = true;
1787 :
1788 : if (set->exit_on_postmaster_death)
1789 : proc_exit(1);
1790 : occurred_events->fd = PGINVALID_SOCKET;
1791 : occurred_events->events = WL_POSTMASTER_DEATH;
1792 : occurred_events++;
1793 : returned_events++;
1794 : }
1795 : else if (cur_event->events & (WL_SOCKET_READABLE |
1796 : WL_SOCKET_WRITEABLE |
1797 : WL_SOCKET_CLOSED))
1798 : {
1799 : Assert(cur_event->fd >= 0);
1800 :
1801 : if ((cur_event->events & WL_SOCKET_READABLE) &&
1802 : (cur_kqueue_event->filter == EVFILT_READ))
1803 : {
1804 : /* readable, or EOF */
1805 : occurred_events->events |= WL_SOCKET_READABLE;
1806 : }
1807 :
1808 : if ((cur_event->events & WL_SOCKET_CLOSED) &&
1809 : (cur_kqueue_event->filter == EVFILT_READ) &&
1810 : (cur_kqueue_event->flags & EV_EOF))
1811 : {
1812 : /* the remote peer has shut down */
1813 : occurred_events->events |= WL_SOCKET_CLOSED;
1814 : }
1815 :
1816 : if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1817 : (cur_kqueue_event->filter == EVFILT_WRITE))
1818 : {
1819 : /* writable, or EOF */
1820 : occurred_events->events |= WL_SOCKET_WRITEABLE;
1821 : }
1822 :
1823 : if (occurred_events->events != 0)
1824 : {
1825 : occurred_events->fd = cur_event->fd;
1826 : occurred_events++;
1827 : returned_events++;
1828 : }
1829 : }
1830 : }
1831 :
1832 : return returned_events;
1833 : }
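/*
 * A minimal standalone sketch (not part of latch.c) of the mechanism behind
 * the postmaster-death branch above: a kqueue can watch for another
 * process's exit with EVFILT_PROC/NOTE_EXIT.  watch_pid() is a hypothetical
 * name; latch.c's real registration lives in WaitEventAdjustKqueue().
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static int
watch_pid(int kq, pid_t pid, void *udata)
{
	struct kevent kev;

	/* One-shot by nature: the kernel reports NOTE_EXIT only once. */
	EV_SET(&kev, pid, EVFILT_PROC, EV_ADD, NOTE_EXIT, 0, udata);
	return kevent(kq, &kev, 1, NULL, 0, NULL);
}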
1834 :
1835 : #elif defined(WAIT_USE_POLL)
1836 :
1837 : /*
1838 : * Wait using poll(2).
1839 : *
1840 : * This allows receiving readiness notifications for several events at once,
1841 : * but requires iterating through all of set->pollfds.
1842 : */
1843 : static inline int
1844 : WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1845 : WaitEvent *occurred_events, int nevents)
1846 : {
1847 : int returned_events = 0;
1848 : int rc;
1849 : WaitEvent *cur_event;
1850 : struct pollfd *cur_pollfd;
1851 :
1852 : /* Sleep */
1853 : rc = poll(set->pollfds, set->nevents, (int) cur_timeout);
1854 :
1855 : /* Check return code */
1856 : if (rc < 0)
1857 : {
1858 : /* EINTR is okay, otherwise complain */
1859 : if (errno != EINTR)
1860 : {
1861 : waiting = false;
1862 : ereport(ERROR,
1863 : (errcode_for_socket_access(),
1864 : errmsg("%s() failed: %m",
1865 : "poll")));
1866 : }
1867 : return 0;
1868 : }
1869 : else if (rc == 0)
1870 : {
1871 : /* timeout exceeded */
1872 : return -1;
1873 : }
1874 :
1875 : for (cur_event = set->events, cur_pollfd = set->pollfds;
1876 : cur_event < (set->events + set->nevents) &&
1877 : returned_events < nevents;
1878 : cur_event++, cur_pollfd++)
1879 : {
1880 : /* no activity on this FD, skip */
1881 : if (cur_pollfd->revents == 0)
1882 : continue;
1883 :
1884 : occurred_events->pos = cur_event->pos;
1885 : occurred_events->user_data = cur_event->user_data;
1886 : occurred_events->events = 0;
1887 :
1888 : if (cur_event->events == WL_LATCH_SET &&
1889 : (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
1890 : {
1891 : /* There's data in the self-pipe, clear it. */
1892 : drain();
1893 :
1894 : if (set->latch && set->latch->is_set)
1895 : {
1896 : occurred_events->fd = PGINVALID_SOCKET;
1897 : occurred_events->events = WL_LATCH_SET;
1898 : occurred_events++;
1899 : returned_events++;
1900 : }
1901 : }
1902 : else if (cur_event->events == WL_POSTMASTER_DEATH &&
1903 : (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
1904 : {
1905 : /*
1906 : * We expect a POLLHUP when the remote end is closed, but because
1907 : * we don't expect the pipe to become readable or to have any
1908 : * errors either, treat those cases as postmaster death, too.
1909 : *
1910 : * Be paranoid about a spurious event signaling the postmaster as
1911 : * being dead. There have been reports about that happening with
1912 : * older primitives (select(2) to be specific), and a spurious
1913 : * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
1914 : * cost much.
1915 : */
1916 : if (!PostmasterIsAliveInternal())
1917 : {
1918 : if (set->exit_on_postmaster_death)
1919 : proc_exit(1);
1920 : occurred_events->fd = PGINVALID_SOCKET;
1921 : occurred_events->events = WL_POSTMASTER_DEATH;
1922 : occurred_events++;
1923 : returned_events++;
1924 : }
1925 : }
1926 : else if (cur_event->events & (WL_SOCKET_READABLE |
1927 : WL_SOCKET_WRITEABLE |
1928 : WL_SOCKET_CLOSED))
1929 : {
1930 : int errflags = POLLHUP | POLLERR | POLLNVAL;
1931 :
1932 : Assert(cur_event->fd >= PGINVALID_SOCKET);
1933 :
1934 : if ((cur_event->events & WL_SOCKET_READABLE) &&
1935 : (cur_pollfd->revents & (POLLIN | errflags)))
1936 : {
1937 : /* data available in socket, or EOF */
1938 : occurred_events->events |= WL_SOCKET_READABLE;
1939 : }
1940 :
1941 : if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1942 : (cur_pollfd->revents & (POLLOUT | errflags)))
1943 : {
1944 : /* writeable, or EOF */
1945 : occurred_events->events |= WL_SOCKET_WRITEABLE;
1946 : }
1947 :
1948 : #ifdef POLLRDHUP
1949 : if ((cur_event->events & WL_SOCKET_CLOSED) &&
1950 : (cur_pollfd->revents & (POLLRDHUP | errflags)))
1951 : {
1952 : /* remote peer closed, or error */
1953 : occurred_events->events |= WL_SOCKET_CLOSED;
1954 : }
1955 : #endif
1956 :
1957 : if (occurred_events->events != 0)
1958 : {
1959 : occurred_events->fd = cur_event->fd;
1960 : occurred_events++;
1961 : returned_events++;
1962 : }
1963 : }
1964 : }
1965 : return returned_events;
1966 : }
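/*
 * A minimal standalone sketch (not part of latch.c) of the poll(2) calling
 * convention relied on above: readiness is reported in each entry's
 * revents, so every array slot must be scanned.  wait_two_fds() and its
 * parameters are hypothetical.
 */
#include <poll.h>

static int
wait_two_fds(int fd_a, int fd_b, int timeout_ms)
{
	struct pollfd pfds[2];
	int			rc;

	pfds[0].fd = fd_a;
	pfds[0].events = POLLIN;
	pfds[1].fd = fd_b;
	pfds[1].events = POLLOUT;

	rc = poll(pfds, 2, timeout_ms);
	if (rc <= 0)
		return rc;				/* error or timeout */

	/* Unlike epoll, we must walk the whole array to find ready entries. */
	if (pfds[0].revents & (POLLIN | POLLHUP | POLLERR))
		return 1;
	if (pfds[1].revents & (POLLOUT | POLLHUP | POLLERR))
		return 2;
	return 0;
}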
1967 :
1968 : #elif defined(WAIT_USE_WIN32)
1969 :
1970 : /*
1971 : * Wait using Windows' WaitForMultipleObjects(). Each call only "consumes" one
1972 : * event, so we keep calling until we've filled up our output buffer to match
1973 : * the behavior of the other implementations.
1974 : *
1975 : * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273
1976 : */
1977 : static inline int
1978 : WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1979 : WaitEvent *occurred_events, int nevents)
1980 : {
1981 : int returned_events = 0;
1982 : DWORD rc;
1983 : WaitEvent *cur_event;
1984 :
1985 : /* Reset any wait events that need it */
1986 : for (cur_event = set->events;
1987 : cur_event < (set->events + set->nevents);
1988 : cur_event++)
1989 : {
1990 : if (cur_event->reset)
1991 : {
1992 : WaitEventAdjustWin32(set, cur_event);
1993 : cur_event->reset = false;
1994 : }
1995 :
1996 : /*
1997 : * We associate the socket with a new event handle for each
1998 : * WaitEventSet. FD_CLOSE is only generated once if the other end
1999 : * closes gracefully. Therefore we might miss the FD_CLOSE
2000 : * notification if it was delivered to another event after we stopped
2001 : * waiting for it. Close that race by peeking for EOF after setting
2002 : * up this handle to receive notifications, and before entering the
2003 : * sleep.
2004 : *
2005 : * XXX If we had one event handle for the lifetime of a socket, we
2006 : * wouldn't need this.
2007 : */
2008 : if (cur_event->events & WL_SOCKET_READABLE)
2009 : {
2010 : char c;
2011 : WSABUF buf;
2012 : DWORD received;
2013 : DWORD flags;
2014 :
2015 : buf.buf = &c;
2016 : buf.len = 1;
2017 : flags = MSG_PEEK;
2018 : if (WSARecv(cur_event->fd, &buf, 1, &received, &flags, NULL, NULL) == 0)
2019 : {
2020 : occurred_events->pos = cur_event->pos;
2021 : occurred_events->user_data = cur_event->user_data;
2022 : occurred_events->events = WL_SOCKET_READABLE;
2023 : occurred_events->fd = cur_event->fd;
2024 : return 1;
2025 : }
2026 : }
2027 :
2028 : /*
2029 : * Windows does not guarantee to log an FD_WRITE network event
2030 : * indicating that more data can be sent unless the previous send()
2031 : * failed with WSAEWOULDBLOCK. While our caller might well have made
2032 : * such a call, we cannot assume that here. Therefore, if waiting for
2033 : * write-ready, force the issue by doing a dummy send(). If the dummy
2034 : * send() succeeds, assume that the socket is in fact write-ready, and
2035 : * return immediately. Also, if it fails with something other than
2036 : * WSAEWOULDBLOCK, return a write-ready indication to let our caller
2037 : * deal with the error condition.
2038 : */
2039 : if (cur_event->events & WL_SOCKET_WRITEABLE)
2040 : {
2041 : char c;
2042 : WSABUF buf;
2043 : DWORD sent;
2044 : int r;
2045 :
2046 : buf.buf = &c;
2047 : buf.len = 0;
2048 :
2049 : r = WSASend(cur_event->fd, &buf, 1, &sent, 0, NULL, NULL);
2050 : if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
2051 : {
2052 : occurred_events->pos = cur_event->pos;
2053 : occurred_events->user_data = cur_event->user_data;
2054 : occurred_events->events = WL_SOCKET_WRITEABLE;
2055 : occurred_events->fd = cur_event->fd;
2056 : return 1;
2057 : }
2058 : }
2059 : }
2060 :
2061 : /*
2062 : * Sleep.
2063 : *
2064 : * Need to wait on ->nevents + 1 handles, because the signal handle is in [0].
2065 : */
2066 : rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
2067 : cur_timeout);
2068 :
2069 : /* Check return code */
2070 : if (rc == WAIT_FAILED)
2071 : elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",
2072 : GetLastError());
2073 : else if (rc == WAIT_TIMEOUT)
2074 : {
2075 : /* timeout exceeded */
2076 : return -1;
2077 : }
2078 :
2079 : if (rc == WAIT_OBJECT_0)
2080 : {
2081 : /* Service newly-arrived signals */
2082 : pgwin32_dispatch_queued_signals();
2083 : return 0; /* retry */
2084 : }
2085 :
2086 : /*
2087 : * With an offset of one, due to the always-present pgwin32_signal_event,
2088 : * the handle offset directly corresponds to a wait event.
2089 : */
2090 : cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1];
2091 :
2092 : for (;;)
2093 : {
2094 : int next_pos;
2095 : int count;
2096 :
2097 : occurred_events->pos = cur_event->pos;
2098 : occurred_events->user_data = cur_event->user_data;
2099 : occurred_events->events = 0;
2100 :
2101 : if (cur_event->events == WL_LATCH_SET)
2102 : {
2103 : /*
2104 : * We cannot use set->latch->event to reset the fired event if we
2105 : * aren't waiting on this latch now.
2106 : */
2107 : if (!ResetEvent(set->handles[cur_event->pos + 1]))
2108 : elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());
2109 :
2110 : if (set->latch && set->latch->is_set)
2111 : {
2112 : occurred_events->fd = PGINVALID_SOCKET;
2113 : occurred_events->events = WL_LATCH_SET;
2114 : occurred_events++;
2115 : returned_events++;
2116 : }
2117 : }
2118 : else if (cur_event->events == WL_POSTMASTER_DEATH)
2119 : {
2120 : /*
2121 : * Postmaster apparently died. Since the consequences of falsely
2122 : * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we
2123 : * take the trouble to positively verify this with
2124 : * PostmasterIsAlive(), even though there is no known reason to
2125 : * think that the event could be falsely set on Windows.
2126 : */
2127 : if (!PostmasterIsAliveInternal())
2128 : {
2129 : if (set->exit_on_postmaster_death)
2130 : proc_exit(1);
2131 : occurred_events->fd = PGINVALID_SOCKET;
2132 : occurred_events->events = WL_POSTMASTER_DEATH;
2133 : occurred_events++;
2134 : returned_events++;
2135 : }
2136 : }
2137 : else if (cur_event->events & WL_SOCKET_MASK)
2138 : {
2139 : WSANETWORKEVENTS resEvents;
2140 : HANDLE handle = set->handles[cur_event->pos + 1];
2141 :
2142 : Assert(cur_event->fd);
2143 :
2144 : occurred_events->fd = cur_event->fd;
2145 :
2146 : ZeroMemory(&resEvents, sizeof(resEvents));
2147 : if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0)
2148 : elog(ERROR, "failed to enumerate network events: error code %d",
2149 : WSAGetLastError());
2150 : if ((cur_event->events & WL_SOCKET_READABLE) &&
2151 : (resEvents.lNetworkEvents & FD_READ))
2152 : {
2153 : /* data available in socket */
2154 : occurred_events->events |= WL_SOCKET_READABLE;
2155 :
2156 : /*------
2157 : * WaitForMultipleObjects doesn't guarantee that a read event
2158 : * will be returned if the latch is set at the same time. Even
2159 : * if it did, the caller might drop that event expecting it to
2160 : * reoccur on next call. So, we must force the event to be
2161 : * reset if this WaitEventSet is used again in order to avoid
2162 : * an indefinite hang.
2163 : *
2164 : * Refer to
2165 : * https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx
2166 : * for the behavior of socket events.
2167 : *------
2168 : */
2169 : cur_event->reset = true;
2170 : }
2171 : if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
2172 : (resEvents.lNetworkEvents & FD_WRITE))
2173 : {
2174 : /* writeable */
2175 : occurred_events->events |= WL_SOCKET_WRITEABLE;
2176 : }
2177 : if ((cur_event->events & WL_SOCKET_CONNECTED) &&
2178 : (resEvents.lNetworkEvents & FD_CONNECT))
2179 : {
2180 : /* connected */
2181 : occurred_events->events |= WL_SOCKET_CONNECTED;
2182 : }
2183 : if ((cur_event->events & WL_SOCKET_ACCEPT) &&
2184 : (resEvents.lNetworkEvents & FD_ACCEPT))
2185 : {
2186 : /* incoming connection could be accepted */
2187 : occurred_events->events |= WL_SOCKET_ACCEPT;
2188 : }
2189 : if (resEvents.lNetworkEvents & FD_CLOSE)
2190 : {
2191 : /* EOF/error, so signal all caller-requested socket flags */
2192 : occurred_events->events |= (cur_event->events & WL_SOCKET_MASK);
2193 : }
2194 :
2195 : if (occurred_events->events != 0)
2196 : {
2197 : occurred_events++;
2198 : returned_events++;
2199 : }
2200 : }
2201 :
2202 : /* Is the output buffer full? */
2203 : if (returned_events == nevents)
2204 : break;
2205 :
2206 : /* Have we run out of possible events? */
2207 : next_pos = cur_event->pos + 1;
2208 : if (next_pos == set->nevents)
2209 : break;
2210 :
2211 : /*
2212 : * Poll the rest of the event handles in the array starting at
2213 : * next_pos, being careful to skip over the initial signal handle too.
2214 : * This time we use a zero timeout.
2215 : */
2216 : count = set->nevents - next_pos;
2217 : rc = WaitForMultipleObjects(count,
2218 : set->handles + 1 + next_pos,
2219 : false,
2220 : 0);
2221 :
2222 : /*
2223 : * We don't distinguish between errors and WAIT_TIMEOUT here because
2224 : * we already have events to report.
2225 : */
2226 : if (rc < WAIT_OBJECT_0 || rc >= WAIT_OBJECT_0 + count)
2227 : break;
2228 :
2229 : /* We have another event to decode. */
2230 : cur_event = &set->events[next_pos + (rc - WAIT_OBJECT_0)];
2231 : }
2232 :
2233 : return returned_events;
2234 : }
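/*
 * A minimal standalone sketch (not part of latch.c) of the association the
 * implementation above depends on: WSAEventSelect() ties a socket's network
 * events to an event handle that WaitForMultipleObjects() can wait on.
 * attach_socket_event() is a hypothetical name.
 */
#include <winsock2.h>

static HANDLE
attach_socket_event(SOCKET s)
{
	WSAEVENT	ev = WSACreateEvent();

	if (ev == WSA_INVALID_EVENT)
		return NULL;

	/* Register the network events of interest with this handle. */
	if (WSAEventSelect(s, ev, FD_READ | FD_WRITE | FD_CLOSE) != 0)
	{
		WSACloseEvent(ev);
		return NULL;
	}

	/* The handle can now go into the array WaitForMultipleObjects() scans. */
	return (HANDLE) ev;
}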
2235 : #endif
2236 :
2237 : /*
2238 : * Return whether the current build options can report WL_SOCKET_CLOSED.
2239 : */
2240 : bool
2241 1982 : WaitEventSetCanReportClosed(void)
2242 : {
2243 : #if (defined(WAIT_USE_POLL) && defined(POLLRDHUP)) || \
2244 : defined(WAIT_USE_EPOLL) || \
2245 : defined(WAIT_USE_KQUEUE)
2246 1982 : return true;
2247 : #else
2248 : return false;
2249 : #endif
2250 : }
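/*
 * A minimal sketch of how a caller might use this capability check; the
 * function name is hypothetical, but WL_SOCKET_READABLE and
 * WL_SOCKET_CLOSED come from storage/latch.h.
 */
static int
choose_socket_events(void)
{
	if (WaitEventSetCanReportClosed())
		return WL_SOCKET_READABLE | WL_SOCKET_CLOSED;

	/* No EOF detection here; the caller must notice EOF via read(). */
	return WL_SOCKET_READABLE;
}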
2251 :
2252 : /*
2253 : * Get the number of wait events registered in a given WaitEventSet.
2254 : */
2255 : int
2256 220 : GetNumRegisteredWaitEvents(WaitEventSet *set)
2257 : {
2258 220 : return set->nevents;
2259 : }
2260 :
2261 : #if defined(WAIT_USE_SELF_PIPE)
2262 :
2263 : /*
2264 : * SetLatch uses SIGURG to wake up the process waiting on the latch.
2265 : *
2266 : * Wake up WaitLatch, if we're waiting.
2267 : */
2268 : static void
2269 : latch_sigurg_handler(SIGNAL_ARGS)
2270 : {
2271 : if (waiting)
2272 : sendSelfPipeByte();
2273 : }
2274 :
2275 : /* Send one byte to the self-pipe, to wake up WaitLatch */
2276 : static void
2277 : sendSelfPipeByte(void)
2278 : {
2279 : int rc;
2280 : char dummy = 0;
2281 :
2282 : retry:
2283 : rc = write(selfpipe_writefd, &dummy, 1);
2284 : if (rc < 0)
2285 : {
2286 : /* If interrupted by signal, just retry */
2287 : if (errno == EINTR)
2288 : goto retry;
2289 :
2290 : /*
2291 : * If the pipe is full, we don't need to retry, the data that's there
2292 : * already is enough to wake up WaitLatch.
2293 : */
2294 : if (errno == EAGAIN || errno == EWOULDBLOCK)
2295 : return;
2296 :
2297 : /*
2298 : * Oops, the write() failed for some other reason. We might be in a
2299 : * signal handler, so it's not safe to elog(). We have no choice but
2300 : * to silently ignore the error.
2301 : */
2302 : return;
2303 : }
2304 : }
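/*
 * A minimal standalone sketch (not part of latch.c) of self-pipe setup:
 * both ends must be non-blocking so that sendSelfPipeByte() can never hang
 * inside a signal handler, and close-on-exec in a forking server.
 * make_self_pipe() is a hypothetical name; latch.c performs the real
 * initialization elsewhere.
 */
#include <fcntl.h>
#include <unistd.h>

static int
make_self_pipe(int *read_fd, int *write_fd)
{
	int			fds[2];

	if (pipe(fds) < 0)
		return -1;

	if (fcntl(fds[0], F_SETFL, O_NONBLOCK) < 0 ||
		fcntl(fds[1], F_SETFL, O_NONBLOCK) < 0 ||
		fcntl(fds[0], F_SETFD, FD_CLOEXEC) < 0 ||
		fcntl(fds[1], F_SETFD, FD_CLOEXEC) < 0)
	{
		close(fds[0]);
		close(fds[1]);
		return -1;
	}

	*read_fd = fds[0];
	*write_fd = fds[1];
	return 0;
}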
2305 :
2306 : #endif
2307 :
2308 : #if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)
2309 :
2310 : /*
2311 : * Read all available data from self-pipe or signalfd.
2312 : *
2313 : * Note: this is only called when waiting = true. If it throws an error
2314 : * instead of returning, it must reset that flag first (though ideally,
2315 : * this will never happen).
2316 : */
2317 : static void
2318 712060 : drain(void)
2319 : {
2320 : char buf[1024];
2321 : int rc;
2322 : int fd;
2323 :
2324 : #ifdef WAIT_USE_SELF_PIPE
2325 : fd = selfpipe_readfd;
2326 : #else
2327 712060 : fd = signal_fd;
2328 : #endif
2329 :
2330 : for (;;)
2331 : {
2332 712060 : rc = read(fd, buf, sizeof(buf));
2333 712060 : if (rc < 0)
2334 : {
2335 0 : if (errno == EAGAIN || errno == EWOULDBLOCK)
2336 : break; /* the descriptor is empty */
2337 0 : else if (errno == EINTR)
2338 0 : continue; /* retry */
2339 : else
2340 : {
2341 0 : waiting = false;
2342 : #ifdef WAIT_USE_SELF_PIPE
2343 : elog(ERROR, "read() on self-pipe failed: %m");
2344 : #else
2345 0 : elog(ERROR, "read() on signalfd failed: %m");
2346 : #endif
2347 : }
2348 : }
2349 712060 : else if (rc == 0)
2350 : {
2351 0 : waiting = false;
2352 : #ifdef WAIT_USE_SELF_PIPE
2353 : elog(ERROR, "unexpected EOF on self-pipe");
2354 : #else
2355 0 : elog(ERROR, "unexpected EOF on signalfd");
2356 : #endif
2357 : }
2358 712060 : else if (rc < sizeof(buf))
2359 : {
2360 : /* we successfully drained the pipe; no need to read() again */
2361 712060 : break;
2362 : }
2363 : /* else buffer wasn't big enough, so read again */
2364 : }
2365 712060 : }
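/*
 * A minimal standalone sketch (not part of latch.c) of signalfd setup: the
 * signal is blocked and routed to a non-blocking descriptor, which is what
 * lets drain() read until EAGAIN without sleeping.  make_sigurg_fd() is a
 * hypothetical name; latch.c performs the real setup during latch
 * initialization.
 */
#include <signal.h>
#include <sys/signalfd.h>

static int
make_sigurg_fd(void)
{
	sigset_t	mask;

	sigemptyset(&mask);
	sigaddset(&mask, SIGURG);

	/* Keep SIGURG blocked so it is consumed only via the descriptor. */
	if (sigprocmask(SIG_BLOCK, &mask, NULL) < 0)
		return -1;

	return signalfd(-1, &mask, SFD_NONBLOCK | SFD_CLOEXEC);
}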
2366 :
2367 : #endif
2368 :
2369 : static void
2370 2 : ResOwnerReleaseWaitEventSet(Datum res)
2371 : {
2372 2 : WaitEventSet *set = (WaitEventSet *) DatumGetPointer(res);
2373 :
2374 : Assert(set->owner != NULL);
2375 2 : set->owner = NULL;
2376 2 : FreeWaitEventSet(set);
2377 2 : }
|