Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * latch.c
4 : * Routines for inter-process latches
5 : *
6 : * The poll() implementation uses the so-called self-pipe trick to overcome the
7 : * race condition involved with poll() and setting a global flag in the signal
8 : * handler. When a latch is set and the current process is waiting for it, the
9 : * signal handler wakes up the poll() in WaitLatch by writing a byte to a pipe.
10 : * A signal by itself doesn't interrupt poll() on all platforms, and even on
11 : * platforms where it does, a signal that arrives just before the poll() call
12 : * does not prevent poll() from entering sleep. An incoming byte on a pipe
13 : * however reliably interrupts the sleep, and causes poll() to return
14 : * immediately even if the signal arrives before poll() begins.
15 : *
16 : * The epoll() implementation overcomes the race with a different technique: it
17 : * keeps SIGURG blocked and consumes from a signalfd() descriptor instead. We
18 : * don't need to register a signal handler or create our own self-pipe. We
19 : * assume that any system that has Linux epoll() also has Linux signalfd().
20 : *
21 : * The kqueue() implementation waits for SIGURG with EVFILT_SIGNAL.
22 : *
23 : * The Windows implementation uses Windows events that are inherited by all
24 : * postmaster child processes. There's no need for the self-pipe trick there.
25 : *
26 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
27 : * Portions Copyright (c) 1994, Regents of the University of California
28 : *
29 : * IDENTIFICATION
30 : * src/backend/storage/ipc/latch.c
31 : *
32 : *-------------------------------------------------------------------------
33 : */
34 : #include "postgres.h"
35 :
36 : #include <fcntl.h>
37 : #include <limits.h>
38 : #include <signal.h>
39 : #include <unistd.h>
40 : #ifdef HAVE_SYS_EPOLL_H
41 : #include <sys/epoll.h>
42 : #endif
43 : #ifdef HAVE_SYS_EVENT_H
44 : #include <sys/event.h>
45 : #endif
46 : #ifdef HAVE_SYS_SIGNALFD_H
47 : #include <sys/signalfd.h>
48 : #endif
49 : #ifdef HAVE_POLL_H
50 : #include <poll.h>
51 : #endif
52 :
53 : #include "libpq/pqsignal.h"
54 : #include "miscadmin.h"
55 : #include "pgstat.h"
56 : #include "port/atomics.h"
57 : #include "portability/instr_time.h"
58 : #include "postmaster/postmaster.h"
59 : #include "storage/fd.h"
60 : #include "storage/ipc.h"
61 : #include "storage/latch.h"
62 : #include "storage/pmsignal.h"
63 : #include "utils/memutils.h"
64 : #include "utils/resowner.h"
65 :
66 : /*
67 : * Select the fd readiness primitive to use. Normally the "most modern"
68 : * primitive supported by the OS will be used, but for testing it can be
69 : * useful to manually specify the used primitive. If desired, just add a
70 : * define somewhere before this block.
71 : */
72 : #if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \
73 : defined(WAIT_USE_KQUEUE) || defined(WAIT_USE_WIN32)
74 : /* don't overwrite manual choice */
75 : #elif defined(HAVE_SYS_EPOLL_H)
76 : #define WAIT_USE_EPOLL
77 : #elif defined(HAVE_KQUEUE)
78 : #define WAIT_USE_KQUEUE
79 : #elif defined(HAVE_POLL)
80 : #define WAIT_USE_POLL
81 : #elif WIN32
82 : #define WAIT_USE_WIN32
83 : #else
84 : #error "no wait set implementation available"
85 : #endif
86 :
87 : /*
88 : * By default, we use a self-pipe with poll() and a signalfd with epoll(), if
89 : * available. For testing the choice can also be manually specified.
90 : */
91 : #if defined(WAIT_USE_POLL) || defined(WAIT_USE_EPOLL)
92 : #if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)
93 : /* don't overwrite manual choice */
94 : #elif defined(WAIT_USE_EPOLL) && defined(HAVE_SYS_SIGNALFD_H)
95 : #define WAIT_USE_SIGNALFD
96 : #else
97 : #define WAIT_USE_SELF_PIPE
98 : #endif
99 : #endif
100 :
/* typedef in latch.h */
struct WaitEventSet
{
	/* ResourceOwner that tracks this set, or NULL for session lifetime */
	ResourceOwner owner;

	int			nevents;		/* number of registered events */
	int			nevents_space;	/* maximum number of events in this set */

	/*
	 * Array, of nevents_space length, storing the definition of events this
	 * set is waiting for.
	 */
	WaitEvent  *events;

	/*
	 * If WL_LATCH_SET is specified in any wait event, latch is a pointer to
	 * said latch, and latch_pos the offset in the ->events array. This is
	 * useful because we check the state of the latch before performing
	 * syscalls related to waiting.
	 */
	Latch	   *latch;
	int			latch_pos;

	/*
	 * WL_EXIT_ON_PM_DEATH is converted to WL_POSTMASTER_DEATH, but this flag
	 * is set so that we'll exit immediately if postmaster death is detected,
	 * instead of returning.
	 */
	bool		exit_on_postmaster_death;

#if defined(WAIT_USE_EPOLL)
	int			epoll_fd;
	/* epoll_wait returns events in a user provided array, allocate once */
	struct epoll_event *epoll_ret_events;
#elif defined(WAIT_USE_KQUEUE)
	int			kqueue_fd;
	/* kevent returns events in a user provided array, allocate once */
	struct kevent *kqueue_ret_events;
	/* true if a kevent reported the postmaster process as exited */
	bool		report_postmaster_not_running;
#elif defined(WAIT_USE_POLL)
	/* poll expects events to be waited on every poll() call, prepare once */
	struct pollfd *pollfds;
#elif defined(WAIT_USE_WIN32)

	/*
	 * Array of windows events. The first element always contains
	 * pgwin32_signal_event, so the remaining elements are offset by one (i.e.
	 * event->pos + 1).
	 */
	HANDLE	   *handles;
#endif
};
153 :
154 : /* A common WaitEventSet used to implement WaitLatch() */
155 : static WaitEventSet *LatchWaitSet;
156 :
157 : /* The position of the latch in LatchWaitSet. */
158 : #define LatchWaitSetLatchPos 0
159 :
160 : #ifndef WIN32
161 : /* Are we currently in WaitLatch? The signal handler would like to know. */
162 : static volatile sig_atomic_t waiting = false;
163 : #endif
164 :
165 : #ifdef WAIT_USE_SIGNALFD
166 : /* On Linux, we'll receive SIGURG via a signalfd file descriptor. */
167 : static int signal_fd = -1;
168 : #endif
169 :
170 : #ifdef WAIT_USE_SELF_PIPE
171 : /* Read and write ends of the self-pipe */
172 : static int selfpipe_readfd = -1;
173 : static int selfpipe_writefd = -1;
174 :
175 : /* Process owning the self-pipe --- needed for checking purposes */
176 : static int selfpipe_owner_pid = 0;
177 :
178 : /* Private function prototypes */
179 : static void latch_sigurg_handler(SIGNAL_ARGS);
180 : static void sendSelfPipeByte(void);
181 : #endif
182 :
183 : #if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)
184 : static void drain(void);
185 : #endif
186 :
187 : #if defined(WAIT_USE_EPOLL)
188 : static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action);
189 : #elif defined(WAIT_USE_KQUEUE)
190 : static void WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events);
191 : #elif defined(WAIT_USE_POLL)
192 : static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event);
193 : #elif defined(WAIT_USE_WIN32)
194 : static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event);
195 : #endif
196 :
197 : static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
198 : WaitEvent *occurred_events, int nevents);
199 :
200 : /* ResourceOwner support to hold WaitEventSets */
201 : static void ResOwnerReleaseWaitEventSet(Datum res);
202 :
203 : static const ResourceOwnerDesc wait_event_set_resowner_desc =
204 : {
205 : .name = "WaitEventSet",
206 : .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
207 : .release_priority = RELEASE_PRIO_WAITEVENTSETS,
208 : .ReleaseResource = ResOwnerReleaseWaitEventSet,
209 : .DebugPrint = NULL
210 : };
211 :
/* Convenience wrappers over ResourceOwnerRemember/Forget */

/* Register 'set' with 'owner' so it is released at resource-owner cleanup. */
static inline void
ResourceOwnerRememberWaitEventSet(ResourceOwner owner, WaitEventSet *set)
{
	ResourceOwnerRemember(owner, PointerGetDatum(set), &wait_event_set_resowner_desc);
}
/* Deregister 'set' from 'owner'; used when the set is freed explicitly. */
static inline void
ResourceOwnerForgetWaitEventSet(ResourceOwner owner, WaitEventSet *set)
{
	ResourceOwnerForget(owner, PointerGetDatum(set), &wait_event_set_resowner_desc);
}
223 :
224 :
/*
 * Initialize the process-local latch infrastructure.
 *
 * This must be called once during startup of any process that can wait on
 * latches, before it issues any InitLatch() or OwnLatch() calls.
 *
 * Depending on the wakeup primitive compiled in, this sets up either a
 * self-pipe plus SIGURG handler, a signalfd for SIGURG, or (kqueue) simply
 * ignores SIGURG so it can be consumed via EVFILT_SIGNAL.
 */
void
InitializeLatchSupport(void)
{
#if defined(WAIT_USE_SELF_PIPE)
	int			pipefd[2];

	if (IsUnderPostmaster)
	{
		/*
		 * We might have inherited connections to a self-pipe created by the
		 * postmaster. It's critical that child processes create their own
		 * self-pipes, of course, and we really want them to close the
		 * inherited FDs for safety's sake.
		 */
		if (selfpipe_owner_pid != 0)
		{
			/* Assert we go through here but once in a child process */
			Assert(selfpipe_owner_pid != MyProcPid);
			/* Release postmaster's pipe FDs; ignore any error */
			(void) close(selfpipe_readfd);
			(void) close(selfpipe_writefd);
			/* Clean up, just for safety's sake; we'll set these below */
			selfpipe_readfd = selfpipe_writefd = -1;
			selfpipe_owner_pid = 0;
			/* Keep fd.c's accounting straight */
			ReleaseExternalFD();
			ReleaseExternalFD();
		}
		else
		{
			/*
			 * Postmaster didn't create a self-pipe ... or else we're in an
			 * EXEC_BACKEND build, in which case it doesn't matter since the
			 * postmaster's pipe FDs were closed by the action of FD_CLOEXEC.
			 * fd.c won't have state to clean up, either.
			 */
			Assert(selfpipe_readfd == -1);
		}
	}
	else
	{
		/* In postmaster or standalone backend, assert we do this but once */
		Assert(selfpipe_readfd == -1);
		Assert(selfpipe_owner_pid == 0);
	}

	/*
	 * Set up the self-pipe that allows a signal handler to wake up the
	 * poll()/epoll_wait() in WaitLatch. Make the write-end non-blocking, so
	 * that SetLatch won't block if the event has already been set many times
	 * filling the kernel buffer. Make the read-end non-blocking too, so that
	 * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK.
	 * Also, make both FDs close-on-exec, since we surely do not want any
	 * child processes messing with them.
	 */
	if (pipe(pipefd) < 0)
		elog(FATAL, "pipe() failed: %m");
	if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1)
		elog(FATAL, "fcntl(F_SETFL) failed on read-end of self-pipe: %m");
	if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1)
		elog(FATAL, "fcntl(F_SETFL) failed on write-end of self-pipe: %m");
	if (fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) == -1)
		elog(FATAL, "fcntl(F_SETFD) failed on read-end of self-pipe: %m");
	if (fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) == -1)
		elog(FATAL, "fcntl(F_SETFD) failed on write-end of self-pipe: %m");

	selfpipe_readfd = pipefd[0];
	selfpipe_writefd = pipefd[1];
	selfpipe_owner_pid = MyProcPid;

	/* Tell fd.c about these two long-lived FDs */
	ReserveExternalFD();
	ReserveExternalFD();

	pqsignal(SIGURG, latch_sigurg_handler);
#endif

#ifdef WAIT_USE_SIGNALFD
	sigset_t	signalfd_mask;

	if (IsUnderPostmaster)
	{
		/*
		 * It would probably be safe to re-use the inherited signalfd since
		 * signalfds only see the current process's pending signals, but it
		 * seems less surprising to close it and create our own.
		 */
		if (signal_fd != -1)
		{
			/* Release postmaster's signal FD; ignore any error */
			(void) close(signal_fd);
			signal_fd = -1;
			ReleaseExternalFD();
		}
	}

	/* Block SIGURG, because we'll receive it through a signalfd. */
	sigaddset(&UnBlockSig, SIGURG);

	/* Set up the signalfd to receive SIGURG notifications. */
	sigemptyset(&signalfd_mask);
	sigaddset(&signalfd_mask, SIGURG);
	signal_fd = signalfd(-1, &signalfd_mask, SFD_NONBLOCK | SFD_CLOEXEC);
	if (signal_fd < 0)
		elog(FATAL, "signalfd() failed");
	ReserveExternalFD();
#endif

#ifdef WAIT_USE_KQUEUE
	/* Ignore SIGURG, because we'll receive it via kqueue. */
	pqsignal(SIGURG, SIG_IGN);
#endif
}
/*
 * Create the long-lived WaitEventSet used by WaitLatch().
 *
 * The latch event must land at position LatchWaitSetLatchPos (0), since
 * WaitLatch() modifies it by position; the Assert below enforces that.
 */
void
InitializeLatchWaitSet(void)
{
	int			latch_pos PG_USED_FOR_ASSERTS_ONLY;

	Assert(LatchWaitSet == NULL);

	/* Set up the WaitEventSet used by WaitLatch(). */
	LatchWaitSet = CreateWaitEventSet(NULL, 2);
	latch_pos = AddWaitEventToSet(LatchWaitSet, WL_LATCH_SET, PGINVALID_SOCKET,
								  MyLatch, NULL);
	/* The postmaster itself has no postmaster to watch for. */
	if (IsUnderPostmaster)
		AddWaitEventToSet(LatchWaitSet, WL_EXIT_ON_PM_DEATH,
						  PGINVALID_SOCKET, NULL, NULL);

	Assert(latch_pos == LatchWaitSetLatchPos);
}
362 :
363 : void
364 0 : ShutdownLatchSupport(void)
365 : {
366 : #if defined(WAIT_USE_POLL)
367 : pqsignal(SIGURG, SIG_IGN);
368 : #endif
369 :
370 0 : if (LatchWaitSet)
371 : {
372 0 : FreeWaitEventSet(LatchWaitSet);
373 0 : LatchWaitSet = NULL;
374 : }
375 :
376 : #if defined(WAIT_USE_SELF_PIPE)
377 : close(selfpipe_readfd);
378 : close(selfpipe_writefd);
379 : selfpipe_readfd = -1;
380 : selfpipe_writefd = -1;
381 : selfpipe_owner_pid = InvalidPid;
382 : #endif
383 :
384 : #if defined(WAIT_USE_SIGNALFD)
385 0 : close(signal_fd);
386 0 : signal_fd = -1;
387 : #endif
388 0 : }
/*
 * Initialize a process-local latch.
 *
 * The latch is owned by the current process and cannot be set from other
 * processes (is_shared = false).
 */
void
InitLatch(Latch *latch)
{
	latch->is_set = false;
	latch->maybe_sleeping = false;
	latch->owner_pid = MyProcPid;
	latch->is_shared = false;

#if defined(WAIT_USE_SELF_PIPE)
	/* Assert InitializeLatchSupport has been called in this process */
	Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
#elif defined(WAIT_USE_SIGNALFD)
	/* Assert InitializeLatchSupport has been called in this process */
	Assert(signal_fd >= 0);
#elif defined(WAIT_USE_WIN32)
	/* Manual-reset event, initially unsignaled, not inherited by children */
	latch->event = CreateEvent(NULL, TRUE, FALSE, NULL);
	if (latch->event == NULL)
		elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
#endif							/* WIN32 */
}
/*
 * Initialize a shared latch that can be set from other processes. The latch
 * is initially owned by no-one; use OwnLatch to associate it with the
 * current process.
 *
 * InitSharedLatch needs to be called in postmaster before forking child
 * processes, usually right after allocating the shared memory block
 * containing the latch with ShmemInitStruct. (The Unix implementation
 * doesn't actually require that, but the Windows one does.) Because of
 * this restriction, we have no concurrency issues to worry about here.
 *
 * Note that other handles created in this module are never marked as
 * inheritable. Thus we do not need to worry about cleaning up child
 * process references to postmaster-private latches or WaitEventSets.
 */
void
InitSharedLatch(Latch *latch)
{
#ifdef WIN32
	SECURITY_ATTRIBUTES sa;

	/*
	 * Set up security attributes to specify that the events are inherited.
	 */
	ZeroMemory(&sa, sizeof(sa));
	sa.nLength = sizeof(sa);
	sa.bInheritHandle = TRUE;

	latch->event = CreateEvent(&sa, TRUE, FALSE, NULL);
	if (latch->event == NULL)
		elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
#endif

	latch->is_set = false;
	latch->maybe_sleeping = false;
	latch->owner_pid = 0;		/* unowned until OwnLatch() */
	latch->is_shared = true;
}
/*
 * Associate a shared latch with the current process, allowing it to
 * wait on the latch.
 *
 * Although there is a sanity check for latch-already-owned, we don't do
 * any sort of locking here, meaning that we could fail to detect the error
 * if two processes try to own the same latch at about the same time. If
 * there is any risk of that, caller must provide an interlock to prevent it.
 */
void
OwnLatch(Latch *latch)
{
	int			owner_pid;

	/* Sanity checks */
	Assert(latch->is_shared);

#if defined(WAIT_USE_SELF_PIPE)
	/* Assert InitializeLatchSupport has been called in this process */
	Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
#elif defined(WAIT_USE_SIGNALFD)
	/* Assert InitializeLatchSupport has been called in this process */
	Assert(signal_fd >= 0);
#endif

	/* Fetch once so check and error message agree despite concurrent writes */
	owner_pid = latch->owner_pid;
	if (owner_pid != 0)
		elog(PANIC, "latch already owned by PID %d", owner_pid);

	latch->owner_pid = MyProcPid;
}
/*
 * Disown a shared latch currently owned by the current process.
 *
 * After this, the latch may be owned by another process via OwnLatch().
 */
void
DisownLatch(Latch *latch)
{
	Assert(latch->is_shared);
	Assert(latch->owner_pid == MyProcPid);

	latch->owner_pid = 0;
}
/*
 * Wait for a given latch to be set, or for postmaster death, or until timeout
 * is exceeded. 'wakeEvents' is a bitmask that specifies which of those events
 * to wait for. If the latch is already set (and WL_LATCH_SET is given), the
 * function returns immediately.
 *
 * The "timeout" is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag
 * is given. Although it is declared as "long", we don't actually support
 * timeouts longer than INT_MAX milliseconds. Note that some extra overhead
 * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible.
 *
 * The latch must be owned by the current process, ie. it must be a
 * process-local latch initialized with InitLatch, or a shared latch
 * associated with the current process by calling OwnLatch.
 *
 * Returns bit mask indicating which condition(s) caused the wake-up. Note
 * that if multiple wake-up conditions are true, there is no guarantee that
 * we return all of them in one call, but we will return at least one.
 */
int
WaitLatch(Latch *latch, int wakeEvents, long timeout,
		  uint32 wait_event_info)
{
	WaitEvent	event;

	/* Postmaster-managed callers must handle postmaster death somehow. */
	Assert(!IsUnderPostmaster ||
		   (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
		   (wakeEvents & WL_POSTMASTER_DEATH));

	/*
	 * Some callers may have a latch other than MyLatch, or no latch at all,
	 * or want to handle postmaster death differently. It's cheap to assign
	 * those, so just do it every time.
	 */
	if (!(wakeEvents & WL_LATCH_SET))
		latch = NULL;
	ModifyWaitEvent(LatchWaitSet, LatchWaitSetLatchPos, WL_LATCH_SET, latch);
	LatchWaitSet->exit_on_postmaster_death =
		((wakeEvents & WL_EXIT_ON_PM_DEATH) != 0);

	/* Negative timeout means wait forever unless WL_TIMEOUT was requested */
	if (WaitEventSetWait(LatchWaitSet,
						 (wakeEvents & WL_TIMEOUT) ? timeout : -1,
						 &event, 1,
						 wait_event_info) == 0)
		return WL_TIMEOUT;
	else
		return event.events;
}
/*
 * Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
 * conditions.
 *
 * When waiting on a socket, EOF and error conditions always cause the socket
 * to be reported as readable/writable/connected, so that the caller can deal
 * with the condition.
 *
 * wakeEvents must include either WL_EXIT_ON_PM_DEATH for automatic exit
 * if the postmaster dies or WL_POSTMASTER_DEATH for a flag set in the
 * return value if the postmaster dies. The latter is useful for rare cases
 * where some behavior other than immediate exit is needed.
 *
 * NB: These days this is just a wrapper around the WaitEventSet API. When
 * using a latch very frequently, consider creating a longer living
 * WaitEventSet instead; that's more efficient.
 */
int
WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock,
				  long timeout, uint32 wait_event_info)
{
	int			ret = 0;
	int			rc;
	WaitEvent	event;
	/* Transient set: at most latch + postmaster-death + socket = 3 events */
	WaitEventSet *set = CreateWaitEventSet(CurrentResourceOwner, 3);

	if (wakeEvents & WL_TIMEOUT)
		Assert(timeout >= 0);
	else
		timeout = -1;

	if (wakeEvents & WL_LATCH_SET)
		AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
						  latch, NULL);

	/* Postmaster-managed callers must handle postmaster death somehow. */
	Assert(!IsUnderPostmaster ||
		   (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
		   (wakeEvents & WL_POSTMASTER_DEATH));

	if ((wakeEvents & WL_POSTMASTER_DEATH) && IsUnderPostmaster)
		AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
						  NULL, NULL);

	if ((wakeEvents & WL_EXIT_ON_PM_DEATH) && IsUnderPostmaster)
		AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
						  NULL, NULL);

	if (wakeEvents & WL_SOCKET_MASK)
	{
		int			ev;

		ev = wakeEvents & WL_SOCKET_MASK;
		AddWaitEventToSet(set, ev, sock, NULL, NULL);
	}

	rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);

	if (rc == 0)
		ret |= WL_TIMEOUT;
	else
	{
		/* Report only the event classes the caller can have asked for */
		ret |= event.events & (WL_LATCH_SET |
							   WL_POSTMASTER_DEATH |
							   WL_SOCKET_MASK);
	}

	FreeWaitEventSet(set);

	return ret;
}
/*
 * Sets a latch and wakes up anyone waiting on it.
 *
 * This is cheap if the latch is already set, otherwise not so much.
 *
 * NB: when calling this in a signal handler, be sure to save and restore
 * errno around it. (That's standard practice in most signal handlers, of
 * course, but we used to omit it in handlers that only set a flag.)
 *
 * NB: this function is called from critical sections and signal handlers so
 * throwing an error is not a good idea.
 */
void
SetLatch(Latch *latch)
{
#ifndef WIN32
	pid_t		owner_pid;
#else
	HANDLE		handle;
#endif

	/*
	 * The memory barrier has to be placed here to ensure that any flag
	 * variables possibly changed by this process have been flushed to main
	 * memory, before we check/set is_set.
	 */
	pg_memory_barrier();

	/* Quick exit if already set */
	if (latch->is_set)
		return;

	latch->is_set = true;

	/* Pair with the barrier in ResetLatch/waiting code before checking it */
	pg_memory_barrier();
	if (!latch->maybe_sleeping)
		return;

#ifndef WIN32

	/*
	 * See if anyone's waiting for the latch. It can be the current process if
	 * we're in a signal handler. We use the self-pipe or SIGURG to ourselves
	 * to wake up WaitEventSetWaitBlock() without races in that case. If it's
	 * another process, send a signal.
	 *
	 * Fetch owner_pid only once, in case the latch is concurrently getting
	 * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't
	 * guaranteed to be true! In practice, the effective range of pid_t fits
	 * in a 32 bit integer, and so should be atomic. In the worst case, we
	 * might end up signaling the wrong process. Even then, you're very
	 * unlucky if a process with that bogus pid exists and belongs to
	 * Postgres; and PG database processes should handle excess SIGUSR1
	 * interrupts without a problem anyhow.
	 *
	 * Another sort of race condition that's possible here is for a new
	 * process to own the latch immediately after we look, so we don't signal
	 * it. This is okay so long as all callers of ResetLatch/WaitLatch follow
	 * the standard coding convention of waiting at the bottom of their loops,
	 * not the top, so that they'll correctly process latch-setting events
	 * that happen before they enter the loop.
	 */
	owner_pid = latch->owner_pid;
	if (owner_pid == 0)
		return;
	else if (owner_pid == MyProcPid)
	{
#if defined(WAIT_USE_SELF_PIPE)
		if (waiting)
			sendSelfPipeByte();
#else
		if (waiting)
			kill(MyProcPid, SIGURG);
#endif
	}
	else
		kill(owner_pid, SIGURG);

#else

	/*
	 * See if anyone's waiting for the latch. It can be the current process if
	 * we're in a signal handler.
	 *
	 * Use a local variable here just in case somebody changes the event field
	 * concurrently (which really should not happen).
	 */
	handle = latch->event;
	if (handle)
	{
		SetEvent(handle);

		/*
		 * Note that we silently ignore any errors. We might be in a signal
		 * handler or other critical path where it's not safe to call elog().
		 */
	}
#endif
}
/*
 * Clear the latch. Calling WaitLatch after this will sleep, unless
 * the latch is set again before the WaitLatch call.
 */
void
ResetLatch(Latch *latch)
{
	/* Only the owner should reset the latch */
	Assert(latch->owner_pid == MyProcPid);
	Assert(latch->maybe_sleeping == false);

	latch->is_set = false;

	/*
	 * Ensure that the write to is_set gets flushed to main memory before we
	 * examine any flag variables. Otherwise a concurrent SetLatch might
	 * falsely conclude that it needn't signal us, even though we have missed
	 * seeing some flag updates that SetLatch was supposed to inform us of.
	 */
	pg_memory_barrier();
}
740 :
741 : /*
742 : * Create a WaitEventSet with space for nevents different events to wait for.
743 : *
744 : * These events can then be efficiently waited upon together, using
745 : * WaitEventSetWait().
746 : *
747 : * The WaitEventSet is tracked by the given 'resowner'. Use NULL for session
748 : * lifetime.
749 : */
750 : WaitEventSet *
751 223774 : CreateWaitEventSet(ResourceOwner resowner, int nevents)
752 : {
753 : WaitEventSet *set;
754 : char *data;
755 223774 : Size sz = 0;
756 :
757 : /*
758 : * Use MAXALIGN size/alignment to guarantee that later uses of memory are
759 : * aligned correctly. E.g. epoll_event might need 8 byte alignment on some
760 : * platforms, but earlier allocations like WaitEventSet and WaitEvent
761 : * might not be sized to guarantee that when purely using sizeof().
762 : */
763 223774 : sz += MAXALIGN(sizeof(WaitEventSet));
764 223774 : sz += MAXALIGN(sizeof(WaitEvent) * nevents);
765 :
766 : #if defined(WAIT_USE_EPOLL)
767 223774 : sz += MAXALIGN(sizeof(struct epoll_event) * nevents);
768 : #elif defined(WAIT_USE_KQUEUE)
769 : sz += MAXALIGN(sizeof(struct kevent) * nevents);
770 : #elif defined(WAIT_USE_POLL)
771 : sz += MAXALIGN(sizeof(struct pollfd) * nevents);
772 : #elif defined(WAIT_USE_WIN32)
773 : /* need space for the pgwin32_signal_event */
774 : sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1));
775 : #endif
776 :
777 223774 : if (resowner != NULL)
778 137000 : ResourceOwnerEnlarge(resowner);
779 :
780 223774 : data = (char *) MemoryContextAllocZero(TopMemoryContext, sz);
781 :
782 223774 : set = (WaitEventSet *) data;
783 223774 : data += MAXALIGN(sizeof(WaitEventSet));
784 :
785 223774 : set->events = (WaitEvent *) data;
786 223774 : data += MAXALIGN(sizeof(WaitEvent) * nevents);
787 :
788 : #if defined(WAIT_USE_EPOLL)
789 223774 : set->epoll_ret_events = (struct epoll_event *) data;
790 223774 : data += MAXALIGN(sizeof(struct epoll_event) * nevents);
791 : #elif defined(WAIT_USE_KQUEUE)
792 : set->kqueue_ret_events = (struct kevent *) data;
793 : data += MAXALIGN(sizeof(struct kevent) * nevents);
794 : #elif defined(WAIT_USE_POLL)
795 : set->pollfds = (struct pollfd *) data;
796 : data += MAXALIGN(sizeof(struct pollfd) * nevents);
797 : #elif defined(WAIT_USE_WIN32)
798 : set->handles = (HANDLE) data;
799 : data += MAXALIGN(sizeof(HANDLE) * nevents);
800 : #endif
801 :
802 223774 : set->latch = NULL;
803 223774 : set->nevents_space = nevents;
804 223774 : set->exit_on_postmaster_death = false;
805 :
806 223774 : if (resowner != NULL)
807 : {
808 137000 : ResourceOwnerRememberWaitEventSet(resowner, set);
809 137000 : set->owner = resowner;
810 : }
811 :
812 : #if defined(WAIT_USE_EPOLL)
813 223774 : if (!AcquireExternalFD())
814 : {
815 : /* treat this as though epoll_create1 itself returned EMFILE */
816 0 : elog(ERROR, "epoll_create1 failed: %m");
817 : }
818 223774 : set->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
819 223774 : if (set->epoll_fd < 0)
820 : {
821 0 : ReleaseExternalFD();
822 0 : elog(ERROR, "epoll_create1 failed: %m");
823 : }
824 : #elif defined(WAIT_USE_KQUEUE)
825 : if (!AcquireExternalFD())
826 : {
827 : /* treat this as though kqueue itself returned EMFILE */
828 : elog(ERROR, "kqueue failed: %m");
829 : }
830 : set->kqueue_fd = kqueue();
831 : if (set->kqueue_fd < 0)
832 : {
833 : ReleaseExternalFD();
834 : elog(ERROR, "kqueue failed: %m");
835 : }
836 : if (fcntl(set->kqueue_fd, F_SETFD, FD_CLOEXEC) == -1)
837 : {
838 : int save_errno = errno;
839 :
840 : close(set->kqueue_fd);
841 : ReleaseExternalFD();
842 : errno = save_errno;
843 : elog(ERROR, "fcntl(F_SETFD) failed on kqueue descriptor: %m");
844 : }
845 : set->report_postmaster_not_running = false;
846 : #elif defined(WAIT_USE_WIN32)
847 :
848 : /*
849 : * To handle signals while waiting, we need to add a win32 specific event.
850 : * We accounted for the additional event at the top of this routine. See
851 : * port/win32/signal.c for more details.
852 : *
853 : * Note: pgwin32_signal_event should be first to ensure that it will be
854 : * reported when multiple events are set. We want to guarantee that
855 : * pending signals are serviced.
856 : */
857 : set->handles[0] = pgwin32_signal_event;
858 : StaticAssertStmt(WSA_INVALID_EVENT == NULL, "");
859 : #endif
860 :
861 223774 : return set;
862 : }
/*
 * Free a previously created WaitEventSet.
 *
 * Note: preferably, this shouldn't have to free any resources that could be
 * inherited across an exec(). If it did, we'd likely leak those resources in
 * many scenarios. For the epoll case, we ensure that by setting EPOLL_CLOEXEC
 * when the FD is created. For the Windows case, we assume that the handles
 * involved are non-inheritable.
 */
void
FreeWaitEventSet(WaitEventSet *set)
{
	/* Detach from the owning ResourceOwner first, if any */
	if (set->owner)
	{
		ResourceOwnerForgetWaitEventSet(set->owner, set);
		set->owner = NULL;
	}

#if defined(WAIT_USE_EPOLL)
	close(set->epoll_fd);
	ReleaseExternalFD();
#elif defined(WAIT_USE_KQUEUE)
	close(set->kqueue_fd);
	ReleaseExternalFD();
#elif defined(WAIT_USE_WIN32)
	for (WaitEvent *cur_event = set->events;
		 cur_event < (set->events + set->nevents);
		 cur_event++)
	{
		if (cur_event->events & WL_LATCH_SET)
		{
			/* uses the latch's HANDLE */
		}
		else if (cur_event->events & WL_POSTMASTER_DEATH)
		{
			/* uses PostmasterHandle */
		}
		else
		{
			/* Clean up the event object we created for the socket */
			WSAEventSelect(cur_event->fd, NULL, 0);
			WSACloseEvent(set->handles[cur_event->pos + 1]);
		}
	}
#endif

	/* Events, return arrays, etc. live in the same single allocation */
	pfree(set);
}
912 :
913 : /*
914 : * Free a previously created WaitEventSet in a child process after a fork().
915 : */
916 : void
917 26970 : FreeWaitEventSetAfterFork(WaitEventSet *set)
918 : {
919 : #if defined(WAIT_USE_EPOLL)
920 26970 : close(set->epoll_fd);
921 26970 : ReleaseExternalFD();
922 : #elif defined(WAIT_USE_KQUEUE)
923 : /* kqueues are not normally inherited by child processes */
924 : ReleaseExternalFD();
925 : #endif
926 :
927 26970 : pfree(set);
928 26970 : }
929 :
930 : /* ---
931 : * Add an event to the set. Possible events are:
932 : * - WL_LATCH_SET: Wait for the latch to be set
933 : * - WL_POSTMASTER_DEATH: Wait for postmaster to die
934 : * - WL_SOCKET_READABLE: Wait for socket to become readable,
935 : * can be combined in one event with other WL_SOCKET_* events
936 : * - WL_SOCKET_WRITEABLE: Wait for socket to become writeable,
937 : * can be combined with other WL_SOCKET_* events
938 : * - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
939 : * can be combined with other WL_SOCKET_* events (on non-Windows
940 : * platforms, this is the same as WL_SOCKET_WRITEABLE)
941 : * - WL_SOCKET_ACCEPT: Wait for new connection to a server socket,
942 : * can be combined with other WL_SOCKET_* events (on non-Windows
943 : * platforms, this is the same as WL_SOCKET_READABLE)
944 : * - WL_SOCKET_CLOSED: Wait for socket to be closed by remote peer.
945 : * - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies
946 : *
947 : * Returns the offset in WaitEventSet->events (starting from 0), which can be
948 : * used to modify previously added wait events using ModifyWaitEvent().
949 : *
950 : * In the WL_LATCH_SET case the latch must be owned by the current process,
951 : * i.e. it must be a process-local latch initialized with InitLatch, or a
952 : * shared latch associated with the current process by calling OwnLatch.
953 : *
954 : * In the WL_SOCKET_READABLE/WRITEABLE/CONNECTED/ACCEPT cases, EOF and error
955 : * conditions cause the socket to be reported as readable/writable/connected,
956 : * so that the caller can deal with the condition.
957 : *
958 : * The user_data pointer specified here will be set for the events returned
959 : * by WaitEventSetWait(), allowing to easily associate additional data with
960 : * events.
961 : */
962 : int
963 636006 : AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
964 : void *user_data)
965 : {
966 : WaitEvent *event;
967 :
968 : /* not enough space */
969 : Assert(set->nevents < set->nevents_space);
970 :
971 636006 : if (events == WL_EXIT_ON_PM_DEATH)
972 : {
973 199450 : events = WL_POSTMASTER_DEATH;
974 199450 : set->exit_on_postmaster_death = true;
975 : }
976 :
977 636006 : if (latch)
978 : {
979 223332 : if (latch->owner_pid != MyProcPid)
980 0 : elog(ERROR, "cannot wait on a latch owned by another process");
981 223332 : if (set->latch)
982 0 : elog(ERROR, "cannot wait on more than one latch");
983 223332 : if ((events & WL_LATCH_SET) != WL_LATCH_SET)
984 0 : elog(ERROR, "latch events only support being set");
985 : }
986 : else
987 : {
988 412674 : if (events & WL_LATCH_SET)
989 0 : elog(ERROR, "cannot wait on latch without a specified latch");
990 : }
991 :
992 : /* waiting for socket readiness without a socket indicates a bug */
993 636006 : if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK))
994 0 : elog(ERROR, "cannot wait on socket event without a socket");
995 :
996 636006 : event = &set->events[set->nevents];
997 636006 : event->pos = set->nevents++;
998 636006 : event->fd = fd;
999 636006 : event->events = events;
1000 636006 : event->user_data = user_data;
1001 : #ifdef WIN32
1002 : event->reset = false;
1003 : #endif
1004 :
1005 636006 : if (events == WL_LATCH_SET)
1006 : {
1007 223332 : set->latch = latch;
1008 223332 : set->latch_pos = event->pos;
1009 : #if defined(WAIT_USE_SELF_PIPE)
1010 : event->fd = selfpipe_readfd;
1011 : #elif defined(WAIT_USE_SIGNALFD)
1012 223332 : event->fd = signal_fd;
1013 : #else
1014 : event->fd = PGINVALID_SOCKET;
1015 : #ifdef WAIT_USE_EPOLL
1016 : return event->pos;
1017 : #endif
1018 : #endif
1019 : }
1020 412674 : else if (events == WL_POSTMASTER_DEATH)
1021 : {
1022 : #ifndef WIN32
1023 220498 : event->fd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
1024 : #endif
1025 : }
1026 :
1027 : /* perform wait primitive specific initialization, if needed */
1028 : #if defined(WAIT_USE_EPOLL)
1029 636006 : WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
1030 : #elif defined(WAIT_USE_KQUEUE)
1031 : WaitEventAdjustKqueue(set, event, 0);
1032 : #elif defined(WAIT_USE_POLL)
1033 : WaitEventAdjustPoll(set, event);
1034 : #elif defined(WAIT_USE_WIN32)
1035 : WaitEventAdjustWin32(set, event);
1036 : #endif
1037 :
1038 636006 : return event->pos;
1039 : }
1040 :
1041 : /*
1042 : * Change the event mask and, in the WL_LATCH_SET case, the latch associated
1043 : * with the WaitEvent. The latch may be changed to NULL to disable the latch
1044 : * temporarily, and then set back to a latch later.
1045 : *
1046 : * 'pos' is the id returned by AddWaitEventToSet.
1047 : */
1048 : void
1049 1220010 : ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
1050 : {
1051 : WaitEvent *event;
1052 : #if defined(WAIT_USE_KQUEUE)
1053 : int old_events;
1054 : #endif
1055 :
1056 : Assert(pos < set->nevents);
1057 :
1058 1220010 : event = &set->events[pos];
1059 : #if defined(WAIT_USE_KQUEUE)
1060 : old_events = event->events;
1061 : #endif
1062 :
1063 : /*
1064 : * If neither the event mask nor the associated latch changes, return
1065 : * early. That's an important optimization for some sockets, where
1066 : * ModifyWaitEvent is frequently used to switch from waiting for reads to
1067 : * waiting on writes.
1068 : */
1069 1220010 : if (events == event->events &&
1070 1202428 : (!(event->events & WL_LATCH_SET) || set->latch == latch))
1071 1154540 : return;
1072 :
1073 65470 : if (event->events & WL_LATCH_SET &&
1074 47888 : events != event->events)
1075 : {
1076 0 : elog(ERROR, "cannot modify latch event");
1077 : }
1078 :
1079 65470 : if (event->events & WL_POSTMASTER_DEATH)
1080 : {
1081 0 : elog(ERROR, "cannot modify postmaster death event");
1082 : }
1083 :
1084 : /* FIXME: validate event mask */
1085 65470 : event->events = events;
1086 :
1087 65470 : if (events == WL_LATCH_SET)
1088 : {
1089 47888 : if (latch && latch->owner_pid != MyProcPid)
1090 0 : elog(ERROR, "cannot wait on a latch owned by another process");
1091 47888 : set->latch = latch;
1092 :
1093 : /*
1094 : * On Unix, we don't need to modify the kernel object because the
1095 : * underlying pipe (if there is one) is the same for all latches so we
1096 : * can return immediately. On Windows, we need to update our array of
1097 : * handles, but we leave the old one in place and tolerate spurious
1098 : * wakeups if the latch is disabled.
1099 : */
1100 : #if defined(WAIT_USE_WIN32)
1101 : if (!latch)
1102 : return;
1103 : #else
1104 47888 : return;
1105 : #endif
1106 : }
1107 :
1108 : #if defined(WAIT_USE_EPOLL)
1109 17582 : WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
1110 : #elif defined(WAIT_USE_KQUEUE)
1111 : WaitEventAdjustKqueue(set, event, old_events);
1112 : #elif defined(WAIT_USE_POLL)
1113 : WaitEventAdjustPoll(set, event);
1114 : #elif defined(WAIT_USE_WIN32)
1115 : WaitEventAdjustWin32(set, event);
1116 : #endif
1117 : }
1118 :
1119 : #if defined(WAIT_USE_EPOLL)
1120 : /*
1121 : * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
1122 : */
1123 : static void
1124 653588 : WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
1125 : {
1126 : struct epoll_event epoll_ev;
1127 : int rc;
1128 :
1129 : /* pointer to our event, returned by epoll_wait */
1130 653588 : epoll_ev.data.ptr = event;
1131 : /* always wait for errors */
1132 653588 : epoll_ev.events = EPOLLERR | EPOLLHUP;
1133 :
1134 : /* prepare pollfd entry once */
1135 653588 : if (event->events == WL_LATCH_SET)
1136 : {
1137 : Assert(set->latch != NULL);
1138 223332 : epoll_ev.events |= EPOLLIN;
1139 : }
1140 430256 : else if (event->events == WL_POSTMASTER_DEATH)
1141 : {
1142 220498 : epoll_ev.events |= EPOLLIN;
1143 : }
1144 : else
1145 : {
1146 : Assert(event->fd != PGINVALID_SOCKET);
1147 : Assert(event->events & (WL_SOCKET_READABLE |
1148 : WL_SOCKET_WRITEABLE |
1149 : WL_SOCKET_CLOSED));
1150 :
1151 209758 : if (event->events & WL_SOCKET_READABLE)
1152 186018 : epoll_ev.events |= EPOLLIN;
1153 209758 : if (event->events & WL_SOCKET_WRITEABLE)
1154 24264 : epoll_ev.events |= EPOLLOUT;
1155 209758 : if (event->events & WL_SOCKET_CLOSED)
1156 0 : epoll_ev.events |= EPOLLRDHUP;
1157 : }
1158 :
1159 : /*
1160 : * Even though unused, we also pass epoll_ev as the data argument if
1161 : * EPOLL_CTL_DEL is passed as action. There used to be an epoll bug
1162 : * requiring that, and actually it makes the code simpler...
1163 : */
1164 653588 : rc = epoll_ctl(set->epoll_fd, action, event->fd, &epoll_ev);
1165 :
1166 653588 : if (rc < 0)
1167 0 : ereport(ERROR,
1168 : (errcode_for_socket_access(),
1169 : errmsg("%s() failed: %m",
1170 : "epoll_ctl")));
1171 653588 : }
1172 : #endif
1173 :
1174 : #if defined(WAIT_USE_POLL)
1175 : static void
1176 : WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
1177 : {
1178 : struct pollfd *pollfd = &set->pollfds[event->pos];
1179 :
1180 : pollfd->revents = 0;
1181 : pollfd->fd = event->fd;
1182 :
1183 : /* prepare pollfd entry once */
1184 : if (event->events == WL_LATCH_SET)
1185 : {
1186 : Assert(set->latch != NULL);
1187 : pollfd->events = POLLIN;
1188 : }
1189 : else if (event->events == WL_POSTMASTER_DEATH)
1190 : {
1191 : pollfd->events = POLLIN;
1192 : }
1193 : else
1194 : {
1195 : Assert(event->events & (WL_SOCKET_READABLE |
1196 : WL_SOCKET_WRITEABLE |
1197 : WL_SOCKET_CLOSED));
1198 : pollfd->events = 0;
1199 : if (event->events & WL_SOCKET_READABLE)
1200 : pollfd->events |= POLLIN;
1201 : if (event->events & WL_SOCKET_WRITEABLE)
1202 : pollfd->events |= POLLOUT;
1203 : #ifdef POLLRDHUP
1204 : if (event->events & WL_SOCKET_CLOSED)
1205 : pollfd->events |= POLLRDHUP;
1206 : #endif
1207 : }
1208 :
1209 : Assert(event->fd != PGINVALID_SOCKET);
1210 : }
1211 : #endif
1212 :
1213 : #if defined(WAIT_USE_KQUEUE)
1214 :
1215 : /*
1216 : * On most BSD family systems, the udata member of struct kevent is of type
1217 : * void *, so we could directly convert to/from WaitEvent *. Unfortunately,
1218 : * NetBSD has it as intptr_t, so here we wallpaper over that difference with
1219 : * an lvalue cast.
1220 : */
1221 : #define AccessWaitEvent(k_ev) (*((WaitEvent **)(&(k_ev)->udata)))
1222 :
1223 : static inline void
1224 : WaitEventAdjustKqueueAdd(struct kevent *k_ev, int filter, int action,
1225 : WaitEvent *event)
1226 : {
1227 : k_ev->ident = event->fd;
1228 : k_ev->filter = filter;
1229 : k_ev->flags = action;
1230 : k_ev->fflags = 0;
1231 : k_ev->data = 0;
1232 : AccessWaitEvent(k_ev) = event;
1233 : }
1234 :
1235 : static inline void
1236 : WaitEventAdjustKqueueAddPostmaster(struct kevent *k_ev, WaitEvent *event)
1237 : {
1238 : /* For now postmaster death can only be added, not removed. */
1239 : k_ev->ident = PostmasterPid;
1240 : k_ev->filter = EVFILT_PROC;
1241 : k_ev->flags = EV_ADD;
1242 : k_ev->fflags = NOTE_EXIT;
1243 : k_ev->data = 0;
1244 : AccessWaitEvent(k_ev) = event;
1245 : }
1246 :
1247 : static inline void
1248 : WaitEventAdjustKqueueAddLatch(struct kevent *k_ev, WaitEvent *event)
1249 : {
1250 : /* For now latch can only be added, not removed. */
1251 : k_ev->ident = SIGURG;
1252 : k_ev->filter = EVFILT_SIGNAL;
1253 : k_ev->flags = EV_ADD;
1254 : k_ev->fflags = 0;
1255 : k_ev->data = 0;
1256 : AccessWaitEvent(k_ev) = event;
1257 : }
1258 :
1259 : /*
1260 : * old_events is the previous event mask, used to compute what has changed.
1261 : */
1262 : static void
1263 : WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
1264 : {
1265 : int rc;
1266 : struct kevent k_ev[2];
1267 : int count = 0;
1268 : bool new_filt_read = false;
1269 : bool old_filt_read = false;
1270 : bool new_filt_write = false;
1271 : bool old_filt_write = false;
1272 :
1273 : if (old_events == event->events)
1274 : return;
1275 :
1276 : Assert(event->events != WL_LATCH_SET || set->latch != NULL);
1277 : Assert(event->events == WL_LATCH_SET ||
1278 : event->events == WL_POSTMASTER_DEATH ||
1279 : (event->events & (WL_SOCKET_READABLE |
1280 : WL_SOCKET_WRITEABLE |
1281 : WL_SOCKET_CLOSED)));
1282 :
1283 : if (event->events == WL_POSTMASTER_DEATH)
1284 : {
1285 : /*
1286 : * Unlike all the other implementations, we detect postmaster death
1287 : * using process notification instead of waiting on the postmaster
1288 : * alive pipe.
1289 : */
1290 : WaitEventAdjustKqueueAddPostmaster(&k_ev[count++], event);
1291 : }
1292 : else if (event->events == WL_LATCH_SET)
1293 : {
1294 : /* We detect latch wakeup using a signal event. */
1295 : WaitEventAdjustKqueueAddLatch(&k_ev[count++], event);
1296 : }
1297 : else
1298 : {
1299 : /*
1300 : * We need to compute the adds and deletes required to get from the
1301 : * old event mask to the new event mask, since kevent treats readable
1302 : * and writable as separate events.
1303 : */
1304 : if (old_events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
1305 : old_filt_read = true;
1306 : if (event->events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
1307 : new_filt_read = true;
1308 : if (old_events & WL_SOCKET_WRITEABLE)
1309 : old_filt_write = true;
1310 : if (event->events & WL_SOCKET_WRITEABLE)
1311 : new_filt_write = true;
1312 : if (old_filt_read && !new_filt_read)
1313 : WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_DELETE,
1314 : event);
1315 : else if (!old_filt_read && new_filt_read)
1316 : WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_ADD,
1317 : event);
1318 : if (old_filt_write && !new_filt_write)
1319 : WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_DELETE,
1320 : event);
1321 : else if (!old_filt_write && new_filt_write)
1322 : WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_ADD,
1323 : event);
1324 : }
1325 :
1326 : /* For WL_SOCKET_READ -> WL_SOCKET_CLOSED, no change needed. */
1327 : if (count == 0)
1328 : return;
1329 :
1330 : Assert(count <= 2);
1331 :
1332 : rc = kevent(set->kqueue_fd, &k_ev[0], count, NULL, 0, NULL);
1333 :
1334 : /*
1335 : * When adding the postmaster's pid, we have to consider that it might
1336 : * already have exited and perhaps even been replaced by another process
1337 : * with the same pid. If so, we have to defer reporting this as an event
1338 : * until the next call to WaitEventSetWaitBlock().
1339 : */
1340 :
1341 : if (rc < 0)
1342 : {
1343 : if (event->events == WL_POSTMASTER_DEATH &&
1344 : (errno == ESRCH || errno == EACCES))
1345 : set->report_postmaster_not_running = true;
1346 : else
1347 : ereport(ERROR,
1348 : (errcode_for_socket_access(),
1349 : errmsg("%s() failed: %m",
1350 : "kevent")));
1351 : }
1352 : else if (event->events == WL_POSTMASTER_DEATH &&
1353 : PostmasterPid != getppid() &&
1354 : !PostmasterIsAlive())
1355 : {
1356 : /*
1357 : * The extra PostmasterIsAliveInternal() check prevents false alarms
1358 : * on systems that give a different value for getppid() while being
1359 : * traced by a debugger.
1360 : */
1361 : set->report_postmaster_not_running = true;
1362 : }
1363 : }
1364 :
1365 : #endif
1366 :
1367 : #if defined(WAIT_USE_WIN32)
1368 : static void
1369 : WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
1370 : {
1371 : HANDLE *handle = &set->handles[event->pos + 1];
1372 :
1373 : if (event->events == WL_LATCH_SET)
1374 : {
1375 : Assert(set->latch != NULL);
1376 : *handle = set->latch->event;
1377 : }
1378 : else if (event->events == WL_POSTMASTER_DEATH)
1379 : {
1380 : *handle = PostmasterHandle;
1381 : }
1382 : else
1383 : {
1384 : int flags = FD_CLOSE; /* always check for errors/EOF */
1385 :
1386 : if (event->events & WL_SOCKET_READABLE)
1387 : flags |= FD_READ;
1388 : if (event->events & WL_SOCKET_WRITEABLE)
1389 : flags |= FD_WRITE;
1390 : if (event->events & WL_SOCKET_CONNECTED)
1391 : flags |= FD_CONNECT;
1392 : if (event->events & WL_SOCKET_ACCEPT)
1393 : flags |= FD_ACCEPT;
1394 :
1395 : if (*handle == WSA_INVALID_EVENT)
1396 : {
1397 : *handle = WSACreateEvent();
1398 : if (*handle == WSA_INVALID_EVENT)
1399 : elog(ERROR, "failed to create event for socket: error code %d",
1400 : WSAGetLastError());
1401 : }
1402 : if (WSAEventSelect(event->fd, *handle, flags) != 0)
1403 : elog(ERROR, "failed to set up event for socket: error code %d",
1404 : WSAGetLastError());
1405 :
1406 : Assert(event->fd != PGINVALID_SOCKET);
1407 : }
1408 : }
1409 : #endif
1410 :
1411 : /*
1412 : * Wait for events added to the set to happen, or until the timeout is
1413 : * reached. At most nevents occurred events are returned.
1414 : *
1415 : * If timeout = -1, block until an event occurs; if 0, check sockets for
1416 : * readiness, but don't block; if > 0, block for at most timeout milliseconds.
1417 : *
1418 : * Returns the number of events occurred, or 0 if the timeout was reached.
1419 : *
1420 : * Returned events will have the fd, pos, user_data fields set to the
1421 : * values associated with the registered event.
1422 : */
1423 : int
1424 1406934 : WaitEventSetWait(WaitEventSet *set, long timeout,
1425 : WaitEvent *occurred_events, int nevents,
1426 : uint32 wait_event_info)
1427 : {
1428 1406934 : int returned_events = 0;
1429 : instr_time start_time;
1430 : instr_time cur_time;
1431 1406934 : long cur_timeout = -1;
1432 :
1433 : Assert(nevents > 0);
1434 :
1435 : /*
1436 : * Initialize timeout if requested. We must record the current time so
1437 : * that we can determine the remaining timeout if interrupted.
1438 : */
1439 1406934 : if (timeout >= 0)
1440 : {
1441 381290 : INSTR_TIME_SET_CURRENT(start_time);
1442 : Assert(timeout >= 0 && timeout <= INT_MAX);
1443 381290 : cur_timeout = timeout;
1444 : }
1445 : else
1446 1025644 : INSTR_TIME_SET_ZERO(start_time);
1447 :
1448 1406934 : pgstat_report_wait_start(wait_event_info);
1449 :
1450 : #ifndef WIN32
1451 1406934 : waiting = true;
1452 : #else
1453 : /* Ensure that signals are serviced even if latch is already set */
1454 : pgwin32_dispatch_queued_signals();
1455 : #endif
1456 2712448 : while (returned_events == 0)
1457 : {
1458 : int rc;
1459 :
1460 : /*
1461 : * Check if the latch is set already. If so, leave the loop
1462 : * immediately, avoid blocking again. We don't attempt to report any
1463 : * other events that might also be satisfied.
1464 : *
1465 : * If someone sets the latch between this and the
1466 : * WaitEventSetWaitBlock() below, the setter will write a byte to the
1467 : * pipe (or signal us and the signal handler will do that), and the
1468 : * readiness routine will return immediately.
1469 : *
1470 : * On unix, If there's a pending byte in the self pipe, we'll notice
1471 : * whenever blocking. Only clearing the pipe in that case avoids
1472 : * having to drain it every time WaitLatchOrSocket() is used. Should
1473 : * the pipe-buffer fill up we're still ok, because the pipe is in
1474 : * nonblocking mode. It's unlikely for that to happen, because the
1475 : * self pipe isn't filled unless we're blocking (waiting = true), or
1476 : * from inside a signal handler in latch_sigurg_handler().
1477 : *
1478 : * On windows, we'll also notice if there's a pending event for the
1479 : * latch when blocking, but there's no danger of anything filling up,
1480 : * as "Setting an event that is already set has no effect.".
1481 : *
1482 : * Note: we assume that the kernel calls involved in latch management
1483 : * will provide adequate synchronization on machines with weak memory
1484 : * ordering, so that we cannot miss seeing is_set if a notification
1485 : * has already been queued.
1486 : */
1487 1492910 : if (set->latch && !set->latch->is_set)
1488 : {
1489 : /* about to sleep on a latch */
1490 1334080 : set->latch->maybe_sleeping = true;
1491 1334080 : pg_memory_barrier();
1492 : /* and recheck */
1493 : }
1494 :
1495 1492910 : if (set->latch && set->latch->is_set)
1496 : {
1497 158410 : occurred_events->fd = PGINVALID_SOCKET;
1498 158410 : occurred_events->pos = set->latch_pos;
1499 158410 : occurred_events->user_data =
1500 158410 : set->events[set->latch_pos].user_data;
1501 158410 : occurred_events->events = WL_LATCH_SET;
1502 158410 : occurred_events++;
1503 158410 : returned_events++;
1504 :
1505 : /* could have been set above */
1506 158410 : set->latch->maybe_sleeping = false;
1507 :
1508 158410 : break;
1509 : }
1510 :
1511 : /*
1512 : * Wait for events using the readiness primitive chosen at the top of
1513 : * this file. If -1 is returned, a timeout has occurred, if 0 we have
1514 : * to retry, everything >= 1 is the number of returned events.
1515 : */
1516 1334500 : rc = WaitEventSetWaitBlock(set, cur_timeout,
1517 : occurred_events, nevents);
1518 :
1519 1334470 : if (set->latch)
1520 : {
1521 : Assert(set->latch->maybe_sleeping);
1522 1334026 : set->latch->maybe_sleeping = false;
1523 : }
1524 :
1525 1334470 : if (rc == -1)
1526 28944 : break; /* timeout occurred */
1527 : else
1528 1305526 : returned_events = rc;
1529 :
1530 : /* If we're not done, update cur_timeout for next iteration */
1531 1305526 : if (returned_events == 0 && timeout >= 0)
1532 : {
1533 70334 : INSTR_TIME_SET_CURRENT(cur_time);
1534 70334 : INSTR_TIME_SUBTRACT(cur_time, start_time);
1535 70334 : cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
1536 70334 : if (cur_timeout <= 0)
1537 12 : break;
1538 : }
1539 : }
1540 : #ifndef WIN32
1541 1406904 : waiting = false;
1542 : #endif
1543 :
1544 1406904 : pgstat_report_wait_end();
1545 :
1546 1406904 : return returned_events;
1547 : }
1548 :
1549 :
1550 : #if defined(WAIT_USE_EPOLL)
1551 :
1552 : /*
1553 : * Wait using linux's epoll_wait(2).
1554 : *
1555 : * This is the preferable wait method, as several readiness notifications are
1556 : * delivered, without having to iterate through all of set->events. The return
1557 : * epoll_event struct contain a pointer to our events, making association
1558 : * easy.
1559 : */
1560 : static inline int
1561 1334500 : WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1562 : WaitEvent *occurred_events, int nevents)
1563 : {
1564 1334500 : int returned_events = 0;
1565 : int rc;
1566 : WaitEvent *cur_event;
1567 : struct epoll_event *cur_epoll_event;
1568 :
1569 : /* Sleep */
1570 1334500 : rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,
1571 1334500 : Min(nevents, set->nevents_space), cur_timeout);
1572 :
1573 : /* Check return code */
1574 1334500 : if (rc < 0)
1575 : {
1576 : /* EINTR is okay, otherwise complain */
1577 45542 : if (errno != EINTR)
1578 : {
1579 0 : waiting = false;
1580 0 : ereport(ERROR,
1581 : (errcode_for_socket_access(),
1582 : errmsg("%s() failed: %m",
1583 : "epoll_wait")));
1584 : }
1585 45542 : return 0;
1586 : }
1587 1288958 : else if (rc == 0)
1588 : {
1589 : /* timeout exceeded */
1590 28944 : return -1;
1591 : }
1592 :
1593 : /*
1594 : * At least one event occurred, iterate over the returned epoll events
1595 : * until they're either all processed, or we've returned all the events
1596 : * the caller desired.
1597 : */
1598 1260014 : for (cur_epoll_event = set->epoll_ret_events;
1599 2520524 : cur_epoll_event < (set->epoll_ret_events + rc) &&
1600 : returned_events < nevents;
1601 1260510 : cur_epoll_event++)
1602 : {
1603 : /* epoll's data pointer is set to the associated WaitEvent */
1604 1260540 : cur_event = (WaitEvent *) cur_epoll_event->data.ptr;
1605 :
1606 1260540 : occurred_events->pos = cur_event->pos;
1607 1260540 : occurred_events->user_data = cur_event->user_data;
1608 1260540 : occurred_events->events = 0;
1609 :
1610 1260540 : if (cur_event->events == WL_LATCH_SET &&
1611 781260 : cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
1612 : {
1613 : /* Drain the signalfd. */
1614 781260 : drain();
1615 :
1616 781260 : if (set->latch && set->latch->is_set)
1617 : {
1618 740358 : occurred_events->fd = PGINVALID_SOCKET;
1619 740358 : occurred_events->events = WL_LATCH_SET;
1620 740358 : occurred_events++;
1621 740358 : returned_events++;
1622 : }
1623 : }
1624 479280 : else if (cur_event->events == WL_POSTMASTER_DEATH &&
1625 30 : cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
1626 : {
1627 : /*
1628 : * We expect an EPOLLHUP when the remote end is closed, but
1629 : * because we don't expect the pipe to become readable or to have
1630 : * any errors either, treat those cases as postmaster death, too.
1631 : *
1632 : * Be paranoid about a spurious event signaling the postmaster as
1633 : * being dead. There have been reports about that happening with
1634 : * older primitives (select(2) to be specific), and a spurious
1635 : * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
1636 : * cost much.
1637 : */
1638 30 : if (!PostmasterIsAliveInternal())
1639 : {
1640 30 : if (set->exit_on_postmaster_death)
1641 30 : proc_exit(1);
1642 0 : occurred_events->fd = PGINVALID_SOCKET;
1643 0 : occurred_events->events = WL_POSTMASTER_DEATH;
1644 0 : occurred_events++;
1645 0 : returned_events++;
1646 : }
1647 : }
1648 479250 : else if (cur_event->events & (WL_SOCKET_READABLE |
1649 : WL_SOCKET_WRITEABLE |
1650 : WL_SOCKET_CLOSED))
1651 : {
1652 : Assert(cur_event->fd != PGINVALID_SOCKET);
1653 :
1654 479250 : if ((cur_event->events & WL_SOCKET_READABLE) &&
1655 456154 : (cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)))
1656 : {
1657 : /* data available in socket, or EOF */
1658 452672 : occurred_events->events |= WL_SOCKET_READABLE;
1659 : }
1660 :
1661 479250 : if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1662 27164 : (cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)))
1663 : {
1664 : /* writable, or EOF */
1665 27022 : occurred_events->events |= WL_SOCKET_WRITEABLE;
1666 : }
1667 :
1668 479250 : if ((cur_event->events & WL_SOCKET_CLOSED) &&
1669 0 : (cur_epoll_event->events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)))
1670 : {
1671 : /* remote peer shut down, or error */
1672 0 : occurred_events->events |= WL_SOCKET_CLOSED;
1673 : }
1674 :
1675 479250 : if (occurred_events->events != 0)
1676 : {
1677 479250 : occurred_events->fd = cur_event->fd;
1678 479250 : occurred_events++;
1679 479250 : returned_events++;
1680 : }
1681 : }
1682 : }
1683 :
1684 1259984 : return returned_events;
1685 : }
1686 :
1687 : #elif defined(WAIT_USE_KQUEUE)
1688 :
1689 : /*
1690 : * Wait using kevent(2) on BSD-family systems and macOS.
1691 : *
1692 : * For now this mirrors the epoll code, but in future it could modify the fd
1693 : * set in the same call to kevent as it uses for waiting instead of doing that
1694 : * with separate system calls.
1695 : */
1696 : static int
1697 : WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1698 : WaitEvent *occurred_events, int nevents)
1699 : {
1700 : int returned_events = 0;
1701 : int rc;
1702 : WaitEvent *cur_event;
1703 : struct kevent *cur_kqueue_event;
1704 : struct timespec timeout;
1705 : struct timespec *timeout_p;
1706 :
1707 : if (cur_timeout < 0)
1708 : timeout_p = NULL;
1709 : else
1710 : {
1711 : timeout.tv_sec = cur_timeout / 1000;
1712 : timeout.tv_nsec = (cur_timeout % 1000) * 1000000;
1713 : timeout_p = &timeout;
1714 : }
1715 :
1716 : /*
1717 : * Report postmaster events discovered by WaitEventAdjustKqueue() or an
1718 : * earlier call to WaitEventSetWait().
1719 : */
1720 : if (unlikely(set->report_postmaster_not_running))
1721 : {
1722 : if (set->exit_on_postmaster_death)
1723 : proc_exit(1);
1724 : occurred_events->fd = PGINVALID_SOCKET;
1725 : occurred_events->events = WL_POSTMASTER_DEATH;
1726 : return 1;
1727 : }
1728 :
1729 : /* Sleep */
1730 : rc = kevent(set->kqueue_fd, NULL, 0,
1731 : set->kqueue_ret_events,
1732 : Min(nevents, set->nevents_space),
1733 : timeout_p);
1734 :
1735 : /* Check return code */
1736 : if (rc < 0)
1737 : {
1738 : /* EINTR is okay, otherwise complain */
1739 : if (errno != EINTR)
1740 : {
1741 : waiting = false;
1742 : ereport(ERROR,
1743 : (errcode_for_socket_access(),
1744 : errmsg("%s() failed: %m",
1745 : "kevent")));
1746 : }
1747 : return 0;
1748 : }
1749 : else if (rc == 0)
1750 : {
1751 : /* timeout exceeded */
1752 : return -1;
1753 : }
1754 :
1755 : /*
1756 : * At least one event occurred, iterate over the returned kqueue events
1757 : * until they're either all processed, or we've returned all the events
1758 : * the caller desired.
1759 : */
1760 : for (cur_kqueue_event = set->kqueue_ret_events;
1761 : cur_kqueue_event < (set->kqueue_ret_events + rc) &&
1762 : returned_events < nevents;
1763 : cur_kqueue_event++)
1764 : {
1765 : /* kevent's udata points to the associated WaitEvent */
1766 : cur_event = AccessWaitEvent(cur_kqueue_event);
1767 :
1768 : occurred_events->pos = cur_event->pos;
1769 : occurred_events->user_data = cur_event->user_data;
1770 : occurred_events->events = 0;
1771 :
1772 : if (cur_event->events == WL_LATCH_SET &&
1773 : cur_kqueue_event->filter == EVFILT_SIGNAL)
1774 : {
1775 : if (set->latch && set->latch->is_set)
1776 : {
1777 : occurred_events->fd = PGINVALID_SOCKET;
1778 : occurred_events->events = WL_LATCH_SET;
1779 : occurred_events++;
1780 : returned_events++;
1781 : }
1782 : }
1783 : else if (cur_event->events == WL_POSTMASTER_DEATH &&
1784 : cur_kqueue_event->filter == EVFILT_PROC &&
1785 : (cur_kqueue_event->fflags & NOTE_EXIT) != 0)
1786 : {
1787 : /*
1788 : * The kernel will tell this kqueue object only once about the
1789 : * exit of the postmaster, so let's remember that for next time so
1790 : * that we provide level-triggered semantics.
1791 : */
1792 : set->report_postmaster_not_running = true;
1793 :
1794 : if (set->exit_on_postmaster_death)
1795 : proc_exit(1);
1796 : occurred_events->fd = PGINVALID_SOCKET;
1797 : occurred_events->events = WL_POSTMASTER_DEATH;
1798 : occurred_events++;
1799 : returned_events++;
1800 : }
1801 : else if (cur_event->events & (WL_SOCKET_READABLE |
1802 : WL_SOCKET_WRITEABLE |
1803 : WL_SOCKET_CLOSED))
1804 : {
1805 : Assert(cur_event->fd >= 0);
1806 :
1807 : if ((cur_event->events & WL_SOCKET_READABLE) &&
1808 : (cur_kqueue_event->filter == EVFILT_READ))
1809 : {
1810 : /* readable, or EOF */
1811 : occurred_events->events |= WL_SOCKET_READABLE;
1812 : }
1813 :
1814 : if ((cur_event->events & WL_SOCKET_CLOSED) &&
1815 : (cur_kqueue_event->filter == EVFILT_READ) &&
1816 : (cur_kqueue_event->flags & EV_EOF))
1817 : {
1818 : /* the remote peer has shut down */
1819 : occurred_events->events |= WL_SOCKET_CLOSED;
1820 : }
1821 :
1822 : if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1823 : (cur_kqueue_event->filter == EVFILT_WRITE))
1824 : {
1825 : /* writable, or EOF */
1826 : occurred_events->events |= WL_SOCKET_WRITEABLE;
1827 : }
1828 :
1829 : if (occurred_events->events != 0)
1830 : {
1831 : occurred_events->fd = cur_event->fd;
1832 : occurred_events++;
1833 : returned_events++;
1834 : }
1835 : }
1836 : }
1837 :
1838 : return returned_events;
1839 : }
1840 :
1841 : #elif defined(WAIT_USE_POLL)
1842 :
/*
 * Wait using poll(2).
 *
 * This allows to receive readiness notifications for several events at once,
 * but requires iterating through all of set->pollfds.
 *
 * Returns the number of events stored into occurred_events (at most
 * nevents), 0 if the wait was interrupted and should be retried, or -1 on
 * timeout.  May not return at all: proc_exit(1) on postmaster death when
 * exit_on_postmaster_death is set, or ereport(ERROR) on poll() failure.
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
					  WaitEvent *occurred_events, int nevents)
{
	int			returned_events = 0;
	int			rc;
	WaitEvent  *cur_event;
	struct pollfd *cur_pollfd;

	/* Sleep */
	rc = poll(set->pollfds, set->nevents, (int) cur_timeout);

	/* Check return code */
	if (rc < 0)
	{
		/* EINTR is okay, otherwise complain */
		if (errno != EINTR)
		{
			/* must clear the flag before ereport longjmps out of here */
			waiting = false;
			ereport(ERROR,
					(errcode_for_socket_access(),
					 errmsg("%s() failed: %m",
							"poll")));
		}
		return 0;
	}
	else if (rc == 0)
	{
		/* timeout exceeded */
		return -1;
	}

	/*
	 * Walk the registered events and the pollfd array in lockstep; their
	 * entries correspond index-for-index.  Stop early once the caller's
	 * output buffer is full.
	 */
	for (cur_event = set->events, cur_pollfd = set->pollfds;
		 cur_event < (set->events + set->nevents) &&
		 returned_events < nevents;
		 cur_event++, cur_pollfd++)
	{
		/* no activity on this FD, skip */
		if (cur_pollfd->revents == 0)
			continue;

		occurred_events->pos = cur_event->pos;
		occurred_events->user_data = cur_event->user_data;
		occurred_events->events = 0;

		if (cur_event->events == WL_LATCH_SET &&
			(cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
		{
			/* There's data in the self-pipe, clear it. */
			drain();

			/*
			 * Drain before testing is_set: if the latch is set again after
			 * this check, a fresh wakeup byte will be written and the next
			 * wait will see it.  Only report the event if the latch is
			 * actually (still) set.
			 */
			if (set->latch && set->latch->is_set)
			{
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_LATCH_SET;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events == WL_POSTMASTER_DEATH &&
				 (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
		{
			/*
			 * We expect an POLLHUP when the remote end is closed, but because
			 * we don't expect the pipe to become readable or to have any
			 * errors either, treat those cases as postmaster death, too.
			 *
			 * Be paranoid about a spurious event signaling the postmaster as
			 * being dead.  There have been reports about that happening with
			 * older primitives (select(2) to be specific), and a spurious
			 * WL_POSTMASTER_DEATH event would be painful.  Re-checking
			 * doesn't cost much.
			 */
			if (!PostmasterIsAliveInternal())
			{
				if (set->exit_on_postmaster_death)
					proc_exit(1);
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_POSTMASTER_DEATH;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events & (WL_SOCKET_READABLE |
									  WL_SOCKET_WRITEABLE |
									  WL_SOCKET_CLOSED))
		{
			/*
			 * Error/hangup conditions are reported as both readable and
			 * writeable (and closed), so that the caller's next socket
			 * operation discovers the actual error.
			 */
			int			errflags = POLLHUP | POLLERR | POLLNVAL;

			Assert(cur_event->fd >= PGINVALID_SOCKET);

			if ((cur_event->events & WL_SOCKET_READABLE) &&
				(cur_pollfd->revents & (POLLIN | errflags)))
			{
				/* data available in socket, or EOF */
				occurred_events->events |= WL_SOCKET_READABLE;
			}

			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
				(cur_pollfd->revents & (POLLOUT | errflags)))
			{
				/* writeable, or EOF */
				occurred_events->events |= WL_SOCKET_WRITEABLE;
			}

#ifdef POLLRDHUP
			/* WL_SOCKET_CLOSED detection requires POLLRDHUP (Linux/BSD) */
			if ((cur_event->events & WL_SOCKET_CLOSED) &&
				(cur_pollfd->revents & (POLLRDHUP | errflags)))
			{
				/* remote peer closed, or error */
				occurred_events->events |= WL_SOCKET_CLOSED;
			}
#endif

			if (occurred_events->events != 0)
			{
				occurred_events->fd = cur_event->fd;
				occurred_events++;
				returned_events++;
			}
		}
	}
	return returned_events;
}
1973 :
1974 : #elif defined(WAIT_USE_WIN32)
1975 :
/*
 * Wait using Windows' WaitForMultipleObjects().  Each call only "consumes" one
 * event, so we keep calling until we've filled up our output buffer to match
 * the behavior of the other implementations.
 *
 * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273
 *
 * Returns the number of events stored into occurred_events (at most
 * nevents), 0 if the wait should be retried (a signal was serviced), or -1
 * on timeout.  May proc_exit(1) on postmaster death, or elog(ERROR) on
 * Windows API failure.
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
					  WaitEvent *occurred_events, int nevents)
{
	int			returned_events = 0;
	DWORD		rc;
	WaitEvent  *cur_event;

	/* Reset any wait events that need it */
	for (cur_event = set->events;
		 cur_event < (set->events + set->nevents);
		 cur_event++)
	{
		if (cur_event->reset)
		{
			WaitEventAdjustWin32(set, cur_event);
			cur_event->reset = false;
		}

		/*
		 * Windows does not guarantee to log an FD_WRITE network event
		 * indicating that more data can be sent unless the previous send()
		 * failed with WSAEWOULDBLOCK.  While our caller might well have made
		 * such a call, we cannot assume that here.  Therefore, if waiting for
		 * write-ready, force the issue by doing a dummy send().  If the dummy
		 * send() succeeds, assume that the socket is in fact write-ready, and
		 * return immediately.  Also, if it fails with something other than
		 * WSAEWOULDBLOCK, return a write-ready indication to let our caller
		 * deal with the error condition.
		 */
		if (cur_event->events & WL_SOCKET_WRITEABLE)
		{
			char		c;
			WSABUF		buf;
			DWORD		sent;
			int			r;

			/* zero-length send: probes writability without sending data */
			buf.buf = &c;
			buf.len = 0;

			r = WSASend(cur_event->fd, &buf, 1, &sent, 0, NULL, NULL);
			if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
			{
				occurred_events->pos = cur_event->pos;
				occurred_events->user_data = cur_event->user_data;
				occurred_events->events = WL_SOCKET_WRITEABLE;
				occurred_events->fd = cur_event->fd;
				return 1;
			}
		}
	}

	/*
	 * Sleep.
	 *
	 * Need to wait for ->nevents + 1, because signal handle is in [0].
	 */
	rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
								cur_timeout);

	/* Check return code */
	if (rc == WAIT_FAILED)
		elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",
			 GetLastError());
	else if (rc == WAIT_TIMEOUT)
	{
		/* timeout exceeded */
		return -1;
	}

	if (rc == WAIT_OBJECT_0)
	{
		/* Service newly-arrived signals */
		pgwin32_dispatch_queued_signals();
		return 0;				/* retry */
	}

	/*
	 * With an offset of one, due to the always present pgwin32_signal_event,
	 * the handle offset directly corresponds to a wait event.
	 */
	cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1];

	/*
	 * Decode the event that fired, then keep polling (with zero timeout) the
	 * remaining handles to batch up as many ready events as will fit in the
	 * output buffer.
	 */
	for (;;)
	{
		int			next_pos;
		int			count;

		occurred_events->pos = cur_event->pos;
		occurred_events->user_data = cur_event->user_data;
		occurred_events->events = 0;

		if (cur_event->events == WL_LATCH_SET)
		{
			/*
			 * We cannot use set->latch->event to reset the fired event if we
			 * aren't waiting on this latch now.
			 */
			if (!ResetEvent(set->handles[cur_event->pos + 1]))
				elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());

			/* only report if the latch is actually (still) set */
			if (set->latch && set->latch->is_set)
			{
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_LATCH_SET;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events == WL_POSTMASTER_DEATH)
		{
			/*
			 * Postmaster apparently died.  Since the consequences of falsely
			 * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we
			 * take the trouble to positively verify this with
			 * PostmasterIsAlive(), even though there is no known reason to
			 * think that the event could be falsely set on Windows.
			 */
			if (!PostmasterIsAliveInternal())
			{
				if (set->exit_on_postmaster_death)
					proc_exit(1);
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_POSTMASTER_DEATH;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events & WL_SOCKET_MASK)
		{
			WSANETWORKEVENTS resEvents;
			HANDLE		handle = set->handles[cur_event->pos + 1];

			Assert(cur_event->fd);

			occurred_events->fd = cur_event->fd;

			/* fetch and clear the recorded network events for this socket */
			ZeroMemory(&resEvents, sizeof(resEvents));
			if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0)
				elog(ERROR, "failed to enumerate network events: error code %d",
					 WSAGetLastError());
			if ((cur_event->events & WL_SOCKET_READABLE) &&
				(resEvents.lNetworkEvents & FD_READ))
			{
				/* data available in socket */
				occurred_events->events |= WL_SOCKET_READABLE;

				/*------
				 * WaitForMultipleObjects doesn't guarantee that a read event
				 * will be returned if the latch is set at the same time.  Even
				 * if it did, the caller might drop that event expecting it to
				 * reoccur on next call.  So, we must force the event to be
				 * reset if this WaitEventSet is used again in order to avoid
				 * an indefinite hang.
				 *
				 * Refer
				 * https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx
				 * for the behavior of socket events.
				 *------
				 */
				cur_event->reset = true;
			}
			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
				(resEvents.lNetworkEvents & FD_WRITE))
			{
				/* writeable */
				occurred_events->events |= WL_SOCKET_WRITEABLE;
			}
			if ((cur_event->events & WL_SOCKET_CONNECTED) &&
				(resEvents.lNetworkEvents & FD_CONNECT))
			{
				/* connected */
				occurred_events->events |= WL_SOCKET_CONNECTED;
			}
			if ((cur_event->events & WL_SOCKET_ACCEPT) &&
				(resEvents.lNetworkEvents & FD_ACCEPT))
			{
				/* incoming connection could be accepted */
				occurred_events->events |= WL_SOCKET_ACCEPT;
			}
			if (resEvents.lNetworkEvents & FD_CLOSE)
			{
				/* EOF/error, so signal all caller-requested socket flags */
				occurred_events->events |= (cur_event->events & WL_SOCKET_MASK);
			}

			if (occurred_events->events != 0)
			{
				occurred_events++;
				returned_events++;
			}
		}

		/* Is the output buffer full? */
		if (returned_events == nevents)
			break;

		/* Have we run out of possible events? */
		next_pos = cur_event->pos + 1;
		if (next_pos == set->nevents)
			break;

		/*
		 * Poll the rest of the event handles in the array starting at
		 * next_pos being careful to skip over the initial signal handle too.
		 * This time we use a zero timeout.
		 */
		count = set->nevents - next_pos;
		rc = WaitForMultipleObjects(count,
									set->handles + 1 + next_pos,
									false,
									0);

		/*
		 * We don't distinguish between errors and WAIT_TIMEOUT here because
		 * we already have events to report.
		 */
		if (rc < WAIT_OBJECT_0 || rc >= WAIT_OBJECT_0 + count)
			break;

		/* We have another event to decode. */
		cur_event = &set->events[next_pos + (rc - WAIT_OBJECT_0)];
	}

	return returned_events;
}
2209 : #endif
2210 :
/*
 * Return whether the current build options can report WL_SOCKET_CLOSED.
 *
 * Peer-shutdown detection requires POLLRDHUP under poll(), or the epoll /
 * kqueue implementations; other configurations (e.g. Windows, or poll()
 * without POLLRDHUP) cannot report it.  The answer is fixed at compile time.
 */
bool
WaitEventSetCanReportClosed(void)
{
#if (defined(WAIT_USE_POLL) && defined(POLLRDHUP)) || \
	defined(WAIT_USE_EPOLL) || \
	defined(WAIT_USE_KQUEUE)
	return true;
#else
	return false;
#endif
}
2225 :
2226 : /*
2227 : * Get the number of wait events registered in a given WaitEventSet.
2228 : */
2229 : int
2230 248 : GetNumRegisteredWaitEvents(WaitEventSet *set)
2231 : {
2232 248 : return set->nevents;
2233 : }
2234 :
2235 : #if defined(WAIT_USE_SELF_PIPE)
2236 :
/*
 * SetLatch uses SIGURG to wake up the process waiting on the latch.
 *
 * Wake up WaitLatch, if we're waiting.
 *
 * Runs in signal-handler context, so it must stay async-signal-safe: it
 * only tests the "waiting" flag and writes one byte to the self-pipe.
 */
static void
latch_sigurg_handler(SIGNAL_ARGS)
{
	/* only bother writing a wakeup byte while a wait is in progress */
	if (waiting)
		sendSelfPipeByte();
}
2248 :
2249 : /* Send one byte to the self-pipe, to wake up WaitLatch */
2250 : static void
2251 : sendSelfPipeByte(void)
2252 : {
2253 : int rc;
2254 : char dummy = 0;
2255 :
2256 : retry:
2257 : rc = write(selfpipe_writefd, &dummy, 1);
2258 : if (rc < 0)
2259 : {
2260 : /* If interrupted by signal, just retry */
2261 : if (errno == EINTR)
2262 : goto retry;
2263 :
2264 : /*
2265 : * If the pipe is full, we don't need to retry, the data that's there
2266 : * already is enough to wake up WaitLatch.
2267 : */
2268 : if (errno == EAGAIN || errno == EWOULDBLOCK)
2269 : return;
2270 :
2271 : /*
2272 : * Oops, the write() failed for some other reason. We might be in a
2273 : * signal handler, so it's not safe to elog(). We have no choice but
2274 : * silently ignore the error.
2275 : */
2276 : return;
2277 : }
2278 : }
2279 :
2280 : #endif
2281 :
2282 : #if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)
2283 :
/*
 * Read all available data from self-pipe or signalfd.
 *
 * Note: this is only called when waiting = true.  If it fails and doesn't
 * return, it must reset that flag first (though ideally, this will never
 * happen).
 */
static void
drain(void)
{
	char		buf[1024];
	int			rc;
	int			fd;

	/* pick the descriptor matching the build's wakeup mechanism */
#ifdef WAIT_USE_SELF_PIPE
	fd = selfpipe_readfd;
#else
	fd = signal_fd;
#endif

	/* keep reading until the descriptor is empty */
	for (;;)
	{
		rc = read(fd, buf, sizeof(buf));
		if (rc < 0)
		{
			if (errno == EAGAIN || errno == EWOULDBLOCK)
				break;			/* the descriptor is empty */
			else if (errno == EINTR)
				continue;		/* retry */
			else
			{
				/* clear the flag before elog(ERROR) longjmps out */
				waiting = false;
#ifdef WAIT_USE_SELF_PIPE
				elog(ERROR, "read() on self-pipe failed: %m");
#else
				elog(ERROR, "read() on signalfd failed: %m");
#endif
			}
		}
		else if (rc == 0)
		{
			/* EOF should be impossible on a live self-pipe/signalfd */
			waiting = false;
#ifdef WAIT_USE_SELF_PIPE
			elog(ERROR, "unexpected EOF on self-pipe");
#else
			elog(ERROR, "unexpected EOF on signalfd");
#endif
		}
		else if (rc < sizeof(buf))
		{
			/* we successfully drained the pipe; no need to read() again */
			break;
		}
		/* else buffer wasn't big enough, so read again */
	}
}
2340 :
2341 : #endif
2342 :
2343 : static void
2344 2 : ResOwnerReleaseWaitEventSet(Datum res)
2345 : {
2346 2 : WaitEventSet *set = (WaitEventSet *) DatumGetPointer(res);
2347 :
2348 : Assert(set->owner != NULL);
2349 2 : set->owner = NULL;
2350 2 : FreeWaitEventSet(set);
2351 2 : }
|