Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * walreceiver.c
4 : *
5 : * The WAL receiver process (walreceiver) is new as of Postgres 9.0. It
6 : * is the process in the standby server that takes charge of receiving
7 : * XLOG records from a primary server during streaming replication.
8 : *
9 : * When the startup process determines that it's time to start streaming,
10 : * it instructs postmaster to start walreceiver. Walreceiver first connects
11 : * to the primary server (it will be served by a walsender process
12 : * in the primary server), and then keeps receiving XLOG records and
13 : * writing them to the disk as long as the connection is alive. As XLOG
14 : * records are received and flushed to disk, it updates the
15 : * WalRcv->flushedUpto variable in shared memory, to inform the startup
16 : * process of how far it can proceed with XLOG replay.
17 : *
18 : * A WAL receiver cannot directly load GUC parameters used when establishing
19 : * its connection to the primary. Instead it relies on parameter values
20 : * that are passed down by the startup process when streaming is requested.
21 : * This applies, for example, to the replication slot and the connection
22 : * string to be used for the connection with the primary.
23 : *
24 : * If the primary server ends streaming, but doesn't disconnect, walreceiver
25 : * goes into "waiting" mode, and waits for the startup process to give new
26 : * instructions. The startup process will treat that the same as
27 : * disconnection, and will rescan the archive/pg_wal directory. But when the
28 : * startup process wants to try streaming replication again, it will just
29 : * nudge the existing walreceiver process that's waiting, instead of launching
30 : * a new one.
31 : *
32 : * Normal termination is by SIGTERM, which instructs the walreceiver to
33 : * exit(0). Emergency termination is by SIGQUIT; like any postmaster child
34 : * process, the walreceiver will simply abort and exit on SIGQUIT. A close
35 : * of the connection and a FATAL error are treated not as a crash but as
36 : * normal operation.
37 : *
38 : * This file contains the server-facing parts of walreceiver. The libpq-
39 : * specific parts are in the libpqwalreceiver module. It's loaded
40 : * dynamically to avoid linking the server with libpq.
41 : *
42 : * Portions Copyright (c) 2010-2026, PostgreSQL Global Development Group
43 : *
44 : *
45 : * IDENTIFICATION
46 : * src/backend/replication/walreceiver.c
47 : *
48 : *-------------------------------------------------------------------------
49 : */
50 : #include "postgres.h"
51 :
52 : #include <unistd.h>
53 :
54 : #include "access/htup_details.h"
55 : #include "access/timeline.h"
56 : #include "access/transam.h"
57 : #include "access/xlog_internal.h"
58 : #include "access/xlogarchive.h"
59 : #include "access/xlogrecovery.h"
60 : #include "access/xlogwait.h"
61 : #include "catalog/pg_authid.h"
62 : #include "funcapi.h"
63 : #include "libpq/pqformat.h"
64 : #include "libpq/pqsignal.h"
65 : #include "miscadmin.h"
66 : #include "pgstat.h"
67 : #include "postmaster/auxprocess.h"
68 : #include "postmaster/interrupt.h"
69 : #include "replication/walreceiver.h"
70 : #include "replication/walsender.h"
71 : #include "storage/ipc.h"
72 : #include "storage/proc.h"
73 : #include "storage/procarray.h"
74 : #include "storage/procsignal.h"
75 : #include "tcop/tcopprot.h"
76 : #include "utils/acl.h"
77 : #include "utils/builtins.h"
78 : #include "utils/guc.h"
79 : #include "utils/pg_lsn.h"
80 : #include "utils/ps_status.h"
81 : #include "utils/timestamp.h"
82 : #include "utils/wait_event.h"
83 :
84 :
85 : /*
86 : * GUC variables. (Other variables that affect walreceiver are in xlog.c
87 : * because they're passed down from the startup process, for better
88 : * synchronization.)
89 : */
90 : int wal_receiver_status_interval;
91 : int wal_receiver_timeout;
92 : bool hot_standby_feedback;
93 :
94 : /* libpqwalreceiver connection; stays NULL until walrcv_connect() succeeds in WalReceiverMain() */
95 : static WalReceiverConn *wrconn = NULL;
96 : WalReceiverFunctionsType *WalReceiverFunctions = NULL; /* WalReceiverMain() raises ERROR if this is still NULL after loading libpqwalreceiver */
97 :
98 : /*
99 : * These variables are used similarly to openLogFile/SegNo,
100 : * but for walreceiver to write the XLOG. recvFileTLI is the TimeLineID
101 : * corresponding to the file name of recvFile. recvFile is -1 while no segment file is open.
102 : */
103 : static int recvFile = -1;
104 : static TimeLineID recvFileTLI = 0;
105 : static XLogSegNo recvSegNo = 0;
106 :
107 : /*
108 : * LogstreamResult indicates the byte positions that we have already
109 : * written/fsynced.
110 : */
111 : static struct
112 : {
113 : XLogRecPtr Write; /* last byte + 1 written out in the standby */
114 : XLogRecPtr Flush; /* last byte + 1 flushed in the standby */
115 : } LogstreamResult;
116 :
117 : /*
118 : * Reasons to wake up and perform periodic tasks. The values double as indexes into wakeup[], so NUM_WALRCV_WAKEUPS must stay one past the last member.
119 : */
120 : typedef enum WalRcvWakeupReason
121 : {
122 : WALRCV_WAKEUP_TERMINATE,
123 : WALRCV_WAKEUP_PING,
124 : WALRCV_WAKEUP_REPLY,
125 : WALRCV_WAKEUP_HSFEEDBACK,
126 : #define NUM_WALRCV_WAKEUPS (WALRCV_WAKEUP_HSFEEDBACK + 1)
127 : } WalRcvWakeupReason;
128 :
129 : /*
130 : * Wake up times for periodic tasks, one slot per WalRcvWakeupReason.
131 : */
132 : static TimestampTz wakeup[NUM_WALRCV_WAKEUPS];
133 :
134 : static StringInfoData reply_message; /* (re)initialized with initStringInfo() each time streaming starts */
135 :
136 : /* Prototypes for private functions */
137 : static void WalRcvFetchTimeLineHistoryFiles(TimeLineID first, TimeLineID last);
138 : static void WalRcvWaitForStartPosition(XLogRecPtr *startpoint, TimeLineID *startpointTLI);
139 : static void WalRcvDie(int code, Datum arg);
140 : static void XLogWalRcvProcessMsg(unsigned char type, char *buf, Size len,
141 : TimeLineID tli);
142 : static void XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr,
143 : TimeLineID tli);
144 : static void XLogWalRcvFlush(bool dying, TimeLineID tli);
145 : static void XLogWalRcvClose(XLogRecPtr recptr, TimeLineID tli);
146 : static void XLogWalRcvSendReply(bool force, bool requestReply, bool checkApply);
147 : static void XLogWalRcvSendHSFeedback(bool immed);
148 : static void ProcessWalSndrMessage(XLogRecPtr walEnd, TimestampTz sendTime);
149 : static void WalRcvComputeNextWakeup(WalRcvWakeupReason reason, TimestampTz now);
150 :
151 :
152 : /* Main entry point for walreceiver process. Registers itself in the WalRcv shared-memory struct, installs signal handlers, loads libpqwalreceiver and connects to the primary; then loops forever: identify the primary, fetch missing timeline history files, stream WAL until the primary ends the stream, close the current segment and wait for the startup process to supply a new start position. Does not return normally. */
153 : void
154 261 : WalReceiverMain(const void *startup_data, size_t startup_data_len)
155 : {
156 : char conninfo[MAXCONNINFO];
157 : char *tmp_conninfo;
158 : char slotname[NAMEDATALEN];
159 : bool is_temp_slot;
160 : XLogRecPtr startpoint;
161 : TimeLineID startpointTLI;
162 : TimeLineID primaryTLI;
163 : bool first_stream;
164 : WalRcvData *walrcv;
165 : TimestampTz now;
166 : char *err;
167 261 : char *sender_host = NULL;
168 261 : int sender_port = 0;
169 : char *appname;
170 :
171 : Assert(startup_data_len == 0);
172 :
173 261 : AuxiliaryProcessMainCommon();
174 :
175 : /*
176 : * WalRcv should be set up already (if we are a backend, we inherit this
177 : * by fork() or EXEC_BACKEND mechanism from the postmaster).
178 : */
179 261 : walrcv = WalRcv;
180 : Assert(walrcv != NULL);
181 :
182 : /*
183 : * Mark walreceiver as running in shared memory.
184 : *
185 : * Do this as early as possible, so that if we fail later on, we'll set
186 : * state to STOPPED. If we die before this, the startup process will keep
187 : * waiting for us to start up, until it times out.
188 : */
189 261 : SpinLockAcquire(&walrcv->mutex);
190 : Assert(walrcv->pid == 0);
191 261 : switch (walrcv->walRcvState)
192 : {
193 0 : case WALRCV_STOPPING:
194 : /* If we've already been requested to stop, don't start up. */
195 0 : walrcv->walRcvState = WALRCV_STOPPED;
196 : pg_fallthrough;
197 :
198 2 : case WALRCV_STOPPED:
199 2 : SpinLockRelease(&walrcv->mutex);
200 2 : ConditionVariableBroadcast(&walrcv->walRcvStoppedCV);
201 2 : proc_exit(1);
202 : break;
203 :
204 259 : case WALRCV_STARTING:
205 : /* The usual case */
206 259 : break;
207 :
208 0 : case WALRCV_CONNECTING:
209 : case WALRCV_WAITING:
210 : case WALRCV_STREAMING:
211 : case WALRCV_RESTARTING:
212 : default:
213 : /* Shouldn't happen */
214 0 : SpinLockRelease(&walrcv->mutex);
215 0 : elog(PANIC, "walreceiver still running according to shared memory state");
216 : }
217 : /* Advertise our PID so that the startup process can kill us */
218 259 : walrcv->pid = MyProcPid;
219 259 : walrcv->walRcvState = WALRCV_CONNECTING;
220 :
221 : /* Fetch information required to start streaming */
222 259 : walrcv->ready_to_display = false;
223 259 : strlcpy(conninfo, walrcv->conninfo, MAXCONNINFO);
224 259 : strlcpy(slotname, walrcv->slotname, NAMEDATALEN);
225 259 : is_temp_slot = walrcv->is_temp_slot;
226 259 : startpoint = walrcv->receiveStart;
227 259 : startpointTLI = walrcv->receiveStartTLI;
228 :
229 : /*
230 : * At most one of is_temp_slot and slotname can be set; otherwise,
231 : * RequestXLogStreaming messed up.
232 : */
233 : Assert(!is_temp_slot || (slotname[0] == '\0'));
234 :
235 : /* Initialise to a sanish value */
236 259 : now = GetCurrentTimestamp();
237 259 : walrcv->lastMsgSendTime =
238 259 : walrcv->lastMsgReceiptTime = walrcv->latestWalEndTime = now;
239 :
240 : /* Report our proc number so that others can wake us up */
241 259 : walrcv->procno = MyProcNumber;
242 :
243 259 : SpinLockRelease(&walrcv->mutex);
244 :
245 : /* Arrange to clean up at walreceiver exit */
246 259 : on_shmem_exit(WalRcvDie, PointerGetDatum(&startpointTLI));
247 :
248 : /* Properly accept or ignore signals the postmaster might send us */
249 259 : pqsignal(SIGHUP, SignalHandlerForConfigReload); /* set flag to read config
250 : * file */
251 259 : pqsignal(SIGINT, SIG_IGN);
252 259 : pqsignal(SIGTERM, die); /* request shutdown */
253 : /* SIGQUIT handler was already set up by InitPostmasterChild */
254 259 : pqsignal(SIGALRM, SIG_IGN);
255 259 : pqsignal(SIGPIPE, SIG_IGN);
256 259 : pqsignal(SIGUSR1, procsignal_sigusr1_handler);
257 259 : pqsignal(SIGUSR2, SIG_IGN);
258 :
259 : /* Reset some signals that are accepted by postmaster but not here */
260 259 : pqsignal(SIGCHLD, SIG_DFL);
261 :
262 : /* Load the libpq-specific functions */
263 259 : load_file("libpqwalreceiver", false);
264 259 : if (WalReceiverFunctions == NULL)
265 0 : elog(ERROR, "libpqwalreceiver didn't initialize correctly");
266 :
267 : /* Unblock signals (they were blocked when the postmaster forked us) */
268 259 : sigprocmask(SIG_SETMASK, &UnBlockSig, NULL);
269 :
270 : /* Establish the connection to the primary for XLOG streaming */
271 259 : appname = cluster_name[0] ? cluster_name : "walreceiver";
272 259 : wrconn = walrcv_connect(conninfo, true, false, false, appname, &err);
273 258 : if (!wrconn)
274 109 : ereport(ERROR,
275 : (errcode(ERRCODE_CONNECTION_FAILURE),
276 : errmsg("streaming replication receiver \"%s\" could not connect to the primary server: %s",
277 : appname, err)));
278 :
279 : /*
280 : * Save user-visible connection string. This clobbers the original
281 : * conninfo, for security. Also save host and port of the sender server
282 : * this walreceiver is connected to.
283 : */
284 149 : tmp_conninfo = walrcv_get_conninfo(wrconn);
285 149 : walrcv_get_senderinfo(wrconn, &sender_host, &sender_port);
286 149 : SpinLockAcquire(&walrcv->mutex);
287 149 : memset(walrcv->conninfo, 0, MAXCONNINFO);
288 149 : if (tmp_conninfo)
289 149 : strlcpy(walrcv->conninfo, tmp_conninfo, MAXCONNINFO);
290 :
291 149 : memset(walrcv->sender_host, 0, NI_MAXHOST);
292 149 : if (sender_host)
293 149 : strlcpy(walrcv->sender_host, sender_host, NI_MAXHOST);
294 :
295 149 : walrcv->sender_port = sender_port;
296 149 : walrcv->ready_to_display = true;
297 149 : SpinLockRelease(&walrcv->mutex);
298 :
299 149 : if (tmp_conninfo)
300 149 : pfree(tmp_conninfo);
301 :
302 149 : if (sender_host)
303 149 : pfree(sender_host);
304 :
305 149 : first_stream = true;
306 : for (;;) /* one pass per streaming attempt; repeats after WalRcvWaitForStartPosition() returns */
307 12 : {
308 : char *primary_sysid;
309 : char standby_sysid[32];
310 : WalRcvStreamOptions options;
311 :
312 : /*
313 : * Check that we're connected to a valid server using the
314 : * IDENTIFY_SYSTEM replication command.
315 : */
316 161 : primary_sysid = walrcv_identify_system(wrconn, &primaryTLI);
317 :
318 161 : snprintf(standby_sysid, sizeof(standby_sysid), UINT64_FORMAT,
319 : GetSystemIdentifier());
320 161 : if (strcmp(primary_sysid, standby_sysid) != 0)
321 : {
322 0 : ereport(ERROR,
323 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
324 : errmsg("database system identifier differs between the primary and standby"),
325 : errdetail("The primary's identifier is %s, the standby's identifier is %s.",
326 : primary_sysid, standby_sysid)));
327 : }
328 :
329 : /*
330 : * Confirm that the current timeline of the primary is the same or
331 : * ahead of ours.
332 : */
333 161 : if (primaryTLI < startpointTLI)
334 0 : ereport(ERROR,
335 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
336 : errmsg("highest timeline %u of the primary is behind recovery timeline %u",
337 : primaryTLI, startpointTLI)));
338 :
339 : /*
340 : * Get any missing history files. We do this always, even when we're
341 : * not interested in that timeline, so that if we're promoted to
342 : * become the primary later on, we don't select the same timeline that
343 : * was already used in the current primary. This isn't bullet-proof -
344 : * you'll need some external software to manage your cluster if you
345 : * need to ensure that a unique timeline id is chosen in every case,
346 : * but let's avoid the confusion of timeline id collisions where we
347 : * can.
348 : */
349 161 : WalRcvFetchTimeLineHistoryFiles(startpointTLI, primaryTLI);
350 :
351 : /*
352 : * Create temporary replication slot if requested, and update slot
353 : * name in shared memory. (Note the slot name cannot already be set
354 : * in this case.)
355 : */
356 161 : if (is_temp_slot)
357 : {
358 0 : snprintf(slotname, sizeof(slotname),
359 : "pg_walreceiver_%lld",
360 0 : (long long int) walrcv_get_backend_pid(wrconn));
361 :
362 0 : walrcv_create_slot(wrconn, slotname, true, false, false, 0, NULL);
363 :
364 0 : SpinLockAcquire(&walrcv->mutex);
365 0 : strlcpy(walrcv->slotname, slotname, NAMEDATALEN);
366 0 : SpinLockRelease(&walrcv->mutex);
367 : }
368 :
369 : /*
370 : * Start streaming.
371 : *
372 : * We'll try to start at the requested starting point and timeline,
373 : * even if it's different from the server's latest timeline. In case
374 : * we've already reached the end of the old timeline, the server will
375 : * finish the streaming immediately, and we will go back to await
376 : * orders from the startup process. If recovery_target_timeline is
377 : * 'latest', the startup process will scan pg_wal and find the new
378 : * history file, bump recovery target timeline, and ask us to restart
379 : * on the new timeline.
380 : */
381 161 : options.logical = false;
382 161 : options.startpoint = startpoint;
383 161 : options.slotname = slotname[0] != '\0' ? slotname : NULL;
384 161 : options.proto.physical.startpointTLI = startpointTLI;
385 161 : if (walrcv_startstreaming(wrconn, &options))
386 : {
387 159 : if (first_stream)
388 147 : ereport(LOG,
389 : errmsg("started streaming WAL from primary at %X/%08X on timeline %u",
390 : LSN_FORMAT_ARGS(startpoint), startpointTLI));
391 : else
392 12 : ereport(LOG,
393 : errmsg("restarted WAL streaming at %X/%08X on timeline %u",
394 : LSN_FORMAT_ARGS(startpoint), startpointTLI));
395 159 : first_stream = false;
396 :
397 : /*
398 : * Switch to STREAMING after a successful connection if current
399 : * state is CONNECTING. This switch happens after an initial
400 : * startup, or after a restart as determined by
401 : * WalRcvWaitForStartPosition().
402 : */
403 159 : SpinLockAcquire(&walrcv->mutex);
404 159 : if (walrcv->walRcvState == WALRCV_CONNECTING)
405 159 : walrcv->walRcvState = WALRCV_STREAMING;
406 159 : SpinLockRelease(&walrcv->mutex);
407 :
408 : /* Initialize LogstreamResult and buffers for processing messages */
409 159 : LogstreamResult.Write = LogstreamResult.Flush = GetXLogReplayRecPtr(NULL);
410 159 : initStringInfo(&reply_message);
411 :
412 : /* Initialize nap wakeup times. */
413 159 : now = GetCurrentTimestamp();
414 795 : for (int i = 0; i < NUM_WALRCV_WAKEUPS; ++i)
415 636 : WalRcvComputeNextWakeup(i, now);
416 :
417 : /* Send initial reply/feedback messages. */
418 159 : XLogWalRcvSendReply(true, false, false);
419 159 : XLogWalRcvSendHSFeedback(true);
420 :
421 : /* Loop until end-of-streaming or error */
422 : for (;;)
423 57836 : {
424 : char *buf;
425 : int len;
426 57995 : bool endofwal = false;
427 57995 : pgsocket wait_fd = PGINVALID_SOCKET;
428 : int rc;
429 : TimestampTz nextWakeup;
430 : long nap;
431 :
432 : /*
433 : * Exit walreceiver if we're not in recovery. This should not
434 : * happen, but cross-check the status here.
435 : */
436 57995 : if (!RecoveryInProgress())
437 0 : ereport(FATAL,
438 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
439 : errmsg("cannot continue WAL streaming, recovery has already ended")));
440 :
441 : /* Process any requests or signals received recently */
442 57995 : CHECK_FOR_INTERRUPTS();
443 :
444 57995 : if (ConfigReloadPending)
445 : {
446 28 : ConfigReloadPending = false;
447 28 : ProcessConfigFile(PGC_SIGHUP);
448 : /* recompute wakeup times */
449 28 : now = GetCurrentTimestamp();
450 140 : for (int i = 0; i < NUM_WALRCV_WAKEUPS; ++i)
451 112 : WalRcvComputeNextWakeup(i, now);
452 28 : XLogWalRcvSendHSFeedback(true);
453 : }
454 :
455 : /* See if we can read data immediately */
456 57995 : len = walrcv_receive(wrconn, &buf, &wait_fd);
457 57966 : if (len != 0)
458 : {
459 : /*
460 : * Process the received data, and any subsequent data we
461 : * can read without blocking.
462 : */
463 : for (;;)
464 : {
465 147489 : if (len > 0)
466 : {
467 : /*
468 : * Something was received from primary, so adjust
469 : * the ping and terminate wakeup times.
470 : */
471 106591 : now = GetCurrentTimestamp();
472 106591 : WalRcvComputeNextWakeup(WALRCV_WAKEUP_TERMINATE,
473 : now);
474 106591 : WalRcvComputeNextWakeup(WALRCV_WAKEUP_PING, now);
475 106591 : XLogWalRcvProcessMsg(buf[0], &buf[1], len - 1,
476 : startpointTLI);
477 : }
478 40898 : else if (len == 0)
479 40853 : break;
480 45 : else if (len < 0)
481 : {
482 45 : ereport(LOG,
483 : (errmsg("replication terminated by primary server"),
484 : errdetail("End of WAL reached on timeline %u at %X/%08X.",
485 : startpointTLI,
486 : LSN_FORMAT_ARGS(LogstreamResult.Write))));
487 45 : endofwal = true;
488 45 : break;
489 : }
490 106591 : len = walrcv_receive(wrconn, &buf, &wait_fd);
491 : }
492 :
493 : /* Let the primary know that we received some data. */
494 40898 : XLogWalRcvSendReply(false, false, false);
495 :
496 : /*
497 : * If we've written some records, flush them to disk and
498 : * let the startup process and primary server know about
499 : * them.
500 : */
501 40897 : XLogWalRcvFlush(false, startpointTLI);
502 : }
503 :
504 : /* Check if we need to exit the streaming loop. */
505 57964 : if (endofwal)
506 44 : break;
507 :
508 : /* Find the soonest wakeup time, to limit our nap. */
509 57920 : nextWakeup = TIMESTAMP_INFINITY;
510 289600 : for (int i = 0; i < NUM_WALRCV_WAKEUPS; ++i)
511 231680 : nextWakeup = Min(wakeup[i], nextWakeup);
512 :
513 : /* Calculate the nap time, clamping as necessary. */
514 57920 : now = GetCurrentTimestamp();
515 57920 : nap = TimestampDifferenceMilliseconds(now, nextWakeup);
516 :
517 : /*
518 : * Ideally we would reuse a WaitEventSet object repeatedly
519 : * here to avoid the overheads of WaitLatchOrSocket on epoll
520 : * systems, but we can't be sure that libpq (or any other
521 : * walreceiver implementation) has the same socket (even if
522 : * the fd is the same number, it may have been closed and
523 : * reopened since the last time). In future, if there is a
524 : * function for removing sockets from WaitEventSet, then we
525 : * could add and remove just the socket each time, potentially
526 : * avoiding some system calls.
527 : */
528 : Assert(wait_fd != PGINVALID_SOCKET);
529 57920 : rc = WaitLatchOrSocket(MyLatch,
530 : WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE |
531 : WL_TIMEOUT | WL_LATCH_SET,
532 : wait_fd,
533 : nap,
534 : WAIT_EVENT_WAL_RECEIVER_MAIN);
535 57920 : if (rc & WL_LATCH_SET)
536 : {
537 13585 : ResetLatch(MyLatch);
538 13585 : CHECK_FOR_INTERRUPTS();
539 :
540 13501 : if (walrcv->apply_reply_requested)
541 : {
542 : /*
543 : * The recovery process has asked us to send apply
544 : * feedback now. Make sure the flag is really set to
545 : * false in shared memory before sending the reply, so
546 : * we don't miss a new request for a reply.
547 : */
548 13431 : walrcv->apply_reply_requested = false;
549 13431 : pg_memory_barrier();
550 13431 : XLogWalRcvSendReply(false, false, true);
551 : }
552 : }
553 57836 : if (rc & WL_TIMEOUT)
554 : {
555 : /*
556 : * We didn't receive anything new. If we haven't heard
557 : * anything from the server for more than
558 : * wal_receiver_timeout / 2, ping the server. Also, if
559 : * it's been longer than wal_receiver_status_interval
560 : * since the last update we sent, send a status update to
561 : * the primary anyway, to report any progress in applying
562 : * WAL.
563 : */
564 7 : bool requestReply = false;
565 :
566 : /*
567 : * Report pending statistics to the cumulative stats
568 : * system. This location is useful for the report as it
569 : * is not within a tight loop in the WAL receiver, to
570 : * avoid bloating pgstats with requests, while also making
571 : * sure that the reports happen each time a status update
572 : * is sent.
573 : */
574 7 : pgstat_report_wal(false);
575 :
576 : /*
577 : * Check if time since last receive from primary has
578 : * reached the configured limit.
579 : */
580 7 : now = GetCurrentTimestamp();
581 7 : if (now >= wakeup[WALRCV_WAKEUP_TERMINATE])
582 0 : ereport(ERROR,
583 : (errcode(ERRCODE_CONNECTION_FAILURE),
584 : errmsg("terminating walreceiver due to timeout")));
585 :
586 : /*
587 : * If we didn't receive anything new for half of receiver
588 : * replication timeout, then ping the server.
589 : */
590 7 : if (now >= wakeup[WALRCV_WAKEUP_PING])
591 : {
592 0 : requestReply = true;
593 0 : wakeup[WALRCV_WAKEUP_PING] = TIMESTAMP_INFINITY;
594 : }
595 :
596 7 : XLogWalRcvSendReply(requestReply, requestReply, false);
597 7 : XLogWalRcvSendHSFeedback(false);
598 : }
599 : }
600 :
601 : /*
602 : * The backend finished streaming. Exit streaming COPY-mode from
603 : * our side, too.
604 : */
605 44 : walrcv_endstreaming(wrconn, &primaryTLI);
606 :
607 : /*
608 : * If the server had switched to a new timeline that we didn't
609 : * know about when we began streaming, fetch its timeline history
610 : * file now.
611 : */
612 12 : WalRcvFetchTimeLineHistoryFiles(startpointTLI, primaryTLI);
613 : }
614 : else
615 0 : ereport(LOG,
616 : (errmsg("primary server contains no more WAL on requested timeline %u",
617 : startpointTLI)));
618 :
619 : /*
620 : * End of WAL reached on the requested timeline. Close the last
621 : * segment, and wait for new orders from the startup process.
622 : */
623 12 : if (recvFile >= 0)
624 : {
625 : char xlogfname[MAXFNAMELEN];
626 :
627 11 : XLogWalRcvFlush(false, startpointTLI);
628 11 : XLogFileName(xlogfname, recvFileTLI, recvSegNo, wal_segment_size);
629 11 : if (close(recvFile) != 0)
630 0 : ereport(PANIC,
631 : (errcode_for_file_access(),
632 : errmsg("could not close WAL segment %s: %m",
633 : xlogfname)));
634 :
635 : /*
636 : * Create .done file forcibly to prevent the streamed segment from
637 : * being archived later.
638 : */
639 11 : if (XLogArchiveMode != ARCHIVE_MODE_ALWAYS)
640 11 : XLogArchiveForceDone(xlogfname);
641 : else
642 0 : XLogArchiveNotify(xlogfname);
643 : }
644 12 : recvFile = -1; /* no segment file is open from here on */
645 :
646 12 : elog(DEBUG1, "walreceiver ended streaming and awaits new instructions");
647 12 : WalRcvWaitForStartPosition(&startpoint, &startpointTLI);
648 : }
649 : /* not reached */
650 : }
651 :
652 : /*
653 : * Wait for startup process to set receiveStart and receiveStartTLI. Moves the shared state to WALRCV_WAITING (clearing receiveStart/receiveStartTLI), wakes up recovery, then sleeps on our latch until the state becomes WALRCV_RESTARTING; at that point the new position is copied into *startpoint / *startpointTLI and the state is set to WALRCV_CONNECTING. Exits the process if the state is WALRCV_STOPPING; any other unexpected state at entry is a FATAL error.
654 : */
655 : static void
656 12 : WalRcvWaitForStartPosition(XLogRecPtr *startpoint, TimeLineID *startpointTLI)
657 : {
658 12 : WalRcvData *walrcv = WalRcv;
659 : int state;
660 :
661 12 : SpinLockAcquire(&walrcv->mutex);
662 12 : state = walrcv->walRcvState;
663 12 : if (state != WALRCV_STREAMING && state != WALRCV_CONNECTING)
664 : {
665 0 : SpinLockRelease(&walrcv->mutex);
666 0 : if (state == WALRCV_STOPPING)
667 0 : proc_exit(0);
668 : else
669 0 : elog(FATAL, "unexpected walreceiver state");
670 : }
671 12 : walrcv->walRcvState = WALRCV_WAITING;
672 12 : walrcv->receiveStart = InvalidXLogRecPtr;
673 12 : walrcv->receiveStartTLI = 0;
674 12 : SpinLockRelease(&walrcv->mutex);
675 :
676 12 : set_ps_display("idle");
677 :
678 : /*
679 : * nudge startup process to notice that we've stopped streaming and are
680 : * now waiting for instructions.
681 : */
682 12 : WakeupRecovery();
683 : for (;;)
684 : {
685 24 : ResetLatch(MyLatch);
686 :
687 24 : CHECK_FOR_INTERRUPTS();
688 :
689 24 : SpinLockAcquire(&walrcv->mutex);
690 : Assert(walrcv->walRcvState == WALRCV_RESTARTING ||
691 : walrcv->walRcvState == WALRCV_WAITING ||
692 : walrcv->walRcvState == WALRCV_STOPPING);
693 24 : if (walrcv->walRcvState == WALRCV_RESTARTING)
694 : {
695 : /*
696 : * No need to handle changes in primary_conninfo or
697 : * primary_slot_name here. Startup process will signal us to
698 : * terminate in case those change.
699 : */
700 12 : *startpoint = walrcv->receiveStart;
701 12 : *startpointTLI = walrcv->receiveStartTLI;
702 12 : walrcv->walRcvState = WALRCV_CONNECTING;
703 12 : SpinLockRelease(&walrcv->mutex);
704 12 : break;
705 : }
706 12 : if (walrcv->walRcvState == WALRCV_STOPPING)
707 : {
708 : /*
709 : * We should've received SIGTERM if the startup process wants us
710 : * to die, but might as well check it here too. NOTE(review): this path calls plain exit(1) while the entry check above uses proc_exit(0) -- confirm the difference is intended.
711 : */
712 0 : SpinLockRelease(&walrcv->mutex);
713 0 : exit(1);
714 : }
715 12 : SpinLockRelease(&walrcv->mutex);
716 :
717 12 : (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
718 : WAIT_EVENT_WAL_RECEIVER_WAIT_START);
719 : }
720 :
721 12 : if (update_process_title)
722 : {
723 : char activitymsg[50];
724 :
725 12 : snprintf(activitymsg, sizeof(activitymsg), "restarting at %X/%08X",
726 12 : LSN_FORMAT_ARGS(*startpoint));
727 12 : set_ps_display(activitymsg);
728 : }
729 12 : }
730 :
731 : /*
732 : * Fetch any missing timeline history files between 'first' and 'last'
733 : * (inclusive) from the server. Timeline 1 and timelines whose history file already exists locally are skipped; each fetched file is written to pg_wal and its archive status is set. Raises ERROR if the primary reports a file name other than the expected one.
734 : */
735 : static void
736 173 : WalRcvFetchTimeLineHistoryFiles(TimeLineID first, TimeLineID last)
737 : {
738 : TimeLineID tli;
739 :
740 368 : for (tli = first; tli <= last; tli++)
741 : {
742 : /* there's no history file for timeline 1 */
743 195 : if (tli != 1 && !existsTimeLineHistory(tli))
744 : {
745 : char *fname;
746 : char *content;
747 : int len;
748 : char expectedfname[MAXFNAMELEN];
749 :
750 11 : ereport(LOG,
751 : (errmsg("fetching timeline history file for timeline %u from primary server",
752 : tli)));
753 :
754 11 : walrcv_readtimelinehistoryfile(wrconn, tli, &fname, &content, &len);
755 :
756 : /*
757 : * Check that the filename on the primary matches what we
758 : * calculated ourselves. This is just a sanity check, it should
759 : * always match.
760 : */
761 11 : TLHistoryFileName(expectedfname, tli);
762 11 : if (strcmp(fname, expectedfname) != 0)
763 0 : ereport(ERROR,
764 : (errcode(ERRCODE_PROTOCOL_VIOLATION),
765 : errmsg_internal("primary reported unexpected file name for timeline history file of timeline %u",
766 : tli)));
767 :
768 : /*
769 : * Write the file to pg_wal.
770 : */
771 11 : writeTimeLineHistoryFile(tli, content, len);
772 :
773 : /*
774 : * Mark the streamed history file as ready for archiving if
775 : * archive_mode is always; in every other mode call XLogArchiveForceDone() on it instead (presumably so it is never archived from here -- confirm against xlogarchive.c).
776 : */
777 11 : if (XLogArchiveMode != ARCHIVE_MODE_ALWAYS)
778 11 : XLogArchiveForceDone(fname);
779 : else
780 0 : XLogArchiveNotify(fname);
781 :
782 11 : pfree(fname);
783 11 : pfree(content);
784 : }
785 : }
786 173 : }
787 :
788 : /*
789 : * Mark us as STOPPED in shared memory at exit. Registered with on_shmem_exit() by WalReceiverMain(); 'arg' is a pointer to that function's startpointTLI and 'code' is not used. Flushes received WAL first, then resets the shared state (state, pid, procno, ready_to_display), broadcasts walRcvStoppedCV, disconnects from the primary if connected, and wakes up recovery.
790 : */
791 : static void
792 259 : WalRcvDie(int code, Datum arg)
793 : {
794 259 : WalRcvData *walrcv = WalRcv;
795 259 : TimeLineID *startpointTLI_p = (TimeLineID *) DatumGetPointer(arg);
796 :
797 : Assert(*startpointTLI_p != 0);
798 :
799 : /* Ensure that all WAL records received are flushed to disk */
800 259 : XLogWalRcvFlush(true, *startpointTLI_p);
801 :
802 : /* Mark ourselves inactive in shared memory */
803 259 : SpinLockAcquire(&walrcv->mutex);
804 : Assert(walrcv->walRcvState == WALRCV_STREAMING ||
805 : walrcv->walRcvState == WALRCV_CONNECTING ||
806 : walrcv->walRcvState == WALRCV_RESTARTING ||
807 : walrcv->walRcvState == WALRCV_STARTING ||
808 : walrcv->walRcvState == WALRCV_WAITING ||
809 : walrcv->walRcvState == WALRCV_STOPPING);
810 : Assert(walrcv->pid == MyProcPid);
811 259 : walrcv->walRcvState = WALRCV_STOPPED;
812 259 : walrcv->pid = 0;
813 259 : walrcv->procno = INVALID_PROC_NUMBER;
814 259 : walrcv->ready_to_display = false;
815 259 : SpinLockRelease(&walrcv->mutex);
816 :
817 259 : ConditionVariableBroadcast(&walrcv->walRcvStoppedCV);
818 :
819 : /* Terminate the connection gracefully. */
820 259 : if (wrconn != NULL)
821 149 : walrcv_disconnect(wrconn);
822 :
823 : /* Wake up the startup process to notice promptly that we're gone */
824 259 : WakeupRecovery();
825 259 : }
826 :
827 : /*
828 : * Accept the message from XLOG stream, and process it. 'type' selects the message kind and 'buf'/'len' hold the bytes that follow it. A WAL data message starts with three 64-bit fields (dataStart, walEnd, sendTime) and the remaining bytes are passed to XLogWalRcvWrite() for timeline 'tli'; a keepalive must be exactly walEnd, sendTime and a one-byte reply-requested flag, and a reply is sent immediately when that flag is set. Too-short, wrongly sized or unknown messages raise ERROR with ERRCODE_PROTOCOL_VIOLATION.
829 : */
830 : static void
831 106591 : XLogWalRcvProcessMsg(unsigned char type, char *buf, Size len, TimeLineID tli)
832 : {
833 : int hdrlen;
834 : XLogRecPtr dataStart;
835 : XLogRecPtr walEnd;
836 : TimestampTz sendTime;
837 : bool replyRequested;
838 :
839 106591 : switch (type)
840 : {
841 106340 : case PqReplMsg_WALData:
842 : {
843 : StringInfoData incoming_message;
844 :
845 106340 : hdrlen = sizeof(int64) + sizeof(int64) + sizeof(int64);
846 106340 : if (len < hdrlen)
847 0 : ereport(ERROR,
848 : (errcode(ERRCODE_PROTOCOL_VIOLATION),
849 : errmsg_internal("invalid WAL message received from primary")));
850 :
851 : /* initialize a StringInfo with the given buffer */
852 106340 : initReadOnlyStringInfo(&incoming_message, buf, hdrlen);
853 :
854 : /* read the fields */
855 106340 : dataStart = pq_getmsgint64(&incoming_message);
856 106340 : walEnd = pq_getmsgint64(&incoming_message);
857 106340 : sendTime = pq_getmsgint64(&incoming_message);
858 106340 : ProcessWalSndrMessage(walEnd, sendTime);
859 :
860 106340 : buf += hdrlen;
861 106340 : len -= hdrlen;
862 106340 : XLogWalRcvWrite(buf, len, dataStart, tli);
863 106340 : break;
864 : }
865 251 : case PqReplMsg_Keepalive:
866 : {
867 : StringInfoData incoming_message;
868 :
869 251 : hdrlen = sizeof(int64) + sizeof(int64) + sizeof(char);
870 251 : if (len != hdrlen)
871 0 : ereport(ERROR,
872 : (errcode(ERRCODE_PROTOCOL_VIOLATION),
873 : errmsg_internal("invalid keepalive message received from primary")));
874 :
875 : /* initialize a StringInfo with the given buffer */
876 251 : initReadOnlyStringInfo(&incoming_message, buf, hdrlen);
877 :
878 : /* read the fields */
879 251 : walEnd = pq_getmsgint64(&incoming_message);
880 251 : sendTime = pq_getmsgint64(&incoming_message);
881 251 : replyRequested = pq_getmsgbyte(&incoming_message);
882 :
883 251 : ProcessWalSndrMessage(walEnd, sendTime);
884 :
885 : /* If the primary requested a reply, send one immediately */
886 251 : if (replyRequested)
887 251 : XLogWalRcvSendReply(true, false, false);
888 251 : break;
889 : }
890 0 : default:
891 0 : ereport(ERROR,
892 : (errcode(ERRCODE_PROTOCOL_VIOLATION),
893 : errmsg_internal("invalid replication message type %d",
894 : type)));
895 : }
896 106591 : }
897 :
898 : /*
899 : * Write XLOG data to disk.
900 : */
901 : static void
902 106340 : XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr, TimeLineID tli)
903 : {
904 : int startoff;
905 : int byteswritten;
906 : instr_time start;
907 :
908 : Assert(tli != 0);
909 :
910 213089 : while (nbytes > 0)
911 : {
912 : int segbytes;
913 :
914 : /* Close the current segment if it's completed */
915 106749 : if (recvFile >= 0 && !XLByteInSeg(recptr, recvSegNo, wal_segment_size))
916 409 : XLogWalRcvClose(recptr, tli);
917 :
918 106749 : if (recvFile < 0)
919 : {
920 : /* Create/use new log file */
921 884 : XLByteToSeg(recptr, recvSegNo, wal_segment_size);
922 884 : recvFile = XLogFileInit(recvSegNo, tli);
923 884 : recvFileTLI = tli;
924 : }
925 :
926 : /* Calculate the start offset of the received logs */
927 106749 : startoff = XLogSegmentOffset(recptr, wal_segment_size);
928 :
929 106749 : if (startoff + nbytes > wal_segment_size)
930 409 : segbytes = wal_segment_size - startoff;
931 : else
932 106340 : segbytes = nbytes;
933 :
934 : /* OK to write the logs */
935 106749 : errno = 0;
936 :
937 : /*
938 : * Measure I/O timing to write WAL data, for pg_stat_io.
939 : */
940 106749 : start = pgstat_prepare_io_time(track_wal_io_timing);
941 :
942 106749 : pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
943 106749 : byteswritten = pg_pwrite(recvFile, buf, segbytes, (pgoff_t) startoff);
944 106749 : pgstat_report_wait_end();
945 :
946 106749 : pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL,
947 : IOOP_WRITE, start, 1, byteswritten);
948 :
949 106749 : if (byteswritten <= 0)
950 : {
951 : char xlogfname[MAXFNAMELEN];
952 : int save_errno;
953 :
954 : /* if write didn't set errno, assume no disk space */
955 0 : if (errno == 0)
956 0 : errno = ENOSPC;
957 :
958 0 : save_errno = errno;
959 0 : XLogFileName(xlogfname, recvFileTLI, recvSegNo, wal_segment_size);
960 0 : errno = save_errno;
961 0 : ereport(PANIC,
962 : (errcode_for_file_access(),
963 : errmsg("could not write to WAL segment %s "
964 : "at offset %d, length %d: %m",
965 : xlogfname, startoff, segbytes)));
966 : }
967 :
968 : /* Update state for write */
969 106749 : recptr += byteswritten;
970 :
971 106749 : nbytes -= byteswritten;
972 106749 : buf += byteswritten;
973 :
974 106749 : LogstreamResult.Write = recptr;
975 : }
976 :
977 : /* Update shared-memory status */
978 106340 : pg_atomic_write_u64(&WalRcv->writtenUpto, LogstreamResult.Write);
979 :
980 : /*
981 : * If we wrote an LSN that someone was waiting for, notify the waiters.
982 : */
983 212680 : if (waitLSNState &&
984 106340 : (LogstreamResult.Write >=
985 106340 : pg_atomic_read_u64(&waitLSNState->minWaitedLSN[WAIT_LSN_TYPE_STANDBY_WRITE])))
986 2 : WaitLSNWakeup(WAIT_LSN_TYPE_STANDBY_WRITE, LogstreamResult.Write);
987 :
988 : /*
989 : * Close the current segment if it's fully written up in the last cycle of
990 : * the loop, to create its archive notification file soon. Otherwise WAL
991 : * archiving of the segment will be delayed until any data in the next
992 : * segment is received and written.
993 : */
994 106340 : if (recvFile >= 0 && !XLByteInSeg(recptr, recvSegNo, wal_segment_size))
995 340 : XLogWalRcvClose(recptr, tli);
996 106340 : }
997 :
998 : /*
999 : * Flush the log to disk.
1000 : *
1001 : * If we're in the midst of dying, it's unwise to do anything that might throw
1002 : * an error, so we skip sending a reply in that case.
1003 : */
1004 : static void
1005 41916 : XLogWalRcvFlush(bool dying, TimeLineID tli)
1006 : {
1007 : Assert(tli != 0);
1008 :
1009 41916 : if (LogstreamResult.Flush < LogstreamResult.Write)
1010 : {
1011 41178 : WalRcvData *walrcv = WalRcv;
1012 :
1013 41178 : issue_xlog_fsync(recvFile, recvSegNo, tli);
1014 :
1015 41178 : LogstreamResult.Flush = LogstreamResult.Write;
1016 :
1017 : /* Update shared-memory status */
1018 41178 : SpinLockAcquire(&walrcv->mutex);
1019 41178 : if (walrcv->flushedUpto < LogstreamResult.Flush)
1020 : {
1021 41178 : walrcv->latestChunkStart = walrcv->flushedUpto;
1022 41178 : walrcv->flushedUpto = LogstreamResult.Flush;
1023 41178 : walrcv->receivedTLI = tli;
1024 : }
1025 41178 : SpinLockRelease(&walrcv->mutex);
1026 :
1027 : /*
1028 : * If we flushed an LSN that someone was waiting for, notify the
1029 : * waiters.
1030 : */
1031 82356 : if (waitLSNState &&
1032 41178 : (LogstreamResult.Flush >=
1033 41178 : pg_atomic_read_u64(&waitLSNState->minWaitedLSN[WAIT_LSN_TYPE_STANDBY_FLUSH])))
1034 2 : WaitLSNWakeup(WAIT_LSN_TYPE_STANDBY_FLUSH, LogstreamResult.Flush);
1035 :
1036 : /* Signal the startup process and walsender that new WAL has arrived */
1037 41178 : WakeupRecovery();
1038 41178 : if (AllowCascadeReplication())
1039 41178 : WalSndWakeup(true, false);
1040 :
1041 : /* Report XLOG streaming progress in PS display */
1042 41178 : if (update_process_title)
1043 : {
1044 : char activitymsg[50];
1045 :
1046 41178 : snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%08X",
1047 41178 : LSN_FORMAT_ARGS(LogstreamResult.Write));
1048 41178 : set_ps_display(activitymsg);
1049 : }
1050 :
1051 : /* Also let the primary know that we made some progress */
1052 41178 : if (!dying)
1053 : {
1054 41176 : XLogWalRcvSendReply(false, false, false);
1055 41176 : XLogWalRcvSendHSFeedback(false);
1056 : }
1057 : }
1058 41916 : }
1059 :
1060 : /*
1061 : * Close the current segment.
1062 : *
1063 : * Flush the segment to disk before closing it. Otherwise we have to
1064 : * reopen and fsync it later.
1065 : *
1066 : * Create an archive notification file since the segment is known completed.
1067 : */
1068 : static void
1069 749 : XLogWalRcvClose(XLogRecPtr recptr, TimeLineID tli)
1070 : {
1071 : char xlogfname[MAXFNAMELEN];
1072 :
1073 : Assert(recvFile >= 0 && !XLByteInSeg(recptr, recvSegNo, wal_segment_size));
1074 : Assert(tli != 0);
1075 :
1076 : /*
1077 : * fsync() and close current file before we switch to next one. We would
1078 : * otherwise have to reopen this file to fsync it later
1079 : */
1080 749 : XLogWalRcvFlush(false, tli);
1081 :
1082 749 : XLogFileName(xlogfname, recvFileTLI, recvSegNo, wal_segment_size);
1083 :
1084 : /*
1085 : * XLOG segment files will be re-read by recovery in startup process soon,
1086 : * so we don't advise the OS to release cache pages associated with the
1087 : * file like XLogFileClose() does.
1088 : */
1089 749 : if (close(recvFile) != 0)
1090 0 : ereport(PANIC,
1091 : (errcode_for_file_access(),
1092 : errmsg("could not close WAL segment %s: %m",
1093 : xlogfname)));
1094 :
1095 : /*
1096 : * Create .done file forcibly to prevent the streamed segment from being
1097 : * archived later.
1098 : */
1099 749 : if (XLogArchiveMode != ARCHIVE_MODE_ALWAYS)
1100 749 : XLogArchiveForceDone(xlogfname);
1101 : else
1102 0 : XLogArchiveNotify(xlogfname);
1103 :
1104 749 : recvFile = -1;
1105 749 : }
1106 :
1107 : /*
1108 : * Send reply message to primary, indicating our current WAL locations and
1109 : * time.
1110 : *
1111 : * The message is sent if 'force' is set, if enough time has passed since the
1112 : * last update to reach wal_receiver_status_interval, or if WAL locations have
1113 : * advanced since the previous status update. If wal_receiver_status_interval
1114 : * is disabled and 'force' is false, this function does nothing. Set 'force' to
1115 : * send the message unconditionally.
1116 : *
1117 : * Whether WAL locations are considered "advanced" depends on 'checkApply'.
1118 : * If 'checkApply' is false, only the write and flush locations are checked.
1119 : * This should be used when the call is triggered by write/flush activity
1120 : * (e.g., after walreceiver writes or flushes WAL), and avoids the
1121 : * apply-location check, which requires a spinlock. If 'checkApply' is true,
1122 : * the apply location is also considered. This should be used when the apply
1123 : * location is expected to advance (e.g., when the startup process requests
1124 : * an apply notification).
1125 : *
1126 : * If 'requestReply' is true, requests the server to reply immediately upon
1127 : * receiving this message. This is used for heartbeats, when approaching
1128 : * wal_receiver_timeout.
1129 : */
1130 : static void
1131 95922 : XLogWalRcvSendReply(bool force, bool requestReply, bool checkApply)
1132 : {
1133 : static XLogRecPtr writePtr = InvalidXLogRecPtr;
1134 : static XLogRecPtr flushPtr = InvalidXLogRecPtr;
1135 : static XLogRecPtr applyPtr = InvalidXLogRecPtr;
1136 95922 : XLogRecPtr latestApplyPtr = InvalidXLogRecPtr;
1137 : TimestampTz now;
1138 :
1139 : /*
1140 : * If the user doesn't want status to be reported to the primary, be sure
1141 : * to exit before doing anything at all.
1142 : */
1143 95922 : if (!force && wal_receiver_status_interval <= 0)
1144 0 : return;
1145 :
1146 : /* Get current timestamp. */
1147 95922 : now = GetCurrentTimestamp();
1148 :
1149 : /*
1150 : * We can compare the write and flush positions to the last message we
1151 : * sent without taking any lock, but the apply position requires a spin
1152 : * lock, so we don't check that unless it is expected to advance since the
1153 : * previous update, i.e., when 'checkApply' is true.
1154 : */
1155 95922 : if (!force && now < wakeup[WALRCV_WAKEUP_REPLY])
1156 : {
1157 95510 : if (checkApply)
1158 13431 : latestApplyPtr = GetXLogReplayRecPtr(NULL);
1159 :
1160 95510 : if (writePtr == LogstreamResult.Write
1161 54175 : && flushPtr == LogstreamResult.Flush
1162 13748 : && (!checkApply || applyPtr == latestApplyPtr))
1163 3318 : return;
1164 : }
1165 :
1166 : /* Make sure we wake up when it's time to send another reply. */
1167 92604 : WalRcvComputeNextWakeup(WALRCV_WAKEUP_REPLY, now);
1168 :
1169 : /* Construct a new message */
1170 92604 : writePtr = LogstreamResult.Write;
1171 92604 : flushPtr = LogstreamResult.Flush;
1172 92604 : applyPtr = (latestApplyPtr == InvalidXLogRecPtr) ?
1173 92604 : GetXLogReplayRecPtr(NULL) : latestApplyPtr;
1174 :
1175 92604 : resetStringInfo(&reply_message);
1176 92604 : pq_sendbyte(&reply_message, PqReplMsg_StandbyStatusUpdate);
1177 92604 : pq_sendint64(&reply_message, writePtr);
1178 92604 : pq_sendint64(&reply_message, flushPtr);
1179 92604 : pq_sendint64(&reply_message, applyPtr);
1180 92604 : pq_sendint64(&reply_message, GetCurrentTimestamp());
1181 92604 : pq_sendbyte(&reply_message, requestReply ? 1 : 0);
1182 :
1183 : /* Send it */
1184 92604 : elog(DEBUG2, "sending write %X/%08X flush %X/%08X apply %X/%08X%s",
1185 : LSN_FORMAT_ARGS(writePtr),
1186 : LSN_FORMAT_ARGS(flushPtr),
1187 : LSN_FORMAT_ARGS(applyPtr),
1188 : requestReply ? " (reply requested)" : "");
1189 :
1190 92604 : walrcv_send(wrconn, reply_message.data, reply_message.len);
1191 : }
1192 :
1193 : /*
1194 : * Send hot standby feedback message to primary, plus the current time,
1195 : * in case they don't have a watch.
1196 : *
1197 : * If the user disables feedback, send one final message to tell sender
1198 : * to forget about the xmin on this standby. We also send this message
1199 : * on first connect because a previous connection might have set xmin
1200 : * on a replication slot. (If we're not using a slot it's harmless to
1201 : * send a feedback message explicitly setting InvalidTransactionId).
1202 : */
1203 : static void
1204 41370 : XLogWalRcvSendHSFeedback(bool immed)
1205 : {
1206 : TimestampTz now;
1207 : FullTransactionId nextFullXid;
1208 : TransactionId nextXid;
1209 : uint32 xmin_epoch,
1210 : catalog_xmin_epoch;
1211 : TransactionId xmin,
1212 : catalog_xmin;
1213 :
1214 : /* initially true so we always send at least one feedback message */
1215 : static bool primary_has_standby_xmin = true;
1216 :
1217 : /*
1218 : * If the user doesn't want status to be reported to the primary, be sure
1219 : * to exit before doing anything at all.
1220 : */
1221 41370 : if ((wal_receiver_status_interval <= 0 || !hot_standby_feedback) &&
1222 40779 : !primary_has_standby_xmin)
1223 41197 : return;
1224 :
1225 : /* Get current timestamp. */
1226 737 : now = GetCurrentTimestamp();
1227 :
1228 : /* Send feedback at most once per wal_receiver_status_interval. */
1229 737 : if (!immed && now < wakeup[WALRCV_WAKEUP_HSFEEDBACK])
1230 563 : return;
1231 :
1232 : /* Make sure we wake up when it's time to send feedback again. */
1233 174 : WalRcvComputeNextWakeup(WALRCV_WAKEUP_HSFEEDBACK, now);
1234 :
1235 : /*
1236 : * If Hot Standby is not yet accepting connections there is nothing to
1237 : * send. Check this after the interval has expired to reduce number of
1238 : * calls.
1239 : *
1240 : * Bailing out here also ensures that we don't send feedback until we've
1241 : * read our own replication slot state, so we don't tell the primary to
1242 : * discard needed xmin or catalog_xmin from any slots that may exist on
1243 : * this replica.
1244 : */
1245 174 : if (!HotStandbyActive())
1246 1 : return;
1247 :
1248 : /*
1249 : * Make the expensive call to get the oldest xmin once we are certain
1250 : * everything else has been checked.
1251 : */
1252 173 : if (hot_standby_feedback)
1253 : {
1254 56 : GetReplicationHorizons(&xmin, &catalog_xmin);
1255 : }
1256 : else
1257 : {
1258 117 : xmin = InvalidTransactionId;
1259 117 : catalog_xmin = InvalidTransactionId;
1260 : }
1261 :
1262 : /*
1263 : * Get epoch and adjust if nextXid and oldestXmin are different sides of
1264 : * the epoch boundary.
1265 : */
1266 173 : nextFullXid = ReadNextFullTransactionId();
1267 173 : nextXid = XidFromFullTransactionId(nextFullXid);
1268 173 : xmin_epoch = EpochFromFullTransactionId(nextFullXid);
1269 173 : catalog_xmin_epoch = xmin_epoch;
1270 173 : if (nextXid < xmin)
1271 0 : xmin_epoch--;
1272 173 : if (nextXid < catalog_xmin)
1273 0 : catalog_xmin_epoch--;
1274 :
1275 173 : elog(DEBUG2, "sending hot standby feedback xmin %u epoch %u catalog_xmin %u catalog_xmin_epoch %u",
1276 : xmin, xmin_epoch, catalog_xmin, catalog_xmin_epoch);
1277 :
1278 : /* Construct the message and send it. */
1279 173 : resetStringInfo(&reply_message);
1280 173 : pq_sendbyte(&reply_message, PqReplMsg_HotStandbyFeedback);
1281 173 : pq_sendint64(&reply_message, GetCurrentTimestamp());
1282 173 : pq_sendint32(&reply_message, xmin);
1283 173 : pq_sendint32(&reply_message, xmin_epoch);
1284 173 : pq_sendint32(&reply_message, catalog_xmin);
1285 173 : pq_sendint32(&reply_message, catalog_xmin_epoch);
1286 173 : walrcv_send(wrconn, reply_message.data, reply_message.len);
1287 173 : if (TransactionIdIsValid(xmin) || TransactionIdIsValid(catalog_xmin))
1288 56 : primary_has_standby_xmin = true;
1289 : else
1290 117 : primary_has_standby_xmin = false;
1291 : }
1292 :
1293 : /*
1294 : * Update shared memory status upon receiving a message from primary.
1295 : *
1296 : * 'walEnd' and 'sendTime' are the end-of-WAL and timestamp of the latest
1297 : * message, reported by primary.
1298 : */
1299 : static void
1300 106591 : ProcessWalSndrMessage(XLogRecPtr walEnd, TimestampTz sendTime)
1301 : {
1302 106591 : WalRcvData *walrcv = WalRcv;
1303 106591 : TimestampTz lastMsgReceiptTime = GetCurrentTimestamp();
1304 :
1305 : /* Update shared-memory status */
1306 106591 : SpinLockAcquire(&walrcv->mutex);
1307 106591 : if (walrcv->latestWalEnd < walEnd)
1308 22354 : walrcv->latestWalEndTime = sendTime;
1309 106591 : walrcv->latestWalEnd = walEnd;
1310 106591 : walrcv->lastMsgSendTime = sendTime;
1311 106591 : walrcv->lastMsgReceiptTime = lastMsgReceiptTime;
1312 106591 : SpinLockRelease(&walrcv->mutex);
1313 :
1314 106591 : if (message_level_is_interesting(DEBUG2))
1315 : {
1316 : char *sendtime;
1317 : char *receipttime;
1318 : int applyDelay;
1319 :
1320 : /* Copy because timestamptz_to_str returns a static buffer */
1321 411 : sendtime = pstrdup(timestamptz_to_str(sendTime));
1322 411 : receipttime = pstrdup(timestamptz_to_str(lastMsgReceiptTime));
1323 411 : applyDelay = GetReplicationApplyDelay();
1324 :
1325 : /* apply delay is not available */
1326 411 : if (applyDelay == -1)
1327 9 : elog(DEBUG2, "sendtime %s receipttime %s replication apply delay (N/A) transfer latency %d ms",
1328 : sendtime,
1329 : receipttime,
1330 : GetReplicationTransferLatency());
1331 : else
1332 402 : elog(DEBUG2, "sendtime %s receipttime %s replication apply delay %d ms transfer latency %d ms",
1333 : sendtime,
1334 : receipttime,
1335 : applyDelay,
1336 : GetReplicationTransferLatency());
1337 :
1338 411 : pfree(sendtime);
1339 411 : pfree(receipttime);
1340 : }
1341 106591 : }
1342 :
1343 : /*
1344 : * Compute the next wakeup time for a given wakeup reason. Can be called to
1345 : * initialize a wakeup time, to adjust it for the next wakeup, or to
1346 : * reinitialize it when GUCs have changed. We ask the caller to pass in the
1347 : * value of "now" because this frequently avoids multiple calls of
1348 : * GetCurrentTimestamp(). It had better be a reasonably up-to-date value
1349 : * though.
1350 : */
1351 : static void
1352 306708 : WalRcvComputeNextWakeup(WalRcvWakeupReason reason, TimestampTz now)
1353 : {
1354 306708 : switch (reason)
1355 : {
1356 106778 : case WALRCV_WAKEUP_TERMINATE:
1357 106778 : if (wal_receiver_timeout <= 0)
1358 0 : wakeup[reason] = TIMESTAMP_INFINITY;
1359 : else
1360 106778 : wakeup[reason] = TimestampTzPlusMilliseconds(now, wal_receiver_timeout);
1361 106778 : break;
1362 106778 : case WALRCV_WAKEUP_PING:
1363 106778 : if (wal_receiver_timeout <= 0)
1364 0 : wakeup[reason] = TIMESTAMP_INFINITY;
1365 : else
1366 106778 : wakeup[reason] = TimestampTzPlusMilliseconds(now, wal_receiver_timeout / 2);
1367 106778 : break;
1368 361 : case WALRCV_WAKEUP_HSFEEDBACK:
1369 361 : if (!hot_standby_feedback || wal_receiver_status_interval <= 0)
1370 254 : wakeup[reason] = TIMESTAMP_INFINITY;
1371 : else
1372 107 : wakeup[reason] = TimestampTzPlusSeconds(now, wal_receiver_status_interval);
1373 361 : break;
1374 92791 : case WALRCV_WAKEUP_REPLY:
1375 92791 : if (wal_receiver_status_interval <= 0)
1376 0 : wakeup[reason] = TIMESTAMP_INFINITY;
1377 : else
1378 92791 : wakeup[reason] = TimestampTzPlusSeconds(now, wal_receiver_status_interval);
1379 92791 : break;
1380 : /* there's intentionally no default: here */
1381 : }
1382 306708 : }
1383 :
1384 : /*
1385 : * Wake up the walreceiver main loop.
1386 : *
1387 : * This is called by the startup process whenever interesting xlog records
1388 : * are applied, so that walreceiver can check if it needs to send an apply
1389 : * notification back to the primary which may be waiting in a COMMIT with
1390 : * synchronous_commit = remote_apply.
1391 : */
1392 : void
1393 13326 : WalRcvRequestApplyReply(void)
1394 : {
1395 : ProcNumber procno;
1396 :
1397 13326 : WalRcv->apply_reply_requested = true;
1398 : /* fetching the proc number is probably atomic, but don't rely on it */
1399 13326 : SpinLockAcquire(&WalRcv->mutex);
1400 13326 : procno = WalRcv->procno;
1401 13326 : SpinLockRelease(&WalRcv->mutex);
1402 13326 : if (procno != INVALID_PROC_NUMBER)
1403 13143 : SetLatch(&GetPGProcByNumber(procno)->procLatch);
1404 13326 : }
1405 :
1406 : /*
1407 : * Return a string constant representing the state. This is used
1408 : * in system functions and views, and should *not* be translated.
1409 : */
1410 : static const char *
1411 13 : WalRcvGetStateString(WalRcvState state)
1412 : {
1413 13 : switch (state)
1414 : {
1415 0 : case WALRCV_STOPPED:
1416 0 : return "stopped";
1417 0 : case WALRCV_STARTING:
1418 0 : return "starting";
1419 0 : case WALRCV_CONNECTING:
1420 0 : return "connecting";
1421 13 : case WALRCV_STREAMING:
1422 13 : return "streaming";
1423 0 : case WALRCV_WAITING:
1424 0 : return "waiting";
1425 0 : case WALRCV_RESTARTING:
1426 0 : return "restarting";
1427 0 : case WALRCV_STOPPING:
1428 0 : return "stopping";
1429 : }
1430 0 : return "UNKNOWN";
1431 : }
1432 :
1433 : /*
1434 : * Returns activity of WAL receiver, including pid, state and xlog locations
1435 : * received from the WAL sender of another server.
1436 : */
1437 : Datum
1438 22 : pg_stat_get_wal_receiver(PG_FUNCTION_ARGS)
1439 : {
1440 : TupleDesc tupdesc;
1441 : Datum *values;
1442 : bool *nulls;
1443 : int pid;
1444 : bool ready_to_display;
1445 : WalRcvState state;
1446 : XLogRecPtr receive_start_lsn;
1447 : TimeLineID receive_start_tli;
1448 : XLogRecPtr written_lsn;
1449 : XLogRecPtr flushed_lsn;
1450 : TimeLineID received_tli;
1451 : TimestampTz last_send_time;
1452 : TimestampTz last_receipt_time;
1453 : XLogRecPtr latest_end_lsn;
1454 : TimestampTz latest_end_time;
1455 : char sender_host[NI_MAXHOST];
1456 22 : int sender_port = 0;
1457 : char slotname[NAMEDATALEN];
1458 : char conninfo[MAXCONNINFO];
1459 :
1460 : /* Take a lock to ensure value consistency */
1461 22 : SpinLockAcquire(&WalRcv->mutex);
1462 22 : pid = (int) WalRcv->pid;
1463 22 : ready_to_display = WalRcv->ready_to_display;
1464 22 : state = WalRcv->walRcvState;
1465 22 : receive_start_lsn = WalRcv->receiveStart;
1466 22 : receive_start_tli = WalRcv->receiveStartTLI;
1467 22 : flushed_lsn = WalRcv->flushedUpto;
1468 22 : received_tli = WalRcv->receivedTLI;
1469 22 : last_send_time = WalRcv->lastMsgSendTime;
1470 22 : last_receipt_time = WalRcv->lastMsgReceiptTime;
1471 22 : latest_end_lsn = WalRcv->latestWalEnd;
1472 22 : latest_end_time = WalRcv->latestWalEndTime;
1473 22 : strlcpy(slotname, WalRcv->slotname, sizeof(slotname));
1474 22 : strlcpy(sender_host, WalRcv->sender_host, sizeof(sender_host));
1475 22 : sender_port = WalRcv->sender_port;
1476 22 : strlcpy(conninfo, WalRcv->conninfo, sizeof(conninfo));
1477 22 : SpinLockRelease(&WalRcv->mutex);
1478 :
1479 : /*
1480 : * No WAL receiver (or not ready yet), just return a tuple with NULL
1481 : * values
1482 : */
1483 22 : if (pid == 0 || !ready_to_display)
1484 9 : PG_RETURN_NULL();
1485 :
1486 : /*
1487 : * Read "writtenUpto" without holding a spinlock. Note that it may not be
1488 : * consistent with the other shared variables of the WAL receiver
1489 : * protected by a spinlock, but this should not be used for data integrity
1490 : * checks.
1491 : */
1492 13 : written_lsn = pg_atomic_read_u64(&WalRcv->writtenUpto);
1493 :
1494 : /* determine result type */
1495 13 : if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1496 0 : elog(ERROR, "return type must be a row type");
1497 :
1498 13 : values = palloc0_array(Datum, tupdesc->natts);
1499 13 : nulls = palloc0_array(bool, tupdesc->natts);
1500 :
1501 : /* Fetch values */
1502 13 : values[0] = Int32GetDatum(pid);
1503 :
1504 13 : if (!has_privs_of_role(GetUserId(), ROLE_PG_READ_ALL_STATS))
1505 : {
1506 : /*
1507 : * Only superusers and roles with privileges of pg_read_all_stats can
1508 : * see details. Other users only get the pid value to know whether it
1509 : * is a WAL receiver, but no details.
1510 : */
1511 0 : memset(&nulls[1], true, sizeof(bool) * (tupdesc->natts - 1));
1512 : }
1513 : else
1514 : {
1515 13 : values[1] = CStringGetTextDatum(WalRcvGetStateString(state));
1516 :
1517 13 : if (!XLogRecPtrIsValid(receive_start_lsn))
1518 0 : nulls[2] = true;
1519 : else
1520 13 : values[2] = LSNGetDatum(receive_start_lsn);
1521 13 : values[3] = Int32GetDatum(receive_start_tli);
1522 13 : if (!XLogRecPtrIsValid(written_lsn))
1523 0 : nulls[4] = true;
1524 : else
1525 13 : values[4] = LSNGetDatum(written_lsn);
1526 13 : if (!XLogRecPtrIsValid(flushed_lsn))
1527 0 : nulls[5] = true;
1528 : else
1529 13 : values[5] = LSNGetDatum(flushed_lsn);
1530 13 : values[6] = Int32GetDatum(received_tli);
1531 13 : if (last_send_time == 0)
1532 0 : nulls[7] = true;
1533 : else
1534 13 : values[7] = TimestampTzGetDatum(last_send_time);
1535 13 : if (last_receipt_time == 0)
1536 0 : nulls[8] = true;
1537 : else
1538 13 : values[8] = TimestampTzGetDatum(last_receipt_time);
1539 13 : if (!XLogRecPtrIsValid(latest_end_lsn))
1540 0 : nulls[9] = true;
1541 : else
1542 13 : values[9] = LSNGetDatum(latest_end_lsn);
1543 13 : if (latest_end_time == 0)
1544 0 : nulls[10] = true;
1545 : else
1546 13 : values[10] = TimestampTzGetDatum(latest_end_time);
1547 13 : if (*slotname == '\0')
1548 11 : nulls[11] = true;
1549 : else
1550 2 : values[11] = CStringGetTextDatum(slotname);
1551 13 : if (*sender_host == '\0')
1552 0 : nulls[12] = true;
1553 : else
1554 13 : values[12] = CStringGetTextDatum(sender_host);
1555 13 : if (sender_port == 0)
1556 0 : nulls[13] = true;
1557 : else
1558 13 : values[13] = Int32GetDatum(sender_port);
1559 13 : if (*conninfo == '\0')
1560 0 : nulls[14] = true;
1561 : else
1562 13 : values[14] = CStringGetTextDatum(conninfo);
1563 : }
1564 :
1565 : /* Returns the record as Datum */
1566 13 : PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
1567 : }
|