Line data Source code
1 : /*-------------------------------------------------------------------------
2 : * slotsync.c
3 : * Functionality for synchronizing slots to a standby server from the
4 : * primary server.
5 : *
6 : * Copyright (c) 2024-2025, PostgreSQL Global Development Group
7 : *
8 : * IDENTIFICATION
9 : * src/backend/replication/logical/slotsync.c
10 : *
11 : * This file contains the code for slot synchronization on a physical standby
12 : * to fetch logical failover slots information from the primary server, create
13 : * the slots on the standby and synchronize them periodically.
14 : *
15 : * Slot synchronization can be performed either automatically by enabling slot
16 : * sync worker or manually by calling SQL function pg_sync_replication_slots().
17 : *
18 : * If the WAL corresponding to the remote's restart_lsn is not available on the
19 : * physical standby, or the remote's catalog_xmin precedes the oldest xid for
20 : * which it is guaranteed that rows wouldn't have been removed, then we cannot
21 : * create the local standby slot because that would mean moving the local slot
22 : * backward and decoding won't be possible via such a slot. In this case, the
23 : * slot will be marked as RS_TEMPORARY. Once the primary server catches up,
24 : * the slot will be marked as RS_PERSISTENT (which means sync-ready), after
25 : * which the slot sync worker can perform the sync periodically, or the user
26 : * can call pg_sync_replication_slots() periodically to perform the syncs.
27 : *
28 : * If synchronized slots fail to build a consistent snapshot from the
29 : * restart_lsn before reaching confirmed_flush_lsn, they would become
30 : * unreliable after promotion due to potential data loss from changes made
31 : * before reaching a consistent point. This can happen because the slots can
32 : * be synced at an arbitrary time and we may not reach the consistent point
33 : * at the same WAL location as the primary. So, we mark such slots as
34 : * RS_TEMPORARY. Once decoding from the corresponding LSNs can reach a
35 : * consistent point, they will be marked as RS_PERSISTENT.
36 : *
37 : * The slot sync worker waits for some time before the next synchronization,
38 : * with the duration varying based on whether any slots were updated during
39 : * the last cycle. Refer to the comments above wait_for_slot_activity() for
40 : * more details.
41 : *
42 : * Any standby synchronized slots will be dropped if they no longer need
43 : * to be synchronized. See comment atop drop_local_obsolete_slots() for more
44 : * details.
45 : *---------------------------------------------------------------------------
46 : */
47 :
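 : /*
 : * Illustrative sketch (an editorial addition, not from the upstream sources):
 : * a minimal setup under which the slot synchronization described above can
 : * run.  The slot names, host name, and database below are assumptions used
 : * only for this example.
 : *
 : * On the primary, create a failover-enabled logical slot:
 : *   SELECT pg_create_logical_replication_slot('myslot', 'pgoutput',
 : *                                             false, false, true);
 : *
 : * On the standby (postgresql.conf), to run the slot sync worker:
 : *   sync_replication_slots = on
 : *   hot_standby_feedback = on
 : *   primary_slot_name = 'sb1_slot'
 : *   primary_conninfo = 'host=primary port=5432 dbname=postgres'
 : *
 : * Alternatively, sync once manually on the standby:
 : *   SELECT pg_sync_replication_slots();
 : */
 :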
48 : #include "postgres.h"
49 :
50 : #include <time.h>
51 :
52 : #include "access/xlog_internal.h"
53 : #include "access/xlogrecovery.h"
54 : #include "catalog/pg_database.h"
55 : #include "commands/dbcommands.h"
56 : #include "libpq/pqsignal.h"
57 : #include "pgstat.h"
58 : #include "postmaster/interrupt.h"
59 : #include "replication/logical.h"
60 : #include "replication/slotsync.h"
61 : #include "replication/snapbuild.h"
62 : #include "storage/ipc.h"
63 : #include "storage/lmgr.h"
64 : #include "storage/proc.h"
65 : #include "storage/procarray.h"
66 : #include "tcop/tcopprot.h"
67 : #include "utils/builtins.h"
68 : #include "utils/pg_lsn.h"
69 : #include "utils/ps_status.h"
70 : #include "utils/timeout.h"
71 :
72 : /*
73 : * Struct for sharing information to control slot synchronization.
74 : *
75 : * The slot sync worker's pid is needed by the startup process to shut it
76 : * down during promotion. The startup process shuts down the slot sync worker
77 : * and also sets stopSignaled=true to handle the race condition when the
78 : * postmaster has not noticed the promotion yet and thus may end up restarting
79 : * the slot sync worker. If stopSignaled is set, the worker will exit in such a
80 : * case. The SQL function pg_sync_replication_slots() will also error out if
81 : * this flag is set. Note that we don't need to reset this variable as after
82 : * promotion the slot sync worker won't be restarted because the pmState
83 : * changes to PM_RUN from PM_HOT_STANDBY and we don't support demoting the
84 : * primary without restarting the server. See LaunchMissingBackgroundProcesses.
85 : *
86 : * The 'syncing' flag is needed to prevent concurrent slot syncs to avoid slot
87 : * overwrites.
88 : *
89 : * The 'last_start_time' is needed by postmaster to start the slot sync worker
90 : * once per SLOTSYNC_RESTART_INTERVAL_SEC. In cases where an immediate restart
91 : * is expected (e.g., slot sync GUCs change), slot sync worker will reset
92 : * last_start_time before exiting, so that postmaster can start the worker
93 : * without waiting for SLOTSYNC_RESTART_INTERVAL_SEC.
94 : */
95 : typedef struct SlotSyncCtxStruct
96 : {
97 : pid_t pid;
98 : bool stopSignaled;
99 : bool syncing;
100 : time_t last_start_time;
101 : slock_t mutex;
102 : } SlotSyncCtxStruct;
103 :
104 : static SlotSyncCtxStruct *SlotSyncCtx = NULL;
105 :
106 : /* GUC variable */
107 : bool sync_replication_slots = false;
108 :
109 : /*
110 : * The sleep time (ms) between slot-sync cycles varies dynamically
111 : * (within a MIN/MAX range) according to slot activity. See
112 : * wait_for_slot_activity() for details.
113 : */
114 : #define MIN_SLOTSYNC_WORKER_NAPTIME_MS 200
115 : #define MAX_SLOTSYNC_WORKER_NAPTIME_MS 30000 /* 30s */
116 :
117 : static long sleep_ms = MIN_SLOTSYNC_WORKER_NAPTIME_MS;
118 :
119 : /* The restart interval for slot sync work used by postmaster */
120 : #define SLOTSYNC_RESTART_INTERVAL_SEC 10
121 :
122 : /*
123 : * Flag to tell if we are syncing replication slots. Unlike the 'syncing' flag
124 : * in SlotSyncCtxStruct, this flag is true only if the current process is
125 : * performing slot synchronization.
126 : */
127 : static bool syncing_slots = false;
128 :
129 : /*
130 : * Structure to hold information fetched from the primary server about a logical
131 : * replication slot.
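 : *
 : * The fields mirror the columns that synchronize_slots() fetches from
 : * pg_catalog.pg_replication_slots on the primary.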
132 : */
133 : typedef struct RemoteSlot
134 : {
135 : char *name;
136 : char *plugin;
137 : char *database;
138 : bool two_phase;
139 : bool failover;
140 : XLogRecPtr restart_lsn;
141 : XLogRecPtr confirmed_lsn;
142 : XLogRecPtr two_phase_at;
143 : TransactionId catalog_xmin;
144 :
145 : /* RS_INVAL_NONE if valid, or the reason of invalidation */
146 : ReplicationSlotInvalidationCause invalidated;
147 : } RemoteSlot;
148 :
149 : static void slotsync_failure_callback(int code, Datum arg);
150 : static void update_synced_slots_inactive_since(void);
151 :
152 : /*
153 : * If necessary, update the local synced slot's metadata based on the data
154 : * from the remote slot.
155 : *
156 : * If no update was needed (the data of the remote slot is the same as the
157 : * local slot) return false, otherwise true.
158 : *
159 : * *found_consistent_snapshot will be true iff the remote slot's LSN or xmin is
160 : * modified, and decoding from the corresponding LSN can reach a
161 : * consistent snapshot.
162 : *
163 : * *remote_slot_precedes will be true if the remote slot's LSN or xmin
164 : * precedes the locally reserved position.
165 : */
166 : static bool
167 64 : update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid,
168 : bool *found_consistent_snapshot,
169 : bool *remote_slot_precedes)
170 : {
171 64 : ReplicationSlot *slot = MyReplicationSlot;
172 64 : bool updated_xmin_or_lsn = false;
173 64 : bool updated_config = false;
174 :
175 : Assert(slot->data.invalidated == RS_INVAL_NONE);
176 :
177 64 : if (found_consistent_snapshot)
178 10 : *found_consistent_snapshot = false;
179 :
180 64 : if (remote_slot_precedes)
181 10 : *remote_slot_precedes = false;
182 :
183 : /*
184 : * Don't overwrite if we already have a newer catalog_xmin and
185 : * restart_lsn.
186 : */
187 128 : if (remote_slot->restart_lsn < slot->data.restart_lsn ||
188 64 : TransactionIdPrecedes(remote_slot->catalog_xmin,
189 : slot->data.catalog_xmin))
190 : {
191 : /*
192 : * This can happen in the following situations:
193 : *
194 : * If the slot is temporary, it means either the initial WAL location
195 : * reserved for the local slot is ahead of the remote slot's
196 : * restart_lsn or the initial xmin_horizon computed for the local slot
197 : * is ahead of the remote slot's catalog_xmin.
198 : *
199 : * If the slot is persistent, the restart_lsn of the synced slot could
200 : * still be ahead of the remote slot. Since we use the slot advance
201 : * functionality to keep the snapbuild/slot updated, it is possible
202 : * that the restart_lsn is advanced to a later position than the one
203 : * on the primary. This can happen when the slot advancing machinery
204 : * finds a running xacts record after reaching the consistent state at
205 : * a later point than the primary, where it serializes the snapshot
206 : * and updates the restart_lsn.
207 : *
208 : * We LOG the message if the slot is temporary as it can help the user
209 : * understand why the slot is not sync-ready. In the case of a
210 : * persistent slot, this is a more common situation that won't directly
211 : * impact the users, so we use DEBUG1 level to log the message.
212 : */
213 0 : ereport(slot->data.persistency == RS_TEMPORARY ? LOG : DEBUG1,
214 : errmsg("could not synchronize replication slot \"%s\" because remote slot precedes local slot",
215 : remote_slot->name),
216 : errdetail("The remote slot has LSN %X/%X and catalog xmin %u, but the local slot has LSN %X/%X and catalog xmin %u.",
217 : LSN_FORMAT_ARGS(remote_slot->restart_lsn),
218 : remote_slot->catalog_xmin,
219 : LSN_FORMAT_ARGS(slot->data.restart_lsn),
220 : slot->data.catalog_xmin));
221 :
222 0 : if (remote_slot_precedes)
223 0 : *remote_slot_precedes = true;
224 : }
225 :
226 : /*
227 : * Attempt to sync LSNs and xmins only if remote slot is ahead of local
228 : * slot.
229 : */
230 64 : else if (remote_slot->confirmed_lsn > slot->data.confirmed_flush ||
231 86 : remote_slot->restart_lsn > slot->data.restart_lsn ||
232 42 : TransactionIdFollows(remote_slot->catalog_xmin,
233 : slot->data.catalog_xmin))
234 : {
235 : /*
236 : * We can't directly copy the remote slot's LSN or xmin unless there
237 : * exists a consistent snapshot at that point. Otherwise, after
238 : * promotion, the slots may not reach a consistent point before the
239 : * confirmed_flush_lsn which can lead to a data loss. To avoid data
240 : * loss, we let slot machinery advance the slot which ensures that
241 : * snapbuilder/slot statuses are updated properly.
242 : */
243 22 : if (SnapBuildSnapshotExists(remote_slot->restart_lsn))
244 : {
245 : /*
246 : * Update the slot info directly if there is a serialized snapshot
247 : * at the restart_lsn, as the slot can quickly reach consistency
248 : * at restart_lsn by restoring the snapshot.
249 : */
250 4 : SpinLockAcquire(&slot->mutex);
251 4 : slot->data.restart_lsn = remote_slot->restart_lsn;
252 4 : slot->data.confirmed_flush = remote_slot->confirmed_lsn;
253 4 : slot->data.catalog_xmin = remote_slot->catalog_xmin;
254 4 : SpinLockRelease(&slot->mutex);
255 :
256 4 : if (found_consistent_snapshot)
257 0 : *found_consistent_snapshot = true;
258 : }
259 : else
260 : {
261 18 : LogicalSlotAdvanceAndCheckSnapState(remote_slot->confirmed_lsn,
262 : found_consistent_snapshot);
263 :
264 : /* Sanity check */
265 18 : if (slot->data.confirmed_flush != remote_slot->confirmed_lsn)
266 0 : ereport(ERROR,
267 : errmsg_internal("synchronized confirmed_flush for slot \"%s\" differs from remote slot",
268 : remote_slot->name),
269 : errdetail_internal("Remote slot has LSN %X/%X but local slot has LSN %X/%X.",
270 : LSN_FORMAT_ARGS(remote_slot->confirmed_lsn),
271 : LSN_FORMAT_ARGS(slot->data.confirmed_flush)));
272 : }
273 :
274 22 : updated_xmin_or_lsn = true;
275 : }
276 :
277 64 : if (remote_dbid != slot->data.database ||
278 64 : remote_slot->two_phase != slot->data.two_phase ||
279 62 : remote_slot->failover != slot->data.failover ||
280 62 : strcmp(remote_slot->plugin, NameStr(slot->data.plugin)) != 0 ||
281 62 : remote_slot->two_phase_at != slot->data.two_phase_at)
282 : {
283 : NameData plugin_name;
284 :
285 : /* Avoid expensive operations while holding a spinlock. */
286 2 : namestrcpy(&plugin_name, remote_slot->plugin);
287 :
288 2 : SpinLockAcquire(&slot->mutex);
289 2 : slot->data.plugin = plugin_name;
290 2 : slot->data.database = remote_dbid;
291 2 : slot->data.two_phase = remote_slot->two_phase;
292 2 : slot->data.two_phase_at = remote_slot->two_phase_at;
293 2 : slot->data.failover = remote_slot->failover;
294 2 : SpinLockRelease(&slot->mutex);
295 :
296 2 : updated_config = true;
297 : }
298 :
299 : /*
300 : * We have to write the changed xmin to disk *before* we change the
301 : * in-memory value, otherwise after a crash we wouldn't know that some
302 : * catalog tuples might have been removed already.
303 : */
304 64 : if (updated_config || updated_xmin_or_lsn)
305 : {
306 22 : ReplicationSlotMarkDirty();
307 22 : ReplicationSlotSave();
308 : }
309 :
310 : /*
311 : * Now the new xmin is safely on disk, we can let the global value
312 : * advance. We do not take ProcArrayLock or similar since we only advance
313 : * xmin here and there's not much harm done by a concurrent computation
314 : * missing that.
315 : */
316 64 : if (updated_xmin_or_lsn)
317 : {
318 22 : SpinLockAcquire(&slot->mutex);
319 22 : slot->effective_catalog_xmin = remote_slot->catalog_xmin;
320 22 : SpinLockRelease(&slot->mutex);
321 :
322 22 : ReplicationSlotsComputeRequiredXmin(false);
323 22 : ReplicationSlotsComputeRequiredLSN();
324 : }
325 :
326 64 : return updated_config || updated_xmin_or_lsn;
327 : }
328 :
329 : /*
330 : * Get the list of local logical slots that are synchronized from the
331 : * primary server.
332 : */
333 : static List *
334 36 : get_local_synced_slots(void)
335 : {
336 36 : List *local_slots = NIL;
337 :
338 36 : LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
339 :
340 396 : for (int i = 0; i < max_replication_slots; i++)
341 : {
342 360 : ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
343 :
344 : /* Check if it is a synchronized slot */
345 360 : if (s->in_use && s->data.synced)
346 : {
347 : Assert(SlotIsLogical(s));
348 60 : local_slots = lappend(local_slots, s);
349 : }
350 : }
351 :
352 36 : LWLockRelease(ReplicationSlotControlLock);
353 :
354 36 : return local_slots;
355 : }
356 :
357 : /*
358 : * Helper function to check if local_slot is required to be retained.
359 : *
360 : * Return false if local_slot either does not exist in the remote_slots list
361 : * or is invalidated while the corresponding remote slot is still valid;
362 : * otherwise return true.
363 : */
364 : static bool
365 60 : local_sync_slot_required(ReplicationSlot *local_slot, List *remote_slots)
366 : {
367 60 : bool remote_exists = false;
368 60 : bool locally_invalidated = false;
369 :
370 148 : foreach_ptr(RemoteSlot, remote_slot, remote_slots)
371 : {
372 86 : if (strcmp(remote_slot->name, NameStr(local_slot->data.name)) == 0)
373 : {
374 58 : remote_exists = true;
375 :
376 : /*
377 : * If remote slot is not invalidated but local slot is marked as
378 : * invalidated, then set locally_invalidated flag.
379 : */
380 58 : SpinLockAcquire(&local_slot->mutex);
381 58 : locally_invalidated =
382 116 : (remote_slot->invalidated == RS_INVAL_NONE) &&
383 58 : (local_slot->data.invalidated != RS_INVAL_NONE);
384 58 : SpinLockRelease(&local_slot->mutex);
385 :
386 58 : break;
387 : }
388 : }
389 :
390 60 : return (remote_exists && !locally_invalidated);
391 : }
392 :
393 : /*
394 : * Drop local obsolete slots.
395 : *
396 : * Drop the local slots that no longer need to be synced, i.e., those that
397 : * either do not exist on the primary or are no longer enabled for failover.
398 : *
399 : * Additionally, drop any slots that are valid on the primary but got
400 : * invalidated on the standby. This situation may occur due to the following
401 : * reasons:
402 : * - The 'max_slot_wal_keep_size' on the standby is insufficient to retain WAL
403 : * records from the restart_lsn of the slot.
404 : * - 'primary_slot_name' is temporarily reset to null and the physical slot is
405 : * removed.
406 : * These dropped slots will get recreated in the next sync-cycle and it is
407 : * okay to drop and recreate such slots as long as they are not consumable on
408 : * the standby (which is the case currently).
409 : *
410 : * Note: Change of 'wal_level' on the primary server to a level lower than
411 : * logical may also result in slot invalidation and removal on the standby.
412 : * This is because such 'wal_level' change is only possible if the logical
413 : * slots are removed on the primary server, so it's expected to see the
414 : * slots being invalidated and removed on the standby too (and re-created
415 : * if they are re-created on the primary server).
416 : */
417 : static void
418 36 : drop_local_obsolete_slots(List *remote_slot_list)
419 : {
420 36 : List *local_slots = get_local_synced_slots();
421 :
422 132 : foreach_ptr(ReplicationSlot, local_slot, local_slots)
423 : {
424 : /* Drop the local slot if it is not required to be retained. */
425 60 : if (!local_sync_slot_required(local_slot, remote_slot_list))
426 : {
427 : bool synced_slot;
428 :
429 : /*
430 : * Use shared lock to prevent a conflict with
431 : * ReplicationSlotsDropDBSlots(), trying to drop the same slot
432 : * during a drop-database operation.
433 : */
434 4 : LockSharedObject(DatabaseRelationId, local_slot->data.database,
435 : 0, AccessShareLock);
436 :
437 : /*
438 : * In the small window between getting the slot to drop and
439 : * locking the database, there is a possibility of a parallel
440 : * database drop by the startup process and the creation of a new
441 : * slot by the user. This new user-created slot may end up using
442 : * the same shared memory as that of 'local_slot'. Thus check if
443 : * local_slot is still the synced one before performing actual
444 : * drop.
445 : */
446 4 : SpinLockAcquire(&local_slot->mutex);
447 4 : synced_slot = local_slot->in_use && local_slot->data.synced;
448 4 : SpinLockRelease(&local_slot->mutex);
449 :
450 4 : if (synced_slot)
451 : {
452 4 : ReplicationSlotAcquire(NameStr(local_slot->data.name), true, false);
453 4 : ReplicationSlotDropAcquired();
454 : }
455 :
456 4 : UnlockSharedObject(DatabaseRelationId, local_slot->data.database,
457 : 0, AccessShareLock);
458 :
459 4 : ereport(LOG,
460 : errmsg("dropped replication slot \"%s\" of database with OID %u",
461 : NameStr(local_slot->data.name),
462 : local_slot->data.database));
463 : }
464 : }
465 36 : }
466 :
467 : /*
468 : * Reserve WAL for the currently active local slot using the specified WAL
469 : * location (restart_lsn).
470 : *
471 : * If the given WAL location has been removed, reserve WAL using the oldest
472 : * existing WAL segment.
473 : */
474 : static void
475 10 : reserve_wal_for_local_slot(XLogRecPtr restart_lsn)
476 : {
477 : XLogSegNo oldest_segno;
478 : XLogSegNo segno;
479 10 : ReplicationSlot *slot = MyReplicationSlot;
480 :
481 : Assert(slot != NULL);
482 : Assert(XLogRecPtrIsInvalid(slot->data.restart_lsn));
483 :
484 : while (true)
485 : {
486 10 : SpinLockAcquire(&slot->mutex);
487 10 : slot->data.restart_lsn = restart_lsn;
488 10 : SpinLockRelease(&slot->mutex);
489 :
490 : /* Prevent WAL removal as fast as possible */
491 10 : ReplicationSlotsComputeRequiredLSN();
492 :
493 10 : XLByteToSeg(slot->data.restart_lsn, segno, wal_segment_size);
494 :
495 : /*
496 : * Find the oldest existing WAL segment file.
497 : *
498 : * Normally, we can determine it by using the last removed segment
499 : * number. However, if no WAL segment files have been removed by a
500 : * checkpoint since startup, we need to search for the oldest segment
501 : * file from the current timeline existing in XLOGDIR.
502 : *
503 : * XXX: Currently, we search for the oldest segment in the current
504 : * timeline, as there is little chance of the slot's restart_lsn being
505 : * from some prior timeline, and even if that happens, in the worst
506 : * case we will wait to sync until the slot's restart_lsn has moved to
507 : * the current timeline.
508 : */
509 10 : oldest_segno = XLogGetLastRemovedSegno() + 1;
510 :
511 10 : if (oldest_segno == 1)
512 : {
513 : TimeLineID cur_timeline;
514 :
515 6 : GetWalRcvFlushRecPtr(NULL, &cur_timeline);
516 6 : oldest_segno = XLogGetOldestSegno(cur_timeline);
517 : }
518 :
519 10 : elog(DEBUG1, "segno: " UINT64_FORMAT " of purposed restart_lsn for the synced slot, oldest_segno: " UINT64_FORMAT " available",
520 : segno, oldest_segno);
521 :
522 : /*
523 : * If all required WAL is still there, great, otherwise retry. The
524 : * slot should prevent further removal of WAL, unless there's a
525 : * concurrent ReplicationSlotsComputeRequiredLSN() after we've written
526 : * the new restart_lsn above, so normally we should never need to loop
527 : * more than twice.
528 : */
529 10 : if (segno >= oldest_segno)
530 10 : break;
531 :
532 : /* Retry using the location of the oldest wal segment */
533 0 : XLogSegNoOffsetToRecPtr(oldest_segno, 0, wal_segment_size, restart_lsn);
534 : }
535 10 : }
536 :
537 : /*
538 : * If the remote restart_lsn and catalog_xmin have caught up with the
539 : * local ones, then update the LSNs and persist the local synced slot for
540 : * future synchronization; otherwise, do nothing.
541 : *
542 : * Return true if the slot is marked as RS_PERSISTENT (sync-ready), otherwise
543 : * false.
544 : */
545 : static bool
546 10 : update_and_persist_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid)
547 : {
548 10 : ReplicationSlot *slot = MyReplicationSlot;
549 10 : bool found_consistent_snapshot = false;
550 10 : bool remote_slot_precedes = false;
551 :
552 10 : (void) update_local_synced_slot(remote_slot, remote_dbid,
553 : &found_consistent_snapshot,
554 : &remote_slot_precedes);
555 :
556 : /*
557 : * Check if the primary server has caught up. Refer to the comment atop
558 : * the file for details on this check.
559 : */
560 10 : if (remote_slot_precedes)
561 : {
562 : /*
563 : * The remote slot didn't catch up to the locally reserved position.
564 : *
565 : * We do not drop the slot because the restart_lsn can be ahead of the
566 : * current location when recreating the slot in the next cycle. It may
567 : * take more time to create such a slot. Therefore, we keep this slot
568 : * and attempt the synchronization in the next cycle.
569 : */
570 0 : return false;
571 : }
572 :
573 : /*
574 : * Don't persist the slot if it cannot reach the consistent point from the
575 : * restart_lsn. See comments atop this file.
576 : */
577 10 : if (!found_consistent_snapshot)
578 : {
579 0 : ereport(LOG,
580 : errmsg("could not synchronize replication slot \"%s\"", remote_slot->name),
581 : errdetail("Logical decoding could not find consistent point from local slot's LSN %X/%X.",
582 : LSN_FORMAT_ARGS(slot->data.restart_lsn)));
583 :
584 0 : return false;
585 : }
586 :
587 10 : ReplicationSlotPersist();
588 :
589 10 : ereport(LOG,
590 : errmsg("newly created replication slot \"%s\" is sync-ready now",
591 : remote_slot->name));
592 :
593 10 : return true;
594 : }
595 :
596 : /*
597 : * Synchronize a single slot to the given position.
598 : *
599 : * This creates a new slot if there is no existing one and updates the
600 : * metadata of the slot as per the data received from the primary server.
601 : *
602 : * The slot is created as a temporary slot and stays in that state until the
603 : * remote_slot catches up with the locally reserved position and the local
604 : * slot is updated. The slot is then persisted and is considered sync-ready for
605 : * periodic syncs.
606 : *
607 : * Returns TRUE if the local slot is updated.
608 : */
609 : static bool
610 66 : synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid)
611 : {
612 : ReplicationSlot *slot;
613 : XLogRecPtr latestFlushPtr;
614 66 : bool slot_updated = false;
615 :
616 : /*
617 : * Make sure that concerned WAL is received and flushed before syncing
618 : * slot to target lsn received from the primary server.
619 : */
620 66 : latestFlushPtr = GetStandbyFlushRecPtr(NULL);
621 66 : if (remote_slot->confirmed_lsn > latestFlushPtr)
622 : {
623 : /*
624 : * Can get here only if GUC 'synchronized_standby_slots' on the
625 : * primary server was not configured correctly.
626 : */
627 2 : ereport(AmLogicalSlotSyncWorkerProcess() ? LOG : ERROR,
628 : errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
629 : errmsg("skipping slot synchronization because the received slot sync"
630 : " LSN %X/%X for slot \"%s\" is ahead of the standby position %X/%X",
631 : LSN_FORMAT_ARGS(remote_slot->confirmed_lsn),
632 : remote_slot->name,
633 : LSN_FORMAT_ARGS(latestFlushPtr)));
634 :
635 2 : return false;
636 : }
637 :
638 : /* Search for the named slot */
639 64 : if ((slot = SearchNamedReplicationSlot(remote_slot->name, true)))
640 : {
641 : bool synced;
642 :
643 54 : SpinLockAcquire(&slot->mutex);
644 54 : synced = slot->data.synced;
645 54 : SpinLockRelease(&slot->mutex);
646 :
647 : /* User-created slot with the same name exists, raise ERROR. */
648 54 : if (!synced)
649 0 : ereport(ERROR,
650 : errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
651 : errmsg("exiting from slot synchronization because same"
652 : " name slot \"%s\" already exists on the standby",
653 : remote_slot->name));
654 :
655 : /*
656 : * The slot has been synchronized before.
657 : *
658 : * It is important to acquire the slot here before checking
659 : * invalidation. If we don't acquire the slot first, there could be a
660 : * race condition that the local slot could be invalidated just after
661 : * checking the 'invalidated' flag here and we could end up
662 : * overwriting 'invalidated' flag to remote_slot's value. See
663 : * InvalidatePossiblyObsoleteSlot() where it invalidates slot directly
664 : * if the slot is not acquired by other processes.
665 : *
666 : * XXX: If it ever turns out that slot acquire/release is costly for
667 : * cases when none of the slot properties is changed then we can do a
668 : * pre-check to ensure that at least one of the slot properties is
669 : * changed before acquiring the slot.
670 : */
671 54 : ReplicationSlotAcquire(remote_slot->name, true, false);
672 :
673 : Assert(slot == MyReplicationSlot);
674 :
675 : /*
676 : * Copy the invalidation cause from the remote slot only if the local slot
677 : * is not invalidated locally; we don't want to overwrite the existing one.
678 : */
679 54 : if (slot->data.invalidated == RS_INVAL_NONE &&
680 54 : remote_slot->invalidated != RS_INVAL_NONE)
681 : {
682 0 : SpinLockAcquire(&slot->mutex);
683 0 : slot->data.invalidated = remote_slot->invalidated;
684 0 : SpinLockRelease(&slot->mutex);
685 :
686 : /* Make sure the invalidated state persists across server restart */
687 0 : ReplicationSlotMarkDirty();
688 0 : ReplicationSlotSave();
689 :
690 0 : slot_updated = true;
691 : }
692 :
693 : /* Skip the sync of an invalidated slot */
694 54 : if (slot->data.invalidated != RS_INVAL_NONE)
695 : {
696 0 : ReplicationSlotRelease();
697 0 : return slot_updated;
698 : }
699 :
700 : /* Slot not ready yet, let's attempt to make it sync-ready now. */
701 54 : if (slot->data.persistency == RS_TEMPORARY)
702 : {
703 0 : slot_updated = update_and_persist_local_synced_slot(remote_slot,
704 : remote_dbid);
705 : }
706 :
707 : /* Slot ready for sync, so sync it. */
708 : else
709 : {
710 : /*
711 : * Sanity check: As long as the invalidations are handled
712 : * appropriately as above, this should never happen.
713 : *
714 : * We don't need to check restart_lsn here. See the comments in
715 : * update_local_synced_slot() for details.
716 : */
717 54 : if (remote_slot->confirmed_lsn < slot->data.confirmed_flush)
718 0 : ereport(ERROR,
719 : errmsg_internal("cannot synchronize local slot \"%s\"",
720 : remote_slot->name),
721 : errdetail_internal("Local slot's start streaming location LSN(%X/%X) is ahead of remote slot's LSN(%X/%X).",
722 : LSN_FORMAT_ARGS(slot->data.confirmed_flush),
723 : LSN_FORMAT_ARGS(remote_slot->confirmed_lsn)));
724 :
725 54 : slot_updated = update_local_synced_slot(remote_slot, remote_dbid,
726 : NULL, NULL);
727 : }
728 : }
729 : /* Otherwise create the slot first. */
730 : else
731 : {
732 : NameData plugin_name;
733 10 : TransactionId xmin_horizon = InvalidTransactionId;
734 :
735 : /* Skip creating the local slot if remote_slot is invalidated already */
736 10 : if (remote_slot->invalidated != RS_INVAL_NONE)
737 0 : return false;
738 :
739 : /*
740 : * We create temporary slots instead of ephemeral slots here because
741 : * we want the slots to survive after releasing them. This is done to
742 : * avoid dropping and re-creating the slots in each synchronization
743 : * cycle if the restart_lsn or catalog_xmin of the remote slot has not
744 : * caught up.
745 : */
746 10 : ReplicationSlotCreate(remote_slot->name, true, RS_TEMPORARY,
747 10 : remote_slot->two_phase,
748 10 : remote_slot->failover,
749 : true);
750 :
751 : /* For shorter lines. */
752 10 : slot = MyReplicationSlot;
753 :
754 : /* Avoid expensive operations while holding a spinlock. */
755 10 : namestrcpy(&plugin_name, remote_slot->plugin);
756 :
757 10 : SpinLockAcquire(&slot->mutex);
758 10 : slot->data.database = remote_dbid;
759 10 : slot->data.plugin = plugin_name;
760 10 : SpinLockRelease(&slot->mutex);
761 :
762 10 : reserve_wal_for_local_slot(remote_slot->restart_lsn);
763 :
764 10 : LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
765 10 : xmin_horizon = GetOldestSafeDecodingTransactionId(true);
766 10 : SpinLockAcquire(&slot->mutex);
767 10 : slot->effective_catalog_xmin = xmin_horizon;
768 10 : slot->data.catalog_xmin = xmin_horizon;
769 10 : SpinLockRelease(&slot->mutex);
770 10 : ReplicationSlotsComputeRequiredXmin(true);
771 10 : LWLockRelease(ProcArrayLock);
772 :
773 10 : update_and_persist_local_synced_slot(remote_slot, remote_dbid);
774 :
775 10 : slot_updated = true;
776 : }
777 :
778 64 : ReplicationSlotRelease();
779 :
780 64 : return slot_updated;
781 : }
782 :
783 : /*
784 : * Synchronize slots.
785 : *
786 : * Gets the failover logical slots info from the primary server and updates
787 : * the slots locally. Creates the slots if not present on the standby.
788 : *
789 : * Returns TRUE if any of the slots gets updated in this sync-cycle.
790 : */
791 : static bool
792 36 : synchronize_slots(WalReceiverConn *wrconn)
793 : {
794 : #define SLOTSYNC_COLUMN_COUNT 10
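 : /* Type OIDs listed in the same order as the columns of the query below */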
795 36 : Oid slotRow[SLOTSYNC_COLUMN_COUNT] = {TEXTOID, TEXTOID, LSNOID,
796 : LSNOID, XIDOID, BOOLOID, LSNOID, BOOLOID, TEXTOID, TEXTOID};
797 :
798 : WalRcvExecResult *res;
799 : TupleTableSlot *tupslot;
800 36 : List *remote_slot_list = NIL;
801 36 : bool some_slot_updated = false;
802 36 : bool started_tx = false;
803 36 : const char *query = "SELECT slot_name, plugin, confirmed_flush_lsn,"
804 : " restart_lsn, catalog_xmin, two_phase, two_phase_at, failover,"
805 : " database, invalidation_reason"
806 : " FROM pg_catalog.pg_replication_slots"
807 : " WHERE failover and NOT temporary";
808 :
809 : /* The syscache access in walrcv_exec() needs a transaction env. */
810 36 : if (!IsTransactionState())
811 : {
812 22 : StartTransactionCommand();
813 22 : started_tx = true;
814 : }
815 :
816 : /* Execute the query */
817 36 : res = walrcv_exec(wrconn, query, SLOTSYNC_COLUMN_COUNT, slotRow);
818 36 : if (res->status != WALRCV_OK_TUPLES)
819 0 : ereport(ERROR,
820 : errmsg("could not fetch failover logical slots info from the primary server: %s",
821 : res->err));
822 :
823 : /* Construct the remote_slot tuple and synchronize each slot locally */
824 36 : tupslot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple);
825 102 : while (tuplestore_gettupleslot(res->tuplestore, true, false, tupslot))
826 : {
827 : bool isnull;
828 66 : RemoteSlot *remote_slot = palloc0(sizeof(RemoteSlot));
829 : Datum d;
830 66 : int col = 0;
831 :
832 66 : remote_slot->name = TextDatumGetCString(slot_getattr(tupslot, ++col,
833 : &isnull));
834 : Assert(!isnull);
835 :
836 66 : remote_slot->plugin = TextDatumGetCString(slot_getattr(tupslot, ++col,
837 : &isnull));
838 : Assert(!isnull);
839 :
840 : /*
841 : * It is possible to get null values for LSN and Xmin if slot is
842 : * invalidated on the primary server, so handle accordingly.
843 : */
844 66 : d = slot_getattr(tupslot, ++col, &isnull);
845 66 : remote_slot->confirmed_lsn = isnull ? InvalidXLogRecPtr :
846 66 : DatumGetLSN(d);
847 :
848 66 : d = slot_getattr(tupslot, ++col, &isnull);
849 66 : remote_slot->restart_lsn = isnull ? InvalidXLogRecPtr : DatumGetLSN(d);
850 :
851 66 : d = slot_getattr(tupslot, ++col, &isnull);
852 66 : remote_slot->catalog_xmin = isnull ? InvalidTransactionId :
853 66 : DatumGetTransactionId(d);
854 :
855 66 : remote_slot->two_phase = DatumGetBool(slot_getattr(tupslot, ++col,
856 : &isnull));
857 : Assert(!isnull);
858 :
859 66 : d = slot_getattr(tupslot, ++col, &isnull);
860 66 : remote_slot->two_phase_at = isnull ? InvalidXLogRecPtr : DatumGetLSN(d);
861 :
862 66 : remote_slot->failover = DatumGetBool(slot_getattr(tupslot, ++col,
863 : &isnull));
864 : Assert(!isnull);
865 :
866 66 : remote_slot->database = TextDatumGetCString(slot_getattr(tupslot,
867 : ++col, &isnull));
868 : Assert(!isnull);
869 :
870 66 : d = slot_getattr(tupslot, ++col, &isnull);
871 66 : remote_slot->invalidated = isnull ? RS_INVAL_NONE :
872 0 : GetSlotInvalidationCause(TextDatumGetCString(d));
873 :
874 : /* Sanity check */
875 : Assert(col == SLOTSYNC_COLUMN_COUNT);
876 :
877 : /*
878 : * If restart_lsn, confirmed_lsn or catalog_xmin is invalid but the
879 : * slot is valid, that means we have fetched the remote_slot in its
880 : * RS_EPHEMERAL state. In such a case, don't sync it; we can always
881 : * sync it in the next sync cycle when the remote_slot is persisted
882 : * and has valid lsn(s) and xmin values.
883 : *
884 : * XXX: In future, if we plan to expose 'slot->data.persistency' in
885 : * pg_replication_slots view, then we can avoid fetching RS_EPHEMERAL
886 : * slots in the first place.
887 : */
888 66 : if ((XLogRecPtrIsInvalid(remote_slot->restart_lsn) ||
889 66 : XLogRecPtrIsInvalid(remote_slot->confirmed_lsn) ||
890 66 : !TransactionIdIsValid(remote_slot->catalog_xmin)) &&
891 0 : remote_slot->invalidated == RS_INVAL_NONE)
892 0 : pfree(remote_slot);
893 : else
894 : /* Create list of remote slots */
895 66 : remote_slot_list = lappend(remote_slot_list, remote_slot);
896 :
897 66 : ExecClearTuple(tupslot);
898 : }
899 :
900 : /* Drop local slots that no longer need to be synced. */
901 36 : drop_local_obsolete_slots(remote_slot_list);
902 :
903 : /* Now sync the slots locally */
904 138 : foreach_ptr(RemoteSlot, remote_slot, remote_slot_list)
905 : {
906 66 : Oid remote_dbid = get_database_oid(remote_slot->database, false);
907 :
908 : /*
909 : * Use shared lock to prevent a conflict with
910 : * ReplicationSlotsDropDBSlots(), trying to drop the same slot during
911 : * a drop-database operation.
912 : */
913 66 : LockSharedObject(DatabaseRelationId, remote_dbid, 0, AccessShareLock);
914 :
915 66 : some_slot_updated |= synchronize_one_slot(remote_slot, remote_dbid);
916 :
917 66 : UnlockSharedObject(DatabaseRelationId, remote_dbid, 0, AccessShareLock);
918 : }
919 :
920 : /* We are done, free remote_slot_list elements */
921 36 : list_free_deep(remote_slot_list);
922 :
923 36 : walrcv_clear_result(res);
924 :
925 36 : if (started_tx)
926 22 : CommitTransactionCommand();
927 :
928 36 : return some_slot_updated;
929 : }
930 :
931 : /*
932 : * Checks the remote server info.
933 : *
934 : * We ensure that the slot specified by 'primary_slot_name' exists on the
935 : * remote server and that the remote server is not a standby node.
936 : */
937 : static void
938 24 : validate_remote_info(WalReceiverConn *wrconn)
939 : {
940 : #define PRIMARY_INFO_OUTPUT_COL_COUNT 2
941 : WalRcvExecResult *res;
942 24 : Oid slotRow[PRIMARY_INFO_OUTPUT_COL_COUNT] = {BOOLOID, BOOLOID};
943 : StringInfoData cmd;
944 : bool isnull;
945 : TupleTableSlot *tupslot;
946 : bool remote_in_recovery;
947 : bool primary_slot_valid;
948 24 : bool started_tx = false;
949 :
950 24 : initStringInfo(&cmd);
951 24 : appendStringInfo(&cmd,
952 : "SELECT pg_is_in_recovery(), count(*) = 1"
953 : " FROM pg_catalog.pg_replication_slots"
954 : " WHERE slot_type='physical' AND slot_name=%s",
955 : quote_literal_cstr(PrimarySlotName));
956 :
957 : /* The syscache access in walrcv_exec() needs a transaction env. */
958 24 : if (!IsTransactionState())
959 : {
960 8 : StartTransactionCommand();
961 8 : started_tx = true;
962 : }
963 :
964 24 : res = walrcv_exec(wrconn, cmd.data, PRIMARY_INFO_OUTPUT_COL_COUNT, slotRow);
965 24 : pfree(cmd.data);
966 :
967 24 : if (res->status != WALRCV_OK_TUPLES)
968 0 : ereport(ERROR,
969 : errmsg("could not fetch primary slot name \"%s\" info from the primary server: %s",
970 : PrimarySlotName, res->err),
971 : errhint("Check if \"primary_slot_name\" is configured correctly."));
972 :
973 24 : tupslot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple);
974 24 : if (!tuplestore_gettupleslot(res->tuplestore, true, false, tupslot))
975 0 : elog(ERROR,
976 : "failed to fetch tuple for the primary server slot specified by \"primary_slot_name\"");
977 :
978 24 : remote_in_recovery = DatumGetBool(slot_getattr(tupslot, 1, &isnull));
979 : Assert(!isnull);
980 :
981 : /*
982 : * Slot sync is currently not supported on a cascading standby. This is
983 : * because if we allow it, the primary server needs to wait for all the
984 : * cascading standbys, otherwise, logical subscribers can still be ahead
985 : * of one of the cascading standbys which we plan to promote. Thus, to
986 : * avoid this additional complexity, we restrict it for the time being.
987 : */
988 24 : if (remote_in_recovery)
989 2 : ereport(ERROR,
990 : errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
991 : errmsg("cannot synchronize replication slots from a standby server"));
992 :
993 22 : primary_slot_valid = DatumGetBool(slot_getattr(tupslot, 2, &isnull));
994 : Assert(!isnull);
995 :
996 22 : if (!primary_slot_valid)
997 0 : ereport(ERROR,
998 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
999 : /* translator: second %s is a GUC variable name */
1000 : errmsg("replication slot \"%s\" specified by \"%s\" does not exist on primary server",
1001 : PrimarySlotName, "primary_slot_name"));
1002 :
1003 22 : ExecClearTuple(tupslot);
1004 22 : walrcv_clear_result(res);
1005 :
1006 22 : if (started_tx)
1007 8 : CommitTransactionCommand();
1008 22 : }
1009 :
1010 : /*
1011 : * Checks if dbname is specified in 'primary_conninfo'.
1012 : *
1013 : * Error out if it is not specified; otherwise return it.
1014 : */
1015 : char *
1016 26 : CheckAndGetDbnameFromConninfo(void)
1017 : {
1018 : char *dbname;
1019 :
1020 : /*
1021 : * The slot synchronization needs a database connection for walrcv_exec to
1022 : * work.
1023 : */
1024 26 : dbname = walrcv_get_dbname_from_conninfo(PrimaryConnInfo);
1025 26 : if (dbname == NULL)
1026 2 : ereport(ERROR,
1027 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1028 :
1029 : /*
1030 : * translator: first %s is a connection option; second %s is a GUC
1031 : * variable name
1032 : */
1033 : errmsg("replication slot synchronization requires \"%s\" to be specified in \"%s\"",
1034 : "dbname", "primary_conninfo"));
1035 24 : return dbname;
1036 : }
1037 :
1038 : /*
1039 : * Return true if all necessary GUCs for slot synchronization are set
1040 : * appropriately, otherwise, return false.
1041 : */
1042 : bool
1043 30 : ValidateSlotSyncParams(int elevel)
1044 : {
1045 : /*
1046 : * Logical slot sync/creation requires wal_level >= logical.
1047 : *
1048 : * Since altering the wal_level requires a server restart, error out in
1049 : * this case regardless of the elevel provided by the caller.
1050 : */
1051 30 : if (wal_level < WAL_LEVEL_LOGICAL)
1052 0 : ereport(ERROR,
1053 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1054 : errmsg("replication slot synchronization requires \"wal_level\" >= \"logical\""));
1055 :
1056 : /*
1057 : * A physical replication slot (primary_slot_name) is required on the
1058 : * primary to ensure that the rows needed by the standby are not removed
1059 : * after restarting, so that the synchronized slot on the standby will not
1060 : * be invalidated.
1061 : */
1062 30 : if (PrimarySlotName == NULL || *PrimarySlotName == '\0')
1063 : {
1064 0 : ereport(elevel,
1065 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1066 : /* translator: %s is a GUC variable name */
1067 : errmsg("replication slot synchronization requires \"%s\" to be set", "primary_slot_name"));
1068 0 : return false;
1069 : }
1070 :
1071 : /*
1072 : * hot_standby_feedback must be enabled to cooperate with the physical
1073 : * replication slot, which allows informing the primary about the xmin and
1074 : * catalog_xmin values on the standby.
1075 : */
1076 30 : if (!hot_standby_feedback)
1077 : {
1078 2 : ereport(elevel,
1079 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1080 : /* translator: %s is a GUC variable name */
1081 : errmsg("replication slot synchronization requires \"%s\" to be enabled",
1082 : "hot_standby_feedback"));
1083 2 : return false;
1084 : }
1085 :
1086 : /*
1087 : * The primary_conninfo is required to make a connection to the primary
1088 : * for getting slot information.
1089 : */
1090 28 : if (PrimaryConnInfo == NULL || *PrimaryConnInfo == '\0')
1091 : {
1092 0 : ereport(elevel,
1093 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1094 : /* translator: %s is a GUC variable name */
1095 : errmsg("replication slot synchronization requires \"%s\" to be set",
1096 : "primary_conninfo"));
1097 0 : return false;
1098 : }
1099 :
1100 28 : return true;
1101 : }
1102 :
1103 : /*
1104 : * Re-read the config file.
1105 : *
1106 : * Exit if any of the slot sync GUCs have changed. The postmaster will
1107 : * restart it.
1108 : */
1109 : static void
1110 2 : slotsync_reread_config(void)
1111 : {
1112 2 : char *old_primary_conninfo = pstrdup(PrimaryConnInfo);
1113 2 : char *old_primary_slotname = pstrdup(PrimarySlotName);
1114 2 : bool old_sync_replication_slots = sync_replication_slots;
1115 2 : bool old_hot_standby_feedback = hot_standby_feedback;
1116 : bool conninfo_changed;
1117 : bool primary_slotname_changed;
1118 :
1119 : Assert(sync_replication_slots);
1120 :
1121 2 : ConfigReloadPending = false;
1122 2 : ProcessConfigFile(PGC_SIGHUP);
1123 :
1124 2 : conninfo_changed = strcmp(old_primary_conninfo, PrimaryConnInfo) != 0;
1125 2 : primary_slotname_changed = strcmp(old_primary_slotname, PrimarySlotName) != 0;
1126 2 : pfree(old_primary_conninfo);
1127 2 : pfree(old_primary_slotname);
1128 :
1129 2 : if (old_sync_replication_slots != sync_replication_slots)
1130 : {
1131 0 : ereport(LOG,
1132 : /* translator: %s is a GUC variable name */
1133 : errmsg("replication slot synchronization worker will shut down because \"%s\" is disabled", "sync_replication_slots"));
1134 0 : proc_exit(0);
1135 : }
1136 :
1137 2 : if (conninfo_changed ||
1138 2 : primary_slotname_changed ||
1139 2 : (old_hot_standby_feedback != hot_standby_feedback))
1140 : {
1141 2 : ereport(LOG,
1142 : errmsg("replication slot synchronization worker will restart because of a parameter change"));
1143 :
1144 : /*
1145 : * Reset the last-start time for this worker so that the postmaster
1146 : * can restart it without waiting for SLOTSYNC_RESTART_INTERVAL_SEC.
1147 : */
1148 2 : SlotSyncCtx->last_start_time = 0;
1149 :
1150 2 : proc_exit(0);
1151 : }
1152 :
1153 0 : }
1154 :
1155 : /*
1156 : * Interrupt handler for main loop of slot sync worker.
1157 : */
1158 : static void
1159 30 : ProcessSlotSyncInterrupts(WalReceiverConn *wrconn)
1160 : {
1161 30 : CHECK_FOR_INTERRUPTS();
1162 :
1163 26 : if (ShutdownRequestPending)
1164 : {
1165 2 : ereport(LOG,
1166 : errmsg("replication slot synchronization worker is shutting down on receiving SIGINT"));
1167 :
1168 2 : proc_exit(0);
1169 : }
1170 :
1171 24 : if (ConfigReloadPending)
1172 2 : slotsync_reread_config();
1173 22 : }
1174 :
1175 : /*
1176 : * Connection cleanup function for slotsync worker.
1177 : *
1178 : * Called on slotsync worker exit.
1179 : */
1180 : static void
1181 8 : slotsync_worker_disconnect(int code, Datum arg)
1182 : {
1183 8 : WalReceiverConn *wrconn = (WalReceiverConn *) DatumGetPointer(arg);
1184 :
1185 8 : walrcv_disconnect(wrconn);
1186 8 : }
1187 :
1188 : /*
1189 : * Cleanup function for slotsync worker.
1190 : *
1191 : * Called on slotsync worker exit.
1192 : */
1193 : static void
1194 8 : slotsync_worker_onexit(int code, Datum arg)
1195 : {
1196 : /*
1197 : * We need to do slots cleanup here just like WalSndErrorCleanup() does.
1198 : *
1199 : * The startup process during promotion invokes ShutDownSlotSync() which
1200 : * waits for slot sync to finish and it does that by checking the
1201 : * 'syncing' flag. Thus the slot sync worker must be done with slots'
1202 : * release and cleanup to avoid any dangling temporary slots or active
1203 : * slots before it marks itself as finished syncing.
1204 : */
1205 :
1206 : /* Make sure active replication slots are released */
1207 8 : if (MyReplicationSlot != NULL)
1208 0 : ReplicationSlotRelease();
1209 :
1210 : /* Also cleanup the temporary slots. */
1211 8 : ReplicationSlotCleanup(false);
1212 :
1213 8 : SpinLockAcquire(&SlotSyncCtx->mutex);
1214 :
1215 8 : SlotSyncCtx->pid = InvalidPid;
1216 :
1217 : /*
1218 : * If syncing_slots is true, it indicates that the process errored out
1219 : * without resetting the flag. So, we need to clean up shared memory and
1220 : * reset the flag here.
1221 : */
1222 8 : if (syncing_slots)
1223 : {
1224 8 : SlotSyncCtx->syncing = false;
1225 8 : syncing_slots = false;
1226 : }
1227 :
1228 8 : SpinLockRelease(&SlotSyncCtx->mutex);
1229 8 : }
1230 :
1231 : /*
1232 : * Sleep for long enough that we believe it's likely that the slots on the
1233 : * primary get updated.
1234 : *
1235 : * If there is no slot activity the wait time between sync-cycles will double
1236 : * (to a maximum of 30s). If there is some slot activity the wait time between
1237 : * sync-cycles is reset to the minimum (200ms).
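 : * Starting from 200ms, repeated idle cycles double the nap (400ms, 800ms,
 : * ...) until the 30s cap is reached.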
1238 : */
1239 : static void
1240 22 : wait_for_slot_activity(bool some_slot_updated)
1241 : {
1242 : int rc;
1243 :
1244 22 : if (!some_slot_updated)
1245 : {
1246 : /*
1247 : * No slots were updated, so double the sleep time, but not beyond the
1248 : * maximum allowable value.
1249 : */
1250 12 : sleep_ms = Min(sleep_ms * 2, MAX_SLOTSYNC_WORKER_NAPTIME_MS);
1251 : }
1252 : else
1253 : {
1254 : /*
1255 : * Some slots were updated since the last sleep, so reset the sleep
1256 : * time.
1257 : */
1258 10 : sleep_ms = MIN_SLOTSYNC_WORKER_NAPTIME_MS;
1259 : }
1260 :
1261 22 : rc = WaitLatch(MyLatch,
1262 : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
1263 : sleep_ms,
1264 : WAIT_EVENT_REPLICATION_SLOTSYNC_MAIN);
1265 :
1266 22 : if (rc & WL_LATCH_SET)
1267 8 : ResetLatch(MyLatch);
1268 22 : }
1269 :
1270 : /*
1271 : * Emit an error if a promotion or a concurrent sync call is in progress.
1272 : * Otherwise, advertise that a sync is in progress.
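 : *
 : * 'worker_pid' is the slot sync worker's PID, or InvalidPid when called
 : * from the SQL function pg_sync_replication_slots().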
1273 : */
1274 : static void
1275 24 : check_and_set_sync_info(pid_t worker_pid)
1276 : {
1277 24 : SpinLockAcquire(&SlotSyncCtx->mutex);
1278 :
1279 : /* The worker pid must not be already assigned in SlotSyncCtx */
1280 : Assert(worker_pid == InvalidPid || SlotSyncCtx->pid == InvalidPid);
1281 :
1282 : /*
1283 : * Emit an error if the startup process signaled the slot sync machinery to
1284 : * stop. See comments atop SlotSyncCtxStruct.
1285 : */
1286 24 : if (SlotSyncCtx->stopSignaled)
1287 : {
1288 0 : SpinLockRelease(&SlotSyncCtx->mutex);
1289 0 : ereport(ERROR,
1290 : errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1291 : errmsg("cannot synchronize replication slots when standby promotion is ongoing"));
1292 : }
1293 :
1294 24 : if (SlotSyncCtx->syncing)
1295 : {
1296 0 : SpinLockRelease(&SlotSyncCtx->mutex);
1297 0 : ereport(ERROR,
1298 : errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1299 : errmsg("cannot synchronize replication slots concurrently"));
1300 : }
1301 :
1302 24 : SlotSyncCtx->syncing = true;
1303 :
1304 : /*
1305 : * Advertise the required PID so that the startup process can kill the
1306 : * slot sync worker on promotion.
1307 : */
1308 24 : SlotSyncCtx->pid = worker_pid;
1309 :
1310 24 : SpinLockRelease(&SlotSyncCtx->mutex);
1311 :
1312 24 : syncing_slots = true;
1313 24 : }
1314 :
1315 : /*
1316 : * Reset syncing flag.
1317 : */
1318 : static void
1319 16 : reset_syncing_flag()
1320 : {
1321 16 : SpinLockAcquire(&SlotSyncCtx->mutex);
1322 16 : SlotSyncCtx->syncing = false;
1323 16 : SpinLockRelease(&SlotSyncCtx->mutex);
1324 :
1325 16 : syncing_slots = false;
1326 16 : }
1327 :
1328 : /*
1329 : * The main loop of our worker process.
1330 : *
1331 : * It connects to the primary server, fetches logical failover slots
1332 : * information periodically in order to create and sync the slots.
1333 : */
1334 : void
1335 8 : ReplSlotSyncWorkerMain(const void *startup_data, size_t startup_data_len)
1336 : {
1337 8 : WalReceiverConn *wrconn = NULL;
1338 : char *dbname;
1339 : char *err;
1340 : sigjmp_buf local_sigjmp_buf;
1341 : StringInfoData app_name;
1342 :
1343 : Assert(startup_data_len == 0);
1344 :
1345 8 : MyBackendType = B_SLOTSYNC_WORKER;
1346 :
1347 8 : init_ps_display(NULL);
1348 :
1349 : Assert(GetProcessingMode() == InitProcessing);
1350 :
1351 : /*
1352 : * Create a per-backend PGPROC struct in shared memory. We must do this
1353 : * before we access any shared memory.
1354 : */
1355 8 : InitProcess();
1356 :
1357 : /*
1358 : * Early initialization.
1359 : */
1360 8 : BaseInit();
1361 :
1362 : Assert(SlotSyncCtx != NULL);
1363 :
1364 : /*
1365 : * If an exception is encountered, processing resumes here.
1366 : *
1367 : * We just need to clean up, report the error, and go away.
1368 : *
1369 : * If we do not have this handling here, then since this worker process
1370 : * operates at the bottom of the exception stack, ERRORs turn into FATALs.
1371 : * Therefore, we create our own exception handler to catch ERRORs.
1372 : */
1373 8 : if (sigsetjmp(local_sigjmp_buf, 1) != 0)
1374 : {
1375 : /* since not using PG_TRY, must reset error stack by hand */
1376 0 : error_context_stack = NULL;
1377 :
1378 : /* Prevents interrupts while cleaning up */
1379 0 : HOLD_INTERRUPTS();
1380 :
1381 : /* Report the error to the server log */
1382 0 : EmitErrorReport();
1383 :
1384 : /*
1385 : * We can now go away. Note that because we called InitProcess, a
1386 : * callback was registered to do ProcKill, which will clean up
1387 : * necessary state.
1388 : */
1389 0 : proc_exit(0);
1390 : }
1391 :
1392 : /* We can now handle ereport(ERROR) */
1393 8 : PG_exception_stack = &local_sigjmp_buf;
1394 :
1395 : /* Setup signal handling */
1396 8 : pqsignal(SIGHUP, SignalHandlerForConfigReload);
1397 8 : pqsignal(SIGINT, SignalHandlerForShutdownRequest);
1398 8 : pqsignal(SIGTERM, die);
1399 8 : pqsignal(SIGFPE, FloatExceptionHandler);
1400 8 : pqsignal(SIGUSR1, procsignal_sigusr1_handler);
1401 8 : pqsignal(SIGUSR2, SIG_IGN);
1402 8 : pqsignal(SIGPIPE, SIG_IGN);
1403 8 : pqsignal(SIGCHLD, SIG_DFL);
1404 :
1405 8 : check_and_set_sync_info(MyProcPid);
1406 :
1407 8 : ereport(LOG, errmsg("slot sync worker started"));
1408 :
1409 : /* Register it as soon as SlotSyncCtx->pid is initialized. */
1410 8 : before_shmem_exit(slotsync_worker_onexit, (Datum) 0);
1411 :
1412 : /*
1413 : * Establishes SIGALRM handler and initialize timeout module. It is needed
1414 : * by InitPostgres to register different timeouts.
1415 : */
1416 8 : InitializeTimeouts();
1417 :
1418 : /* Load the libpq-specific functions */
1419 8 : load_file("libpqwalreceiver", false);
1420 :
1421 : /*
1422 : * Unblock signals (they were blocked when the postmaster forked us)
1423 : */
1424 8 : sigprocmask(SIG_SETMASK, &UnBlockSig, NULL);
1425 :
1426 : /*
1427 : * Set always-secure search path, so malicious users can't redirect user
1428 : * code (e.g. operators).
1429 : *
1430 : * It's not strictly necessary since we won't be scanning or writing to
1431 : * any user table locally, but it's good to retain it here for added
1432 : * precaution.
1433 : */
1434 8 : SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE);
1435 :
1436 8 : dbname = CheckAndGetDbnameFromConninfo();
1437 :
1438 : /*
1439 : * Connect to the database specified by the user in primary_conninfo. We
1440 : * need a database connection for walrcv_exec to work which we use to
1441 : * fetch slot information from the remote node. See comments atop
1442 : * libpqrcv_exec.
1443 : *
1444 : * We do not specify a specific user here since the slot sync worker will
1445 : * operate as a superuser. This is safe because the slot sync worker does
1446 : * not interact with user tables, eliminating the risk of executing
1447 : * arbitrary code within triggers.
1448 : */
1449 8 : InitPostgres(dbname, InvalidOid, NULL, InvalidOid, 0, NULL);
1450 :
1451 8 : SetProcessingMode(NormalProcessing);
1452 :
1453 8 : initStringInfo(&app_name);
1454 8 : if (cluster_name[0])
1455 8 : appendStringInfo(&app_name, "%s_%s", cluster_name, "slotsync worker");
1456 : else
1457 0 : appendStringInfoString(&app_name, "slotsync worker");
1458 :
1459 : /*
1460 : * Establish the connection to the primary server for slot
1461 : * synchronization.
1462 : */
1463 8 : wrconn = walrcv_connect(PrimaryConnInfo, false, false, false,
1464 : app_name.data, &err);
1465 8 : if (!wrconn)
1466 0 : ereport(ERROR,
1467 : errcode(ERRCODE_CONNECTION_FAILURE),
1468 : errmsg("synchronization worker \"%s\" could not connect to the primary server: %s",
1469 : app_name.data, err));
1470 :
1471 8 : pfree(app_name.data);
1472 :
1473 : /*
1474 : * Register the disconnection callback.
1475 : *
1476 : * XXX: This can be combined with previous cleanup registration of
1477 : * slotsync_worker_onexit() but that will need the connection to be made
1478 : * global and we want to avoid introducing a global for this purpose.
1479 : */
1480 8 : before_shmem_exit(slotsync_worker_disconnect, PointerGetDatum(wrconn));
1481 :
1482 : /*
1483 : * Using the specified primary server connection, check that we are not a
1484 : * cascading standby and slot configured in 'primary_slot_name' exists on
1485 : * the primary server.
1486 : */
1487 8 : validate_remote_info(wrconn);
1488 :
1489 : /* Main loop to synchronize slots */
1490 : for (;;)
1491 22 : {
1492 30 : bool some_slot_updated = false;
1493 :
1494 30 : ProcessSlotSyncInterrupts(wrconn);
1495 :
1496 22 : some_slot_updated = synchronize_slots(wrconn);
1497 :
1498 22 : wait_for_slot_activity(some_slot_updated);
1499 : }
1500 :
1501 : /*
1502 : * The slot sync worker can't get here because it will only stop when it
1503 : * receives a SIGINT from the startup process, or when there is an error.
1504 : */
1505 : Assert(false);
1506 : }
1507 :
1508 : /*
1509 : * Update the inactive_since property for synced slots.
1510 : *
1511 : * Note that this function is currently called when we shut down the slot
1512 : * sync machinery.
1513 : */
1514 : static void
1515 1688 : update_synced_slots_inactive_since(void)
1516 : {
1517 1688 : TimestampTz now = 0;
1518 :
1519 : /*
1520 : * We need to update inactive_since only when we are promoting the standby,
1521 : * so that inactive_since is interpreted correctly if the standby gets
1522 : * promoted without a restart. We don't want the slots to appear inactive for a
1523 : * long time after promotion if they haven't been synchronized recently.
1524 : * Whoever acquires the slot, i.e., makes the slot active, will reset it.
1525 : */
1526 1688 : if (!StandbyMode)
1527 1596 : return;
1528 :
1529 : /* The slot sync worker or SQL function mustn't be running by now */
1530 : Assert((SlotSyncCtx->pid == InvalidPid) && !SlotSyncCtx->syncing);
1531 :
1532 92 : LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
1533 :
1534 988 : for (int i = 0; i < max_replication_slots; i++)
1535 : {
1536 896 : ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
1537 :
1538 : /* Check if it is a synchronized slot */
1539 896 : if (s->in_use && s->data.synced)
1540 : {
1541 : Assert(SlotIsLogical(s));
1542 :
1543 : /* The slot must not be acquired by any process */
1544 : Assert(s->active_pid == 0);
1545 :
1546 : /* Use the same inactive_since time for all the slots. */
1547 6 : if (now == 0)
1548 4 : now = GetCurrentTimestamp();
1549 :
1550 6 : ReplicationSlotSetInactiveSince(s, now, true);
1551 : }
1552 : }
1553 :
1554 92 : LWLockRelease(ReplicationSlotControlLock);
1555 : }
1556 :
1557 : /*
1558 : * Shut down the slot sync worker.
1559 : *
1560 : * This function sends signal to shutdown slot sync worker, if required. It
1561 : * also waits till the slot sync worker has exited or
1562 : * pg_sync_replication_slots() has finished.
1563 : */
1564 : void
1565 1688 : ShutDownSlotSync(void)
1566 : {
1567 : pid_t worker_pid;
1568 :
1569 1688 : SpinLockAcquire(&SlotSyncCtx->mutex);
1570 :
1571 1688 : SlotSyncCtx->stopSignaled = true;
1572 :
1573 : /*
1574 : * Return if neither the slot sync worker is running nor the function
1575 : * pg_sync_replication_slots() is executing.
1576 : */
1577 1688 : if (!SlotSyncCtx->syncing)
1578 : {
1579 1686 : SpinLockRelease(&SlotSyncCtx->mutex);
1580 1686 : update_synced_slots_inactive_since();
1581 1686 : return;
1582 : }
1583 :
1584 2 : worker_pid = SlotSyncCtx->pid;
1585 :
1586 2 : SpinLockRelease(&SlotSyncCtx->mutex);
1587 :
1588 2 : if (worker_pid != InvalidPid)
1589 2 : kill(worker_pid, SIGINT);
1590 :
1591 : /* Wait for slot sync to end */
1592 : for (;;)
1593 0 : {
1594 : int rc;
1595 :
1596 : /* Wait a bit, we don't expect to have to wait long */
1597 2 : rc = WaitLatch(MyLatch,
1598 : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
1599 : 10L, WAIT_EVENT_REPLICATION_SLOTSYNC_SHUTDOWN);
1600 :
1601 2 : if (rc & WL_LATCH_SET)
1602 : {
1603 0 : ResetLatch(MyLatch);
1604 0 : CHECK_FOR_INTERRUPTS();
1605 : }
1606 :
1607 2 : SpinLockAcquire(&SlotSyncCtx->mutex);
1608 :
1609 : /* Ensure that no process is syncing the slots. */
1610 2 : if (!SlotSyncCtx->syncing)
1611 2 : break;
1612 :
1613 0 : SpinLockRelease(&SlotSyncCtx->mutex);
1614 : }
1615 :
1616 2 : SpinLockRelease(&SlotSyncCtx->mutex);
1617 :
1618 2 : update_synced_slots_inactive_since();
1619 : }
1620 :
1621 : /*
1622 : * SlotSyncWorkerCanRestart
1623 : *
1624 : * Returns true if enough time (SLOTSYNC_RESTART_INTERVAL_SEC) has passed
1625 : * since it was launched last. Otherwise returns false.
1626 : *
1627 : * This is a safety valve to protect against continuous respawn attempts if the
1628 : * worker is dying immediately at launch. Note that since we will retry to
1629 : * launch the worker from the postmaster main loop, we will get another
1630 : * chance later.
1631 : */
1632 : bool
1633 10 : SlotSyncWorkerCanRestart(void)
1634 : {
1635 10 : time_t curtime = time(NULL);
1636 :
1637 : /* Return false if too soon since last start. */
1638 10 : if ((unsigned int) (curtime - SlotSyncCtx->last_start_time) <
1639 : (unsigned int) SLOTSYNC_RESTART_INTERVAL_SEC)
1640 2 : return false;
1641 :
1642 8 : SlotSyncCtx->last_start_time = curtime;
1643 :
1644 8 : return true;
1645 : }
1646 :
1647 : /*
1648 : * Is current process syncing replication slots?
1649 : *
1650 : * Could be either backend executing SQL function or slot sync worker.
1651 : */
1652 : bool
1653 42 : IsSyncingReplicationSlots(void)
1654 : {
1655 42 : return syncing_slots;
1656 : }
1657 :
1658 : /*
1659 : * Amount of shared memory required for slot synchronization.
1660 : */
1661 : Size
1662 5964 : SlotSyncShmemSize(void)
1663 : {
1664 5964 : return sizeof(SlotSyncCtxStruct);
1665 : }
1666 :
1667 : /*
1668 : * Allocate and initialize the shared memory of slot synchronization.
1669 : */
1670 : void
1671 2084 : SlotSyncShmemInit(void)
1672 : {
1673 2084 : Size size = SlotSyncShmemSize();
1674 : bool found;
1675 :
1676 2084 : SlotSyncCtx = (SlotSyncCtxStruct *)
1677 2084 : ShmemInitStruct("Slot Sync Data", size, &found);
1678 :
1679 2084 : if (!found)
1680 : {
1681 2084 : memset(SlotSyncCtx, 0, size);
1682 2084 : SlotSyncCtx->pid = InvalidPid;
1683 2084 : SpinLockInit(&SlotSyncCtx->mutex);
1684 : }
1685 2084 : }
1686 :
1687 : /*
1688 : * Error cleanup callback for slot sync SQL function.
1689 : */
1690 : static void
1691 2 : slotsync_failure_callback(int code, Datum arg)
1692 : {
1693 2 : WalReceiverConn *wrconn = (WalReceiverConn *) DatumGetPointer(arg);
1694 :
1695 : /*
1696 : * We need to do slots cleanup here just like WalSndErrorCleanup() does.
1697 : *
1698 : * The startup process during promotion invokes ShutDownSlotSync() which
1699 : * waits for slot sync to finish and it does that by checking the
1700 : * 'syncing' flag. Thus the SQL function must be done with slots' release
1701 : * and cleanup to avoid any dangling temporary slots or active slots
1702 : * before it marks itself as finished syncing.
1703 : */
1704 :
1705 : /* Make sure active replication slots are released */
1706 2 : if (MyReplicationSlot != NULL)
1707 0 : ReplicationSlotRelease();
1708 :
1709 : /* Also cleanup the synced temporary slots. */
1710 2 : ReplicationSlotCleanup(true);
1711 :
1712 : /*
1713 : * If syncing_slots is true, it indicates that the process errored out
1714 : * without resetting the flag. So, we need to clean up shared memory and
1715 : * reset the flag here.
1716 : */
1717 2 : if (syncing_slots)
1718 2 : reset_syncing_flag();
1719 :
1720 2 : walrcv_disconnect(wrconn);
1721 2 : }
1722 :
1723 : /*
1724 : * Synchronize the failover enabled replication slots using the specified
1725 : * primary server connection.
1726 : */
1727 : void
1728 16 : SyncReplicationSlots(WalReceiverConn *wrconn)
1729 : {
1730 16 : PG_ENSURE_ERROR_CLEANUP(slotsync_failure_callback, PointerGetDatum(wrconn));
1731 : {
1732 16 : check_and_set_sync_info(InvalidPid);
1733 :
1734 16 : validate_remote_info(wrconn);
1735 :
1736 14 : synchronize_slots(wrconn);
1737 :
1738 : /* Cleanup the synced temporary slots */
1739 14 : ReplicationSlotCleanup(true);
1740 :
1741 : /* We are done with sync, so reset sync flag */
1742 14 : reset_syncing_flag();
1743 : }
1744 16 : PG_END_ENSURE_ERROR_CLEANUP(slotsync_failure_callback, PointerGetDatum(wrconn));
1745 14 : }
|