Line data Source code
1 : /*-------------------------------------------------------------------------
2 : * slotsync.c
3 : * Functionality for synchronizing slots to a standby server from the
4 : * primary server.
5 : *
6 : * Copyright (c) 2024-2025, PostgreSQL Global Development Group
7 : *
8 : * IDENTIFICATION
9 : * src/backend/replication/logical/slotsync.c
10 : *
11 : * This file contains the code for slot synchronization on a physical standby
12 : * to fetch logical failover slots information from the primary server, create
13 : * the slots on the standby and synchronize them periodically.
14 : *
15 : * Slot synchronization can be performed either automatically by enabling slot
16 : * sync worker or manually by calling SQL function pg_sync_replication_slots().
17 : *
18 : * If the WAL corresponding to the remote's restart_lsn is not available on the
19 : * physical standby or the remote's catalog_xmin precedes the oldest xid for
20 : * which it is guaranteed that rows wouldn't have been removed then we cannot
21 : * create the local standby slot because that would mean moving the local slot
22 : * backward and decoding won't be possible via such a slot. In this case, the
23 : * slot will be marked as RS_TEMPORARY. Once the primary server catches up,
24 : * the slot will be marked as RS_PERSISTENT (which means sync-ready) after
25 : * which slot sync worker can perform the sync periodically or user can call
26 : * pg_sync_replication_slots() periodically to perform the syncs.
27 : *
28 : * If synchronized slots fail to build a consistent snapshot from the
29 : * restart_lsn before reaching confirmed_flush_lsn, they would become
30 : * unreliable after promotion due to potential data loss from changes
31 : * before reaching a consistent point. This can happen because the slots can
32 : * be synced at some random time and we may not reach the consistent point
33 : * at the same WAL location as the primary. So, we mark such slots as
34 : * RS_TEMPORARY. Once the decoding from corresponding LSNs can reach a
35 : * consistent point, they will be marked as RS_PERSISTENT.
36 : *
37 : * The slot sync worker waits for some time before the next synchronization,
38 : * with the duration varying based on whether any slots were updated during
39 : * the last cycle. Refer to the comments above wait_for_slot_activity() for
40 : * more details.
41 : *
42 : * Any standby synchronized slots will be dropped if they no longer need
43 : * to be synchronized. See comment atop drop_local_obsolete_slots() for more
44 : * details.
45 : *---------------------------------------------------------------------------
46 : */
47 :
48 : #include "postgres.h"
49 :
50 : #include <time.h>
51 :
52 : #include "access/xlog_internal.h"
53 : #include "access/xlogrecovery.h"
54 : #include "catalog/pg_database.h"
55 : #include "commands/dbcommands.h"
56 : #include "libpq/pqsignal.h"
57 : #include "pgstat.h"
58 : #include "postmaster/interrupt.h"
59 : #include "replication/logical.h"
60 : #include "replication/slotsync.h"
61 : #include "replication/snapbuild.h"
62 : #include "storage/ipc.h"
63 : #include "storage/lmgr.h"
64 : #include "storage/proc.h"
65 : #include "storage/procarray.h"
66 : #include "tcop/tcopprot.h"
67 : #include "utils/builtins.h"
68 : #include "utils/pg_lsn.h"
69 : #include "utils/ps_status.h"
70 : #include "utils/timeout.h"
71 :
72 : /*
73 : * Struct for sharing information to control slot synchronization.
74 : *
75 : * The slot sync worker's pid is needed by the startup process to shut it
76 : * down during promotion. The startup process shuts down the slot sync worker
77 : * and also sets stopSignaled=true to handle the race condition when the
78 : * postmaster has not noticed the promotion yet and thus may end up restarting
79 : * the slot sync worker. If stopSignaled is set, the worker will exit in such a
80 : * case. The SQL function pg_sync_replication_slots() will also error out if
81 : * this flag is set. Note that we don't need to reset this variable as after
82 : * promotion the slot sync worker won't be restarted because the pmState
83 : * changes to PM_RUN from PM_HOT_STANDBY and we don't support demoting
84 : * primary without restarting the server. See LaunchMissingBackgroundProcesses.
85 : *
86 : * The 'syncing' flag is needed to prevent concurrent slot syncs to avoid slot
87 : * overwrites.
88 : *
89 : * The 'last_start_time' is needed by postmaster to start the slot sync worker
90 : * once per SLOTSYNC_RESTART_INTERVAL_SEC. In cases where an immediate restart
91 : * is expected (e.g., slot sync GUCs change), slot sync worker will reset
92 : * last_start_time before exiting, so that postmaster can start the worker
93 : * without waiting for SLOTSYNC_RESTART_INTERVAL_SEC.
94 : */
95 : typedef struct SlotSyncCtxStruct
96 : {
97 : pid_t pid; /* slot sync worker's pid; used by the startup process to shut the worker down during promotion */
98 : bool stopSignaled; /* set by the startup process at promotion; never reset afterwards */
99 : bool syncing; /* true while a slot sync is in progress; prevents concurrent syncs */
100 : time_t last_start_time; /* lets postmaster rate-limit worker restarts to one per SLOTSYNC_RESTART_INTERVAL_SEC */
101 : slock_t mutex; /* spinlock; presumably protects the fields above -- confirm against the users of SlotSyncCtx */
102 : } SlotSyncCtxStruct;
103 :
104 : static SlotSyncCtxStruct *SlotSyncCtx = NULL; /* pointer to the slot sync control struct; NULL until assigned */
105 :
106 : /* GUC variable */
107 : bool sync_replication_slots = false;
108 :
109 : /*
110 : * The sleep time (ms) between slot-sync cycles varies dynamically
111 : * (within a MIN/MAX range) according to slot activity. See
112 : * wait_for_slot_activity() for details.
113 : */
114 : #define MIN_SLOTSYNC_WORKER_NAPTIME_MS 200
115 : #define MAX_SLOTSYNC_WORKER_NAPTIME_MS 30000 /* 30s */
116 :
117 : static long sleep_ms = MIN_SLOTSYNC_WORKER_NAPTIME_MS; /* current nap time (ms); starts at the minimum */
118 :
119 : /* The restart interval (in seconds) for the slot sync worker, used by postmaster */
120 : #define SLOTSYNC_RESTART_INTERVAL_SEC 10
121 :
122 : /*
123 : * Flag to tell if we are syncing replication slots. Unlike the 'syncing' flag
124 : * in SlotSyncCtxStruct, this flag is true only if the current process is
125 : * performing slot synchronization.
126 : */
127 : static bool syncing_slots = false;
128 :
129 : /*
130 : * Structure to hold information fetched from the primary server about a logical
131 : * replication slot.
132 : */
133 : typedef struct RemoteSlot
134 : {
135 : char *name; /* slot_name column of the remote slot */
136 : char *plugin; /* output plugin name */
137 : char *database; /* database column, kept as text */
138 : bool two_phase; /* two_phase column */
139 : bool failover; /* failover column */
140 : XLogRecPtr restart_lsn; /* InvalidXLogRecPtr if the remote value was NULL */
141 : XLogRecPtr confirmed_lsn; /* remote confirmed_flush_lsn; InvalidXLogRecPtr if NULL */
142 : TransactionId catalog_xmin; /* InvalidTransactionId if the remote value was NULL */
143 :
144 : /* RS_INVAL_NONE if valid, or the reason of invalidation */
145 : ReplicationSlotInvalidationCause invalidated;
146 : } RemoteSlot;
147 :
148 : static void slotsync_failure_callback(int code, Datum arg);
149 : static void update_synced_slots_inactive_since(void);
150 :
151 : /*
152 : * If necessary, update the local synced slot's metadata based on the data
153 : * from the remote slot.
154 : *
155 : * If no update was needed (the data of the remote slot is the same as the
156 : * local slot) return false, otherwise true.
157 : *
158 : * *found_consistent_snapshot will be true iff the remote slot's LSN or xmin is
159 : * modified, and decoding from the corresponding LSN's can reach a
160 : * consistent snapshot.
161 : *
162 : * *remote_slot_precedes will be true if the remote slot's LSN or xmin
163 : * precedes locally reserved position.
164 : */
165 : static bool
166 60 : update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid,
167 : bool *found_consistent_snapshot,
168 : bool *remote_slot_precedes)
169 : {
170 60 : ReplicationSlot *slot = MyReplicationSlot;
171 60 : bool updated_xmin_or_lsn = false;
172 60 : bool updated_config = false;
173 :
174 : Assert(slot->data.invalidated == RS_INVAL_NONE);
175 :
176 60 : if (found_consistent_snapshot)
177 10 : *found_consistent_snapshot = false;
178 :
179 60 : if (remote_slot_precedes)
180 10 : *remote_slot_precedes = false;
181 :
182 : /*
183 : * Don't overwrite if we already have a newer catalog_xmin and
184 : * restart_lsn.
185 : */
186 120 : if (remote_slot->restart_lsn < slot->data.restart_lsn ||
187 60 : TransactionIdPrecedes(remote_slot->catalog_xmin,
188 : slot->data.catalog_xmin))
189 : {
190 : /*
191 : * This can happen in following situations:
192 : *
193 : * If the slot is temporary, it means either the initial WAL location
194 : * reserved for the local slot is ahead of the remote slot's
195 : * restart_lsn or the initial xmin_horizon computed for the local slot
196 : * is ahead of the remote slot.
197 : *
198 : * If the slot is persistent, restart_lsn of the synced slot could
199 : * still be ahead of the remote slot. Since we use slot advance
200 : * functionality to keep snapbuild/slot updated, it is possible that
201 : * the restart_lsn is advanced to a later position than it has on the
202 : * primary. This can happen when slot advancing machinery finds
203 : * running xacts record after reaching the consistent state at a later
204 : * point than the primary where it serializes the snapshot and updates
205 : * the restart_lsn.
206 : *
207 : * We LOG the message if the slot is temporary as it can help the user
208 : * to understand why the slot is not sync-ready. In the case of a
209 : * persistent slot, it would be a more common case and won't directly
210 : * impact the users, so we used DEBUG1 level to log the message.
211 : */
212 0 : ereport(slot->data.persistency == RS_TEMPORARY ? LOG : DEBUG1,
213 : errmsg("could not synchronize replication slot \"%s\" because remote slot precedes local slot",
214 : remote_slot->name),
215 : errdetail("The remote slot has LSN %X/%X and catalog xmin %u, but the local slot has LSN %X/%X and catalog xmin %u.",
216 : LSN_FORMAT_ARGS(remote_slot->restart_lsn),
217 : remote_slot->catalog_xmin,
218 : LSN_FORMAT_ARGS(slot->data.restart_lsn),
219 : slot->data.catalog_xmin));
220 :
221 0 : if (remote_slot_precedes)
222 0 : *remote_slot_precedes = true;
223 : }
224 :
225 : /*
226 : * Attempt to sync LSNs and xmins only if remote slot is ahead of local
227 : * slot.
228 : */
229 60 : else if (remote_slot->confirmed_lsn > slot->data.confirmed_flush ||
230 82 : remote_slot->restart_lsn > slot->data.restart_lsn ||
231 40 : TransactionIdFollows(remote_slot->catalog_xmin,
232 : slot->data.catalog_xmin))
233 : {
234 : /*
235 : * We can't directly copy the remote slot's LSN or xmin unless there
236 : * exists a consistent snapshot at that point. Otherwise, after
237 : * promotion, the slots may not reach a consistent point before the
238 : * confirmed_flush_lsn which can lead to a data loss. To avoid data
239 : * loss, we let slot machinery advance the slot which ensures that
240 : * snapbuilder/slot statuses are updated properly.
241 : */
242 20 : if (SnapBuildSnapshotExists(remote_slot->restart_lsn))
243 : {
244 : /*
245 : * Update the slot info directly if there is a serialized snapshot
246 : * at the restart_lsn, as the slot can quickly reach consistency
247 : * at restart_lsn by restoring the snapshot.
248 : */
249 4 : SpinLockAcquire(&slot->mutex);
250 4 : slot->data.restart_lsn = remote_slot->restart_lsn;
251 4 : slot->data.confirmed_flush = remote_slot->confirmed_lsn;
252 4 : slot->data.catalog_xmin = remote_slot->catalog_xmin;
253 4 : SpinLockRelease(&slot->mutex);
254 :
255 4 : if (found_consistent_snapshot)
256 0 : *found_consistent_snapshot = true;
257 : }
258 : else
259 : {
260 16 : LogicalSlotAdvanceAndCheckSnapState(remote_slot->confirmed_lsn,
261 : found_consistent_snapshot);
262 :
263 : /* Sanity check */
264 16 : if (slot->data.confirmed_flush != remote_slot->confirmed_lsn)
265 0 : ereport(ERROR,
266 : errmsg_internal("synchronized confirmed_flush for slot \"%s\" differs from remote slot",
267 : remote_slot->name),
268 : errdetail_internal("Remote slot has LSN %X/%X but local slot has LSN %X/%X.",
269 : LSN_FORMAT_ARGS(remote_slot->confirmed_lsn),
270 : LSN_FORMAT_ARGS(slot->data.confirmed_flush)));
271 : }
272 :
273 20 : updated_xmin_or_lsn = true;
274 : }
275 :
276 60 : if (remote_dbid != slot->data.database ||
277 60 : remote_slot->two_phase != slot->data.two_phase ||
278 60 : remote_slot->failover != slot->data.failover ||
279 60 : strcmp(remote_slot->plugin, NameStr(slot->data.plugin)) != 0)
280 : {
281 : NameData plugin_name;
282 :
283 : /* Avoid expensive operations while holding a spinlock. */
284 0 : namestrcpy(&plugin_name, remote_slot->plugin);
285 :
286 0 : SpinLockAcquire(&slot->mutex);
287 0 : slot->data.plugin = plugin_name;
288 0 : slot->data.database = remote_dbid;
289 0 : slot->data.two_phase = remote_slot->two_phase;
290 0 : slot->data.failover = remote_slot->failover;
291 0 : SpinLockRelease(&slot->mutex);
292 :
293 0 : updated_config = true;
294 : }
295 :
296 : /*
297 : * We have to write the changed xmin to disk *before* we change the
298 : * in-memory value, otherwise after a crash we wouldn't know that some
299 : * catalog tuples might have been removed already.
300 : */
301 60 : if (updated_config || updated_xmin_or_lsn)
302 : {
303 20 : ReplicationSlotMarkDirty();
304 20 : ReplicationSlotSave();
305 : }
306 :
307 : /*
308 : * Now the new xmin is safely on disk, we can let the global value
309 : * advance. We do not take ProcArrayLock or similar since we only advance
310 : * xmin here and there's not much harm done by a concurrent computation
311 : * missing that.
312 : */
313 60 : if (updated_xmin_or_lsn)
314 : {
315 20 : SpinLockAcquire(&slot->mutex);
316 20 : slot->effective_catalog_xmin = remote_slot->catalog_xmin;
317 20 : SpinLockRelease(&slot->mutex);
318 :
319 20 : ReplicationSlotsComputeRequiredXmin(false);
320 20 : ReplicationSlotsComputeRequiredLSN();
321 : }
322 :
323 60 : return updated_config || updated_xmin_or_lsn;
324 : }
325 :
326 : /*
327 : * Get the list of local logical slots that are synchronized from the
328 : * primary server.
329 : */
330 : static List *
331 34 : get_local_synced_slots(void)
332 : {
333 34 : List *local_slots = NIL;
334 :
335 34 : LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
336 :
337 374 : for (int i = 0; i < max_replication_slots; i++)
338 : {
339 340 : ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
340 :
341 : /* Check if it is a synchronized slot */
342 340 : if (s->in_use && s->data.synced)
343 : {
344 : Assert(SlotIsLogical(s));
345 56 : local_slots = lappend(local_slots, s);
346 : }
347 : }
348 :
349 34 : LWLockRelease(ReplicationSlotControlLock);
350 :
351 34 : return local_slots;
352 : }
353 :
354 : /*
355 : * Helper function to check if local_slot is required to be retained.
356 : *
357 : * Return false either if local_slot does not exist in the remote_slots list
358 : * or is invalidated while the corresponding remote slot is still valid,
359 : * otherwise true.
360 : */
361 : static bool
362 56 : local_sync_slot_required(ReplicationSlot *local_slot, List *remote_slots)
363 : {
364 56 : bool remote_exists = false;
365 56 : bool locally_invalidated = false;
366 :
367 138 : foreach_ptr(RemoteSlot, remote_slot, remote_slots)
368 : {
369 80 : if (strcmp(remote_slot->name, NameStr(local_slot->data.name)) == 0)
370 : {
371 54 : remote_exists = true;
372 :
373 : /*
374 : * If remote slot is not invalidated but local slot is marked as
375 : * invalidated, then set locally_invalidated flag.
376 : */
377 54 : SpinLockAcquire(&local_slot->mutex);
378 54 : locally_invalidated =
379 108 : (remote_slot->invalidated == RS_INVAL_NONE) &&
380 54 : (local_slot->data.invalidated != RS_INVAL_NONE);
381 54 : SpinLockRelease(&local_slot->mutex);
382 :
383 54 : break;
384 : }
385 : }
386 :
387 56 : return (remote_exists && !locally_invalidated);
388 : }
389 :
390 : /*
391 : * Drop local obsolete slots.
392 : *
393 : * Drop the local slots that no longer need to be synced i.e. these either do
394 : * not exist on the primary or are no longer enabled for failover.
395 : *
396 : * Additionally, drop any slots that are valid on the primary but got
397 : * invalidated on the standby. This situation may occur due to the following
398 : * reasons:
399 : * - The 'max_slot_wal_keep_size' on the standby is insufficient to retain WAL
400 : * records from the restart_lsn of the slot.
401 : * - 'primary_slot_name' is temporarily reset to null and the physical slot is
402 : * removed.
403 : * These dropped slots will get recreated in next sync-cycle and it is okay to
404 : * drop and recreate such slots as long as these are not consumable on the
405 : * standby (which is the case currently).
406 : *
407 : * Note: Change of 'wal_level' on the primary server to a level lower than
408 : * logical may also result in slot invalidation and removal on the standby.
409 : * This is because such 'wal_level' change is only possible if the logical
410 : * slots are removed on the primary server, so it's expected to see the
411 : * slots being invalidated and removed on the standby too (and re-created
412 : * if they are re-created on the primary server).
413 : */
414 : static void
415 34 : drop_local_obsolete_slots(List *remote_slot_list)
416 : {
417 34 : List *local_slots = get_local_synced_slots();
418 :
419 124 : foreach_ptr(ReplicationSlot, local_slot, local_slots)
420 : {
421 : /* Drop the local slot if it is not required to be retained. */
422 56 : if (!local_sync_slot_required(local_slot, remote_slot_list))
423 : {
424 : bool synced_slot;
425 :
426 : /*
427 : * Use shared lock to prevent a conflict with
428 : * ReplicationSlotsDropDBSlots(), trying to drop the same slot
429 : * during a drop-database operation.
430 : */
431 4 : LockSharedObject(DatabaseRelationId, local_slot->data.database,
432 : 0, AccessShareLock);
433 :
434 : /*
435 : * In the small window between getting the slot to drop and
436 : * locking the database, there is a possibility of a parallel
437 : * database drop by the startup process and the creation of a new
438 : * slot by the user. This new user-created slot may end up using
439 : * the same shared memory as that of 'local_slot'. Thus check if
440 : * local_slot is still the synced one before performing actual
441 : * drop.
442 : */
443 4 : SpinLockAcquire(&local_slot->mutex);
444 4 : synced_slot = local_slot->in_use && local_slot->data.synced;
445 4 : SpinLockRelease(&local_slot->mutex);
446 :
447 4 : if (synced_slot)
448 : {
449 4 : ReplicationSlotAcquire(NameStr(local_slot->data.name), true);
450 4 : ReplicationSlotDropAcquired();
451 : }
452 :
453 4 : UnlockSharedObject(DatabaseRelationId, local_slot->data.database,
454 : 0, AccessShareLock);
455 :
456 4 : ereport(LOG,
457 : errmsg("dropped replication slot \"%s\" of database with OID %u",
458 : NameStr(local_slot->data.name),
459 : local_slot->data.database));
460 : }
461 : }
462 34 : }
463 :
464 : /*
465 : * Reserve WAL for the currently active local slot using the specified WAL
466 : * location (restart_lsn).
467 : *
468 : * If the given WAL location has been removed, reserve WAL using the oldest
469 : * existing WAL segment.
470 : */
471 : static void
472 10 : reserve_wal_for_local_slot(XLogRecPtr restart_lsn)
473 : {
474 : XLogSegNo oldest_segno;
475 : XLogSegNo segno;
476 10 : ReplicationSlot *slot = MyReplicationSlot;
477 :
478 : Assert(slot != NULL);
479 : Assert(XLogRecPtrIsInvalid(slot->data.restart_lsn));
480 :
481 : while (true)
482 : {
483 10 : SpinLockAcquire(&slot->mutex);
484 10 : slot->data.restart_lsn = restart_lsn;
485 10 : SpinLockRelease(&slot->mutex);
486 :
487 : /* Prevent WAL removal as fast as possible */
488 10 : ReplicationSlotsComputeRequiredLSN();
489 :
490 10 : XLByteToSeg(slot->data.restart_lsn, segno, wal_segment_size);
491 :
492 : /*
493 : * Find the oldest existing WAL segment file.
494 : *
495 : * Normally, we can determine it by using the last removed segment
496 : * number. However, if no WAL segment files have been removed by a
497 : * checkpoint since startup, we need to search for the oldest segment
498 : * file from the current timeline existing in XLOGDIR.
499 : *
500 : * XXX: Currently, we are searching for the oldest segment in the
501 : * current timeline as there is less chance of the slot's restart_lsn
502 : * from being some prior timeline, and even if it happens, in the
503 : * worst case, we will wait to sync till the slot's restart_lsn moved
504 : * to the current timeline.
505 : */
506 10 : oldest_segno = XLogGetLastRemovedSegno() + 1;
507 :
508 10 : if (oldest_segno == 1)
509 : {
510 : TimeLineID cur_timeline;
511 :
512 6 : GetWalRcvFlushRecPtr(NULL, &cur_timeline);
513 6 : oldest_segno = XLogGetOldestSegno(cur_timeline);
514 : }
515 :
516 10 : elog(DEBUG1, "segno: " UINT64_FORMAT " of purposed restart_lsn for the synced slot, oldest_segno: " UINT64_FORMAT " available",
517 : segno, oldest_segno);
518 :
519 : /*
520 : * If all required WAL is still there, great, otherwise retry. The
521 : * slot should prevent further removal of WAL, unless there's a
522 : * concurrent ReplicationSlotsComputeRequiredLSN() after we've written
523 : * the new restart_lsn above, so normally we should never need to loop
524 : * more than twice.
525 : */
526 10 : if (segno >= oldest_segno)
527 10 : break;
528 :
529 : /* Retry using the location of the oldest wal segment */
530 0 : XLogSegNoOffsetToRecPtr(oldest_segno, 0, wal_segment_size, restart_lsn);
531 : }
532 10 : }
533 :
534 : /*
535 : * If the remote restart_lsn and catalog_xmin have caught up with the
536 : * local ones, then update the LSNs and persist the local synced slot for
537 : * future synchronization; otherwise, do nothing.
538 : *
539 : * Return true if the slot is marked as RS_PERSISTENT (sync-ready), otherwise
540 : * false.
541 : */
542 : static bool
543 10 : update_and_persist_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid)
544 : {
545 10 : ReplicationSlot *slot = MyReplicationSlot;
546 10 : bool found_consistent_snapshot = false;
547 10 : bool remote_slot_precedes = false;
548 :
549 10 : (void) update_local_synced_slot(remote_slot, remote_dbid,
550 : &found_consistent_snapshot,
551 : &remote_slot_precedes);
552 :
553 : /*
554 : * Check if the primary server has caught up. Refer to the comment atop
555 : * the file for details on this check.
556 : */
557 10 : if (remote_slot_precedes)
558 : {
559 : /*
560 : * The remote slot didn't catch up to locally reserved position.
561 : *
562 : * We do not drop the slot because the restart_lsn can be ahead of the
563 : * current location when recreating the slot in the next cycle. It may
564 : * take more time to create such a slot. Therefore, we keep this slot
565 : * and attempt the synchronization in the next cycle.
566 : */
567 0 : return false;
568 : }
569 :
570 : /*
571 : * Don't persist the slot if it cannot reach the consistent point from the
572 : * restart_lsn. See comments atop this file.
573 : */
574 10 : if (!found_consistent_snapshot)
575 : {
576 0 : ereport(LOG,
577 : errmsg("could not synchronize replication slot \"%s\"", remote_slot->name),
578 : errdetail("Logical decoding could not find consistent point from local slot's LSN %X/%X.",
579 : LSN_FORMAT_ARGS(slot->data.restart_lsn)));
580 :
581 0 : return false;
582 : }
583 :
584 10 : ReplicationSlotPersist();
585 :
586 10 : ereport(LOG,
587 : errmsg("newly created replication slot \"%s\" is sync-ready now",
588 : remote_slot->name));
589 :
590 10 : return true;
591 : }
592 :
593 : /*
594 : * Synchronize a single slot to the given position.
595 : *
596 : * This creates a new slot if there is no existing one and updates the
597 : * metadata of the slot as per the data received from the primary server.
598 : *
599 : * The slot is created as a temporary slot and stays in the same state until the
600 : * remote_slot catches up with locally reserved position and local slot is
601 : * updated. The slot is then persisted and is considered as sync-ready for
602 : * periodic syncs.
603 : *
604 : * Returns TRUE if the local slot is updated.
605 : */
606 : static bool
607 62 : synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid)
608 : {
609 : ReplicationSlot *slot;
610 : XLogRecPtr latestFlushPtr;
611 62 : bool slot_updated = false;
612 :
613 : /*
614 : * Make sure that concerned WAL is received and flushed before syncing
615 : * slot to target lsn received from the primary server.
616 : */
617 62 : latestFlushPtr = GetStandbyFlushRecPtr(NULL);
618 62 : if (remote_slot->confirmed_lsn > latestFlushPtr)
619 : {
620 : /*
621 : * Can get here only if GUC 'synchronized_standby_slots' on the
622 : * primary server was not configured correctly.
623 : */
624 2 : ereport(AmLogicalSlotSyncWorkerProcess() ? LOG : ERROR,
625 : errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
626 : errmsg("skipping slot synchronization because the received slot sync"
627 : " LSN %X/%X for slot \"%s\" is ahead of the standby position %X/%X",
628 : LSN_FORMAT_ARGS(remote_slot->confirmed_lsn),
629 : remote_slot->name,
630 : LSN_FORMAT_ARGS(latestFlushPtr)));
631 :
632 2 : return false;
633 : }
634 :
635 : /* Search for the named slot */
636 60 : if ((slot = SearchNamedReplicationSlot(remote_slot->name, true)))
637 : {
638 : bool synced;
639 :
640 50 : SpinLockAcquire(&slot->mutex);
641 50 : synced = slot->data.synced;
642 50 : SpinLockRelease(&slot->mutex);
643 :
644 : /* User-created slot with the same name exists, raise ERROR. */
645 50 : if (!synced)
646 0 : ereport(ERROR,
647 : errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
648 : errmsg("exiting from slot synchronization because same"
649 : " name slot \"%s\" already exists on the standby",
650 : remote_slot->name));
651 :
652 : /*
653 : * The slot has been synchronized before.
654 : *
655 : * It is important to acquire the slot here before checking
656 : * invalidation. If we don't acquire the slot first, there could be a
657 : * race condition that the local slot could be invalidated just after
658 : * checking the 'invalidated' flag here and we could end up
659 : * overwriting 'invalidated' flag to remote_slot's value. See
660 : * InvalidatePossiblyObsoleteSlot() where it invalidates slot directly
661 : * if the slot is not acquired by other processes.
662 : *
663 : * XXX: If it ever turns out that slot acquire/release is costly for
664 : * cases when none of the slot properties is changed then we can do a
665 : * pre-check to ensure that at least one of the slot properties is
666 : * changed before acquiring the slot.
667 : */
668 50 : ReplicationSlotAcquire(remote_slot->name, true);
669 :
670 : Assert(slot == MyReplicationSlot);
671 :
672 : /*
673 : * Copy the invalidation cause from remote only if local slot is not
674 : * invalidated locally, we don't want to overwrite existing one.
675 : */
676 50 : if (slot->data.invalidated == RS_INVAL_NONE &&
677 50 : remote_slot->invalidated != RS_INVAL_NONE)
678 : {
679 0 : SpinLockAcquire(&slot->mutex);
680 0 : slot->data.invalidated = remote_slot->invalidated;
681 0 : SpinLockRelease(&slot->mutex);
682 :
683 : /* Make sure the invalidated state persists across server restart */
684 0 : ReplicationSlotMarkDirty();
685 0 : ReplicationSlotSave();
686 :
687 0 : slot_updated = true;
688 : }
689 :
690 : /* Skip the sync of an invalidated slot */
691 50 : if (slot->data.invalidated != RS_INVAL_NONE)
692 : {
693 0 : ReplicationSlotRelease();
694 0 : return slot_updated;
695 : }
696 :
697 : /* Slot not ready yet, let's attempt to make it sync-ready now. */
698 50 : if (slot->data.persistency == RS_TEMPORARY)
699 : {
700 0 : slot_updated = update_and_persist_local_synced_slot(remote_slot,
701 : remote_dbid);
702 : }
703 :
704 : /* Slot ready for sync, so sync it. */
705 : else
706 : {
707 : /*
708 : * Sanity check: As long as the invalidations are handled
709 : * appropriately as above, this should never happen.
710 : *
711 : * We don't need to check restart_lsn here. See the comments in
712 : * update_local_synced_slot() for details.
713 : */
714 50 : if (remote_slot->confirmed_lsn < slot->data.confirmed_flush)
715 0 : ereport(ERROR,
716 : errmsg_internal("cannot synchronize local slot \"%s\"",
717 : remote_slot->name),
718 : errdetail_internal("Local slot's start streaming location LSN(%X/%X) is ahead of remote slot's LSN(%X/%X).",
719 : LSN_FORMAT_ARGS(slot->data.confirmed_flush),
720 : LSN_FORMAT_ARGS(remote_slot->confirmed_lsn)));
721 :
722 50 : slot_updated = update_local_synced_slot(remote_slot, remote_dbid,
723 : NULL, NULL);
724 : }
725 : }
726 : /* Otherwise create the slot first. */
727 : else
728 : {
729 : NameData plugin_name;
730 10 : TransactionId xmin_horizon = InvalidTransactionId;
731 :
732 : /* Skip creating the local slot if remote_slot is invalidated already */
733 10 : if (remote_slot->invalidated != RS_INVAL_NONE)
734 0 : return false;
735 :
736 : /*
737 : * We create temporary slots instead of ephemeral slots here because
738 : * we want the slots to survive after releasing them. This is done to
739 : * avoid dropping and re-creating the slots in each synchronization
740 : * cycle if the restart_lsn or catalog_xmin of the remote slot has not
741 : * caught up.
742 : */
743 10 : ReplicationSlotCreate(remote_slot->name, true, RS_TEMPORARY,
744 10 : remote_slot->two_phase,
745 10 : remote_slot->failover,
746 : true);
747 :
748 : /* For shorter lines. */
749 10 : slot = MyReplicationSlot;
750 :
751 : /* Avoid expensive operations while holding a spinlock. */
752 10 : namestrcpy(&plugin_name, remote_slot->plugin);
753 :
754 10 : SpinLockAcquire(&slot->mutex);
755 10 : slot->data.database = remote_dbid;
756 10 : slot->data.plugin = plugin_name;
757 10 : SpinLockRelease(&slot->mutex);
758 :
759 10 : reserve_wal_for_local_slot(remote_slot->restart_lsn);
760 :
761 10 : LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
762 10 : xmin_horizon = GetOldestSafeDecodingTransactionId(true);
763 10 : SpinLockAcquire(&slot->mutex);
764 10 : slot->effective_catalog_xmin = xmin_horizon;
765 10 : slot->data.catalog_xmin = xmin_horizon;
766 10 : SpinLockRelease(&slot->mutex);
767 10 : ReplicationSlotsComputeRequiredXmin(true);
768 10 : LWLockRelease(ProcArrayLock);
769 :
770 10 : update_and_persist_local_synced_slot(remote_slot, remote_dbid);
771 :
772 10 : slot_updated = true;
773 : }
774 :
775 60 : ReplicationSlotRelease();
776 :
777 60 : return slot_updated;
778 : }
779 :
780 : /*
781 : * Synchronize slots.
782 : *
783 : * Gets the failover logical slots info from the primary server and updates
784 : * the slots locally. Creates the slots if not present on the standby.
785 : *
786 : * Returns TRUE if any of the slots gets updated in this sync-cycle.
787 : */
788 : static bool
789 34 : synchronize_slots(WalReceiverConn *wrconn)
790 : {
791 : #define SLOTSYNC_COLUMN_COUNT 9
792 34 : Oid slotRow[SLOTSYNC_COLUMN_COUNT] = {TEXTOID, TEXTOID, LSNOID,
793 : LSNOID, XIDOID, BOOLOID, BOOLOID, TEXTOID, TEXTOID};
794 :
795 : WalRcvExecResult *res;
796 : TupleTableSlot *tupslot;
797 34 : List *remote_slot_list = NIL;
798 34 : bool some_slot_updated = false;
799 34 : bool started_tx = false;
800 34 : const char *query = "SELECT slot_name, plugin, confirmed_flush_lsn,"
801 : " restart_lsn, catalog_xmin, two_phase, failover,"
802 : " database, invalidation_reason"
803 : " FROM pg_catalog.pg_replication_slots"
804 : " WHERE failover and NOT temporary";
805 :
806 : /* The syscache access in walrcv_exec() needs a transaction env. */
807 34 : if (!IsTransactionState())
808 : {
809 20 : StartTransactionCommand();
810 20 : started_tx = true;
811 : }
812 :
813 : /* Execute the query */
814 34 : res = walrcv_exec(wrconn, query, SLOTSYNC_COLUMN_COUNT, slotRow);
815 34 : if (res->status != WALRCV_OK_TUPLES)
816 0 : ereport(ERROR,
817 : errmsg("could not fetch failover logical slots info from the primary server: %s",
818 : res->err));
819 :
820 : /* Construct the remote_slot tuple and synchronize each slot locally */
821 34 : tupslot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple);
822 96 : while (tuplestore_gettupleslot(res->tuplestore, true, false, tupslot))
823 : {
824 : bool isnull;
825 62 : RemoteSlot *remote_slot = palloc0(sizeof(RemoteSlot));
826 : Datum d;
827 62 : int col = 0;
828 :
829 62 : remote_slot->name = TextDatumGetCString(slot_getattr(tupslot, ++col,
830 : &isnull));
831 : Assert(!isnull);
832 :
833 62 : remote_slot->plugin = TextDatumGetCString(slot_getattr(tupslot, ++col,
834 : &isnull));
835 : Assert(!isnull);
836 :
837 : /*
838 : * It is possible to get null values for LSN and Xmin if slot is
839 : * invalidated on the primary server, so handle accordingly.
840 : */
841 62 : d = slot_getattr(tupslot, ++col, &isnull);
842 62 : remote_slot->confirmed_lsn = isnull ? InvalidXLogRecPtr :
843 62 : DatumGetLSN(d);
844 :
845 62 : d = slot_getattr(tupslot, ++col, &isnull);
846 62 : remote_slot->restart_lsn = isnull ? InvalidXLogRecPtr : DatumGetLSN(d);
847 :
848 62 : d = slot_getattr(tupslot, ++col, &isnull);
849 62 : remote_slot->catalog_xmin = isnull ? InvalidTransactionId :
850 62 : DatumGetTransactionId(d);
851 :
852 62 : remote_slot->two_phase = DatumGetBool(slot_getattr(tupslot, ++col,
853 : &isnull));
854 : Assert(!isnull);
855 :
856 62 : remote_slot->failover = DatumGetBool(slot_getattr(tupslot, ++col,
857 : &isnull));
858 : Assert(!isnull);
859 :
860 62 : remote_slot->database = TextDatumGetCString(slot_getattr(tupslot,
861 : ++col, &isnull));
862 : Assert(!isnull);
863 :
864 62 : d = slot_getattr(tupslot, ++col, &isnull);
865 62 : remote_slot->invalidated = isnull ? RS_INVAL_NONE :
866 0 : GetSlotInvalidationCause(TextDatumGetCString(d));
867 :
868 : /* Sanity check */
869 : Assert(col == SLOTSYNC_COLUMN_COUNT);
870 :
871 : /*
872 : * If restart_lsn, confirmed_lsn or catalog_xmin is invalid but the
873 : * slot is valid, that means we have fetched the remote_slot in its
874 : * RS_EPHEMERAL state. In such a case, don't sync it; we can always
875 : * sync it in the next sync cycle when the remote_slot is persisted
876 : * and has valid lsn(s) and xmin values.
877 : *
878 : * XXX: In future, if we plan to expose 'slot->data.persistency' in
879 : * pg_replication_slots view, then we can avoid fetching RS_EPHEMERAL
880 : * slots in the first place.
881 : */
882 62 : if ((XLogRecPtrIsInvalid(remote_slot->restart_lsn) ||
883 62 : XLogRecPtrIsInvalid(remote_slot->confirmed_lsn) ||
884 62 : !TransactionIdIsValid(remote_slot->catalog_xmin)) &&
885 0 : remote_slot->invalidated == RS_INVAL_NONE)
886 0 : pfree(remote_slot);
887 : else
888 : /* Create list of remote slots */
889 62 : remote_slot_list = lappend(remote_slot_list, remote_slot);
890 :
891 62 : ExecClearTuple(tupslot);
892 : }
893 :
894 : /* Drop local slots that no longer need to be synced. */
895 34 : drop_local_obsolete_slots(remote_slot_list);
896 :
897 : /* Now sync the slots locally */
898 130 : foreach_ptr(RemoteSlot, remote_slot, remote_slot_list)
899 : {
900 62 : Oid remote_dbid = get_database_oid(remote_slot->database, false);
901 :
902 : /*
903 : * Use shared lock to prevent a conflict with
904 : * ReplicationSlotsDropDBSlots(), trying to drop the same slot during
905 : * a drop-database operation.
906 : */
907 62 : LockSharedObject(DatabaseRelationId, remote_dbid, 0, AccessShareLock);
908 :
909 62 : some_slot_updated |= synchronize_one_slot(remote_slot, remote_dbid);
910 :
911 62 : UnlockSharedObject(DatabaseRelationId, remote_dbid, 0, AccessShareLock);
912 : }
913 :
914 : /* We are done, free remote_slot_list elements */
915 34 : list_free_deep(remote_slot_list);
916 :
917 34 : walrcv_clear_result(res);
918 :
919 34 : if (started_tx)
920 20 : CommitTransactionCommand();
921 :
922 34 : return some_slot_updated;
923 : }
924 :
925 : /*
926 : * Checks the remote server info.
927 : *
928 : * We ensure that the 'primary_slot_name' exists on the remote server and the
929 : * remote server is not a standby node.
930 : */
931 : static void
932 24 : validate_remote_info(WalReceiverConn *wrconn)
933 : {
934 : #define PRIMARY_INFO_OUTPUT_COL_COUNT 2
935 : WalRcvExecResult *res;
936 24 : Oid slotRow[PRIMARY_INFO_OUTPUT_COL_COUNT] = {BOOLOID, BOOLOID};
937 : StringInfoData cmd;
938 : bool isnull;
939 : TupleTableSlot *tupslot;
940 : bool remote_in_recovery;
941 : bool primary_slot_valid;
942 24 : bool started_tx = false;
943 :
944 24 : initStringInfo(&cmd);
945 24 : appendStringInfo(&cmd,
946 : "SELECT pg_is_in_recovery(), count(*) = 1"
947 : " FROM pg_catalog.pg_replication_slots"
948 : " WHERE slot_type='physical' AND slot_name=%s",
949 : quote_literal_cstr(PrimarySlotName));
950 :
951 : /* The syscache access in walrcv_exec() needs a transaction env. */
952 24 : if (!IsTransactionState())
953 : {
954 8 : StartTransactionCommand();
955 8 : started_tx = true;
956 : }
957 :
958 24 : res = walrcv_exec(wrconn, cmd.data, PRIMARY_INFO_OUTPUT_COL_COUNT, slotRow);
959 24 : pfree(cmd.data);
960 :
961 24 : if (res->status != WALRCV_OK_TUPLES)
962 0 : ereport(ERROR,
963 : errmsg("could not fetch primary slot name \"%s\" info from the primary server: %s",
964 : PrimarySlotName, res->err),
965 : errhint("Check if \"primary_slot_name\" is configured correctly."));
966 :
967 24 : tupslot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple);
968 24 : if (!tuplestore_gettupleslot(res->tuplestore, true, false, tupslot))
969 0 : elog(ERROR,
970 : "failed to fetch tuple for the primary server slot specified by \"primary_slot_name\"");
971 :
972 24 : remote_in_recovery = DatumGetBool(slot_getattr(tupslot, 1, &isnull));
973 : Assert(!isnull);
974 :
975 : /*
976 : * Slot sync is currently not supported on a cascading standby. This is
977 : * because if we allow it, the primary server needs to wait for all the
978 : * cascading standbys, otherwise, logical subscribers can still be ahead
979 : * of one of the cascading standbys which we plan to promote. Thus, to
980 : * avoid this additional complexity, we restrict it for the time being.
981 : */
982 24 : if (remote_in_recovery)
983 2 : ereport(ERROR,
984 : errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
985 : errmsg("cannot synchronize replication slots from a standby server"));
986 :
987 22 : primary_slot_valid = DatumGetBool(slot_getattr(tupslot, 2, &isnull));
988 : Assert(!isnull);
989 :
990 22 : if (!primary_slot_valid)
991 0 : ereport(ERROR,
992 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
993 : /* translator: second %s is a GUC variable name */
994 : errmsg("replication slot \"%s\" specified by \"%s\" does not exist on primary server",
995 : PrimarySlotName, "primary_slot_name"));
996 :
997 22 : ExecClearTuple(tupslot);
998 22 : walrcv_clear_result(res);
999 :
1000 22 : if (started_tx)
1001 8 : CommitTransactionCommand();
1002 22 : }
1003 :
1004 : /*
1005 : * Checks if dbname is specified in 'primary_conninfo'.
1006 : *
1007 : * Error out if not specified otherwise return it.
1008 : */
1009 : char *
1010 26 : CheckAndGetDbnameFromConninfo(void)
1011 : {
1012 : char *dbname;
1013 :
1014 : /*
1015 : * The slot synchronization needs a database connection for walrcv_exec to
1016 : * work.
1017 : */
1018 26 : dbname = walrcv_get_dbname_from_conninfo(PrimaryConnInfo);
1019 26 : if (dbname == NULL)
1020 2 : ereport(ERROR,
1021 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1022 :
1023 : /*
1024 : * translator: first %s is a connection option; second %s is a GUC
1025 : * variable name
1026 : */
1027 : errmsg("replication slot synchronization requires \"%s\" to be specified in \"%s\"",
1028 : "dbname", "primary_conninfo"));
1029 24 : return dbname;
1030 : }
1031 :
1032 : /*
1033 : * Return true if all necessary GUCs for slot synchronization are set
1034 : * appropriately, otherwise, return false.
1035 : */
1036 : bool
1037 30 : ValidateSlotSyncParams(int elevel)
1038 : {
1039 : /*
1040 : * Logical slot sync/creation requires wal_level >= logical.
1041 : *
1042 : * Since altering the wal_level requires a server restart, so error out in
1043 : * this case regardless of elevel provided by caller.
1044 : */
1045 30 : if (wal_level < WAL_LEVEL_LOGICAL)
1046 0 : ereport(ERROR,
1047 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1048 : errmsg("replication slot synchronization requires \"wal_level\" >= \"logical\""));
1049 :
1050 : /*
1051 : * A physical replication slot(primary_slot_name) is required on the
1052 : * primary to ensure that the rows needed by the standby are not removed
1053 : * after restarting, so that the synchronized slot on the standby will not
1054 : * be invalidated.
1055 : */
1056 30 : if (PrimarySlotName == NULL || *PrimarySlotName == '\0')
1057 : {
1058 0 : ereport(elevel,
1059 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1060 : /* translator: %s is a GUC variable name */
1061 : errmsg("replication slot synchronization requires \"%s\" to be set", "primary_slot_name"));
1062 0 : return false;
1063 : }
1064 :
1065 : /*
1066 : * hot_standby_feedback must be enabled to cooperate with the physical
1067 : * replication slot, which allows informing the primary about the xmin and
1068 : * catalog_xmin values on the standby.
1069 : */
1070 30 : if (!hot_standby_feedback)
1071 : {
1072 2 : ereport(elevel,
1073 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1074 : /* translator: %s is a GUC variable name */
1075 : errmsg("replication slot synchronization requires \"%s\" to be enabled",
1076 : "hot_standby_feedback"));
1077 2 : return false;
1078 : }
1079 :
1080 : /*
1081 : * The primary_conninfo is required to make connection to primary for
1082 : * getting slots information.
1083 : */
1084 28 : if (PrimaryConnInfo == NULL || *PrimaryConnInfo == '\0')
1085 : {
1086 0 : ereport(elevel,
1087 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1088 : /* translator: %s is a GUC variable name */
1089 : errmsg("replication slot synchronization requires \"%s\" to be set",
1090 : "primary_conninfo"));
1091 0 : return false;
1092 : }
1093 :
1094 28 : return true;
1095 : }
1096 :
1097 : /*
1098 : * Re-read the config file.
1099 : *
1100 : * Exit if any of the slot sync GUCs have changed. The postmaster will
1101 : * restart it.
1102 : */
1103 : static void
1104 2 : slotsync_reread_config(void)
1105 : {
1106 2 : char *old_primary_conninfo = pstrdup(PrimaryConnInfo);
1107 2 : char *old_primary_slotname = pstrdup(PrimarySlotName);
1108 2 : bool old_sync_replication_slots = sync_replication_slots;
1109 2 : bool old_hot_standby_feedback = hot_standby_feedback;
1110 : bool conninfo_changed;
1111 : bool primary_slotname_changed;
1112 :
1113 : Assert(sync_replication_slots);
1114 :
1115 2 : ConfigReloadPending = false;
1116 2 : ProcessConfigFile(PGC_SIGHUP);
1117 :
1118 2 : conninfo_changed = strcmp(old_primary_conninfo, PrimaryConnInfo) != 0;
1119 2 : primary_slotname_changed = strcmp(old_primary_slotname, PrimarySlotName) != 0;
1120 2 : pfree(old_primary_conninfo);
1121 2 : pfree(old_primary_slotname);
1122 :
1123 2 : if (old_sync_replication_slots != sync_replication_slots)
1124 : {
1125 0 : ereport(LOG,
1126 : /* translator: %s is a GUC variable name */
1127 : errmsg("replication slot synchronization worker will shut down because \"%s\" is disabled", "sync_replication_slots"));
1128 0 : proc_exit(0);
1129 : }
1130 :
1131 2 : if (conninfo_changed ||
1132 2 : primary_slotname_changed ||
1133 2 : (old_hot_standby_feedback != hot_standby_feedback))
1134 : {
1135 2 : ereport(LOG,
1136 : errmsg("replication slot synchronization worker will restart because of a parameter change"));
1137 :
1138 : /*
1139 : * Reset the last-start time for this worker so that the postmaster
1140 : * can restart it without waiting for SLOTSYNC_RESTART_INTERVAL_SEC.
1141 : */
1142 2 : SlotSyncCtx->last_start_time = 0;
1143 :
1144 2 : proc_exit(0);
1145 : }
1146 :
1147 0 : }
1148 :
1149 : /*
1150 : * Interrupt handler for main loop of slot sync worker.
1151 : */
1152 : static void
1153 28 : ProcessSlotSyncInterrupts(WalReceiverConn *wrconn)
1154 : {
1155 28 : CHECK_FOR_INTERRUPTS();
1156 :
1157 24 : if (ShutdownRequestPending)
1158 : {
1159 2 : ereport(LOG,
1160 : errmsg("replication slot synchronization worker is shutting down on receiving SIGINT"));
1161 :
1162 2 : proc_exit(0);
1163 : }
1164 :
1165 22 : if (ConfigReloadPending)
1166 2 : slotsync_reread_config();
1167 20 : }
1168 :
1169 : /*
1170 : * Connection cleanup function for slotsync worker.
1171 : *
1172 : * Called on slotsync worker exit.
1173 : */
1174 : static void
1175 8 : slotsync_worker_disconnect(int code, Datum arg)
1176 : {
1177 8 : WalReceiverConn *wrconn = (WalReceiverConn *) DatumGetPointer(arg);
1178 :
1179 8 : walrcv_disconnect(wrconn);
1180 8 : }
1181 :
1182 : /*
1183 : * Cleanup function for slotsync worker.
1184 : *
1185 : * Called on slotsync worker exit.
1186 : */
1187 : static void
1188 8 : slotsync_worker_onexit(int code, Datum arg)
1189 : {
1190 : /*
1191 : * We need to do slots cleanup here just like WalSndErrorCleanup() does.
1192 : *
1193 : * The startup process during promotion invokes ShutDownSlotSync() which
1194 : * waits for slot sync to finish and it does that by checking the
1195 : * 'syncing' flag. Thus the slot sync worker must be done with slots'
1196 : * release and cleanup to avoid any dangling temporary slots or active
1197 : * slots before it marks itself as finished syncing.
1198 : */
1199 :
1200 : /* Make sure active replication slots are released */
1201 8 : if (MyReplicationSlot != NULL)
1202 0 : ReplicationSlotRelease();
1203 :
1204 : /* Also cleanup the temporary slots. */
1205 8 : ReplicationSlotCleanup(false);
1206 :
1207 8 : SpinLockAcquire(&SlotSyncCtx->mutex);
1208 :
1209 8 : SlotSyncCtx->pid = InvalidPid;
1210 :
1211 : /*
1212 : * If syncing_slots is true, it indicates that the process errored out
1213 : * without resetting the flag. So, we need to clean up shared memory and
1214 : * reset the flag here.
1215 : */
1216 8 : if (syncing_slots)
1217 : {
1218 8 : SlotSyncCtx->syncing = false;
1219 8 : syncing_slots = false;
1220 : }
1221 :
1222 8 : SpinLockRelease(&SlotSyncCtx->mutex);
1223 8 : }
1224 :
1225 : /*
1226 : * Sleep for long enough that we believe it's likely that the slots on primary
1227 : * get updated.
1228 : *
1229 : * If there is no slot activity the wait time between sync-cycles will double
1230 : * (to a maximum of 30s). If there is some slot activity the wait time between
1231 : * sync-cycles is reset to the minimum (200ms).
1232 : */
1233 : static void
1234 20 : wait_for_slot_activity(bool some_slot_updated)
1235 : {
1236 : int rc;
1237 :
1238 20 : if (!some_slot_updated)
1239 : {
1240 : /*
1241 : * No slots were updated, so double the sleep time, but not beyond the
1242 : * maximum allowable value.
1243 : */
1244 12 : sleep_ms = Min(sleep_ms * 2, MAX_SLOTSYNC_WORKER_NAPTIME_MS);
1245 : }
1246 : else
1247 : {
1248 : /*
1249 : * Some slots were updated since the last sleep, so reset the sleep
1250 : * time.
1251 : */
1252 8 : sleep_ms = MIN_SLOTSYNC_WORKER_NAPTIME_MS;
1253 : }
1254 :
1255 20 : rc = WaitLatch(MyLatch,
1256 : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
1257 : sleep_ms,
1258 : WAIT_EVENT_REPLICATION_SLOTSYNC_MAIN);
1259 :
1260 20 : if (rc & WL_LATCH_SET)
1261 8 : ResetLatch(MyLatch);
1262 20 : }
1263 :
1264 : /*
1265 : * Emit an error if a promotion or a concurrent sync call is in progress.
1266 : * Otherwise, advertise that a sync is in progress.
1267 : */
1268 : static void
1269 24 : check_and_set_sync_info(pid_t worker_pid)
1270 : {
1271 24 : SpinLockAcquire(&SlotSyncCtx->mutex);
1272 :
1273 : /* The worker pid must not be already assigned in SlotSyncCtx */
1274 : Assert(worker_pid == InvalidPid || SlotSyncCtx->pid == InvalidPid);
1275 :
1276 : /*
1277 : * Emit an error if startup process signaled the slot sync machinery to
1278 : * stop. See comments atop SlotSyncCtxStruct.
1279 : */
1280 24 : if (SlotSyncCtx->stopSignaled)
1281 : {
1282 0 : SpinLockRelease(&SlotSyncCtx->mutex);
1283 0 : ereport(ERROR,
1284 : errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1285 : errmsg("cannot synchronize replication slots when standby promotion is ongoing"));
1286 : }
1287 :
1288 24 : if (SlotSyncCtx->syncing)
1289 : {
1290 0 : SpinLockRelease(&SlotSyncCtx->mutex);
1291 0 : ereport(ERROR,
1292 : errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1293 : errmsg("cannot synchronize replication slots concurrently"));
1294 : }
1295 :
1296 24 : SlotSyncCtx->syncing = true;
1297 :
1298 : /*
1299 : * Advertise the required PID so that the startup process can kill the
1300 : * slot sync worker on promotion.
1301 : */
1302 24 : SlotSyncCtx->pid = worker_pid;
1303 :
1304 24 : SpinLockRelease(&SlotSyncCtx->mutex);
1305 :
1306 24 : syncing_slots = true;
1307 24 : }
1308 :
1309 : /*
1310 : * Reset syncing flag.
1311 : */
1312 : static void
1313 16 : reset_syncing_flag()
1314 : {
1315 16 : SpinLockAcquire(&SlotSyncCtx->mutex);
1316 16 : SlotSyncCtx->syncing = false;
1317 16 : SpinLockRelease(&SlotSyncCtx->mutex);
1318 :
1319 16 : syncing_slots = false;
1320 16 : };
1321 :
1322 : /*
1323 : * The main loop of our worker process.
1324 : *
1325 : * It connects to the primary server, fetches logical failover slots
1326 : * information periodically in order to create and sync the slots.
1327 : */
1328 : void
1329 8 : ReplSlotSyncWorkerMain(char *startup_data, size_t startup_data_len)
1330 : {
1331 8 : WalReceiverConn *wrconn = NULL;
1332 : char *dbname;
1333 : char *err;
1334 : sigjmp_buf local_sigjmp_buf;
1335 : StringInfoData app_name;
1336 :
1337 : Assert(startup_data_len == 0);
1338 :
1339 8 : MyBackendType = B_SLOTSYNC_WORKER;
1340 :
1341 8 : init_ps_display(NULL);
1342 :
1343 : Assert(GetProcessingMode() == InitProcessing);
1344 :
1345 : /*
1346 : * Create a per-backend PGPROC struct in shared memory. We must do this
1347 : * before we access any shared memory.
1348 : */
1349 8 : InitProcess();
1350 :
1351 : /*
1352 : * Early initialization.
1353 : */
1354 8 : BaseInit();
1355 :
1356 : Assert(SlotSyncCtx != NULL);
1357 :
1358 : /*
1359 : * If an exception is encountered, processing resumes here.
1360 : *
1361 : * We just need to clean up, report the error, and go away.
1362 : *
1363 : * If we do not have this handling here, then since this worker process
1364 : * operates at the bottom of the exception stack, ERRORs turn into FATALs.
1365 : * Therefore, we create our own exception handler to catch ERRORs.
1366 : */
1367 8 : if (sigsetjmp(local_sigjmp_buf, 1) != 0)
1368 : {
1369 : /* since not using PG_TRY, must reset error stack by hand */
1370 0 : error_context_stack = NULL;
1371 :
1372 : /* Prevents interrupts while cleaning up */
1373 0 : HOLD_INTERRUPTS();
1374 :
1375 : /* Report the error to the server log */
1376 0 : EmitErrorReport();
1377 :
1378 : /*
1379 : * We can now go away. Note that because we called InitProcess, a
1380 : * callback was registered to do ProcKill, which will clean up
1381 : * necessary state.
1382 : */
1383 0 : proc_exit(0);
1384 : }
1385 :
1386 : /* We can now handle ereport(ERROR) */
1387 8 : PG_exception_stack = &local_sigjmp_buf;
1388 :
1389 : /* Setup signal handling */
1390 8 : pqsignal(SIGHUP, SignalHandlerForConfigReload);
1391 8 : pqsignal(SIGINT, SignalHandlerForShutdownRequest);
1392 8 : pqsignal(SIGTERM, die);
1393 8 : pqsignal(SIGFPE, FloatExceptionHandler);
1394 8 : pqsignal(SIGUSR1, procsignal_sigusr1_handler);
1395 8 : pqsignal(SIGUSR2, SIG_IGN);
1396 8 : pqsignal(SIGPIPE, SIG_IGN);
1397 8 : pqsignal(SIGCHLD, SIG_DFL);
1398 :
1399 8 : check_and_set_sync_info(MyProcPid);
1400 :
1401 8 : ereport(LOG, errmsg("slot sync worker started"));
1402 :
1403 : /* Register it as soon as SlotSyncCtx->pid is initialized. */
1404 8 : before_shmem_exit(slotsync_worker_onexit, (Datum) 0);
1405 :
1406 : /*
1407 : * Establishes SIGALRM handler and initialize timeout module. It is needed
1408 : * by InitPostgres to register different timeouts.
1409 : */
1410 8 : InitializeTimeouts();
1411 :
1412 : /* Load the libpq-specific functions */
1413 8 : load_file("libpqwalreceiver", false);
1414 :
1415 : /*
1416 : * Unblock signals (they were blocked when the postmaster forked us)
1417 : */
1418 8 : sigprocmask(SIG_SETMASK, &UnBlockSig, NULL);
1419 :
1420 : /*
1421 : * Set always-secure search path, so malicious users can't redirect user
1422 : * code (e.g. operators).
1423 : *
1424 : * It's not strictly necessary since we won't be scanning or writing to
1425 : * any user table locally, but it's good to retain it here for added
1426 : * precaution.
1427 : */
1428 8 : SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE);
1429 :
1430 8 : dbname = CheckAndGetDbnameFromConninfo();
1431 :
1432 : /*
1433 : * Connect to the database specified by the user in primary_conninfo. We
1434 : * need a database connection for walrcv_exec to work which we use to
1435 : * fetch slot information from the remote node. See comments atop
1436 : * libpqrcv_exec.
1437 : *
1438 : * We do not specify a specific user here since the slot sync worker will
1439 : * operate as a superuser. This is safe because the slot sync worker does
1440 : * not interact with user tables, eliminating the risk of executing
1441 : * arbitrary code within triggers.
1442 : */
1443 8 : InitPostgres(dbname, InvalidOid, NULL, InvalidOid, 0, NULL);
1444 :
1445 8 : SetProcessingMode(NormalProcessing);
1446 :
1447 8 : initStringInfo(&app_name);
1448 8 : if (cluster_name[0])
1449 8 : appendStringInfo(&app_name, "%s_%s", cluster_name, "slotsync worker");
1450 : else
1451 0 : appendStringInfoString(&app_name, "slotsync worker");
1452 :
1453 : /*
1454 : * Establish the connection to the primary server for slot
1455 : * synchronization.
1456 : */
1457 8 : wrconn = walrcv_connect(PrimaryConnInfo, false, false, false,
1458 : app_name.data, &err);
1459 8 : pfree(app_name.data);
1460 :
1461 8 : if (!wrconn)
1462 0 : ereport(ERROR,
1463 : errcode(ERRCODE_CONNECTION_FAILURE),
1464 : errmsg("synchronization worker \"%s\" could not connect to the primary server: %s",
1465 : app_name.data, err));
1466 :
1467 : /*
1468 : * Register the disconnection callback.
1469 : *
1470 : * XXX: This can be combined with previous cleanup registration of
1471 : * slotsync_worker_onexit() but that will need the connection to be made
1472 : * global and we want to avoid introducing global for this purpose.
1473 : */
1474 8 : before_shmem_exit(slotsync_worker_disconnect, PointerGetDatum(wrconn));
1475 :
1476 : /*
1477 : * Using the specified primary server connection, check that we are not a
1478 : * cascading standby and slot configured in 'primary_slot_name' exists on
1479 : * the primary server.
1480 : */
1481 8 : validate_remote_info(wrconn);
1482 :
1483 : /* Main loop to synchronize slots */
1484 : for (;;)
1485 20 : {
1486 28 : bool some_slot_updated = false;
1487 :
1488 28 : ProcessSlotSyncInterrupts(wrconn);
1489 :
1490 20 : some_slot_updated = synchronize_slots(wrconn);
1491 :
1492 20 : wait_for_slot_activity(some_slot_updated);
1493 : }
1494 :
1495 : /*
1496 : * The slot sync worker can't get here because it will only stop when it
1497 : * receives a SIGINT from the startup process, or when there is an error.
1498 : */
1499 : Assert(false);
1500 : }
1501 :
1502 : /*
1503 : * Update the inactive_since property for synced slots.
1504 : *
1505 : * Note that this function is currently called when we shutdown the slot
1506 : * sync machinery.
1507 : */
1508 : static void
1509 1544 : update_synced_slots_inactive_since(void)
1510 : {
1511 1544 : TimestampTz now = 0;
1512 :
1513 : /*
1514 : * We need to update inactive_since only when we are promoting standby to
1515 : * correctly interpret the inactive_since if the standby gets promoted
1516 : * without a restart. We don't want the slots to appear inactive for a
1517 : * long time after promotion if they haven't been synchronized recently.
1518 : * Whoever acquires the slot, i.e., makes the slot active, will reset it.
1519 : */
1520 1544 : if (!StandbyMode)
1521 1454 : return;
1522 :
1523 : /* The slot sync worker or SQL function mustn't be running by now */
1524 : Assert((SlotSyncCtx->pid == InvalidPid) && !SlotSyncCtx->syncing);
1525 :
1526 90 : LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
1527 :
1528 966 : for (int i = 0; i < max_replication_slots; i++)
1529 : {
1530 876 : ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
1531 :
1532 : /* Check if it is a synchronized slot */
1533 876 : if (s->in_use && s->data.synced)
1534 : {
1535 : Assert(SlotIsLogical(s));
1536 :
1537 : /* The slot must not be acquired by any process */
1538 : Assert(s->active_pid == 0);
1539 :
1540 : /* Use the same inactive_since time for all the slots. */
1541 6 : if (now == 0)
1542 4 : now = GetCurrentTimestamp();
1543 :
1544 6 : SpinLockAcquire(&s->mutex);
1545 6 : s->inactive_since = now;
1546 6 : SpinLockRelease(&s->mutex);
1547 : }
1548 : }
1549 :
1550 90 : LWLockRelease(ReplicationSlotControlLock);
1551 : }
1552 :
1553 : /*
1554 : * Shut down the slot sync worker.
1555 : *
1556 : * This function sends signal to shutdown slot sync worker, if required. It
1557 : * also waits till the slot sync worker has exited or
1558 : * pg_sync_replication_slots() has finished.
1559 : */
1560 : void
1561 1544 : ShutDownSlotSync(void)
1562 : {
1563 : pid_t worker_pid;
1564 :
1565 1544 : SpinLockAcquire(&SlotSyncCtx->mutex);
1566 :
1567 1544 : SlotSyncCtx->stopSignaled = true;
1568 :
1569 : /*
1570 : * Return if neither the slot sync worker is running nor the function
1571 : * pg_sync_replication_slots() is executing.
1572 : */
1573 1544 : if (!SlotSyncCtx->syncing)
1574 : {
1575 1542 : SpinLockRelease(&SlotSyncCtx->mutex);
1576 1542 : update_synced_slots_inactive_since();
1577 1542 : return;
1578 : }
1579 :
1580 2 : worker_pid = SlotSyncCtx->pid;
1581 :
1582 2 : SpinLockRelease(&SlotSyncCtx->mutex);
1583 :
1584 2 : if (worker_pid != InvalidPid)
1585 2 : kill(worker_pid, SIGINT);
1586 :
1587 : /* Wait for slot sync to end */
1588 : for (;;)
1589 0 : {
1590 : int rc;
1591 :
1592 : /* Wait a bit, we don't expect to have to wait long */
1593 2 : rc = WaitLatch(MyLatch,
1594 : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
1595 : 10L, WAIT_EVENT_REPLICATION_SLOTSYNC_SHUTDOWN);
1596 :
1597 2 : if (rc & WL_LATCH_SET)
1598 : {
1599 0 : ResetLatch(MyLatch);
1600 0 : CHECK_FOR_INTERRUPTS();
1601 : }
1602 :
1603 2 : SpinLockAcquire(&SlotSyncCtx->mutex);
1604 :
1605 : /* Ensure that no process is syncing the slots. */
1606 2 : if (!SlotSyncCtx->syncing)
1607 2 : break;
1608 :
1609 0 : SpinLockRelease(&SlotSyncCtx->mutex);
1610 : }
1611 :
1612 2 : SpinLockRelease(&SlotSyncCtx->mutex);
1613 :
1614 2 : update_synced_slots_inactive_since();
1615 : }
1616 :
1617 : /*
1618 : * SlotSyncWorkerCanRestart
1619 : *
1620 : * Returns true if enough time (SLOTSYNC_RESTART_INTERVAL_SEC) has passed
1621 : * since it was launched last. Otherwise returns false.
1622 : *
1623 : * This is a safety valve to protect against continuous respawn attempts if the
1624 : * worker is dying immediately at launch. Note that since we will retry to
1625 : * launch the worker from the postmaster main loop, we will get another
1626 : * chance later.
1627 : */
1628 : bool
1629 10 : SlotSyncWorkerCanRestart(void)
1630 : {
1631 10 : time_t curtime = time(NULL);
1632 :
1633 : /* Return false if too soon since last start. */
1634 10 : if ((unsigned int) (curtime - SlotSyncCtx->last_start_time) <
1635 : (unsigned int) SLOTSYNC_RESTART_INTERVAL_SEC)
1636 2 : return false;
1637 :
1638 8 : SlotSyncCtx->last_start_time = curtime;
1639 :
1640 8 : return true;
1641 : }
1642 :
1643 : /*
1644 : * Is current process syncing replication slots?
1645 : *
1646 : * Could be either backend executing SQL function or slot sync worker.
1647 : */
1648 : bool
1649 40 : IsSyncingReplicationSlots(void)
1650 : {
1651 40 : return syncing_slots;
1652 : }
1653 :
1654 : /*
1655 : * Amount of shared memory required for slot synchronization.
1656 : */
1657 : Size
1658 5484 : SlotSyncShmemSize(void)
1659 : {
1660 5484 : return sizeof(SlotSyncCtxStruct);
1661 : }
1662 :
1663 : /*
1664 : * Allocate and initialize the shared memory of slot synchronization.
1665 : */
1666 : void
1667 1918 : SlotSyncShmemInit(void)
1668 : {
1669 1918 : Size size = SlotSyncShmemSize();
1670 : bool found;
1671 :
1672 1918 : SlotSyncCtx = (SlotSyncCtxStruct *)
1673 1918 : ShmemInitStruct("Slot Sync Data", size, &found);
1674 :
1675 1918 : if (!found)
1676 : {
1677 1918 : memset(SlotSyncCtx, 0, size);
1678 1918 : SlotSyncCtx->pid = InvalidPid;
1679 1918 : SpinLockInit(&SlotSyncCtx->mutex);
1680 : }
1681 1918 : }
1682 :
1683 : /*
1684 : * Error cleanup callback for slot sync SQL function.
1685 : */
1686 : static void
1687 2 : slotsync_failure_callback(int code, Datum arg)
1688 : {
1689 2 : WalReceiverConn *wrconn = (WalReceiverConn *) DatumGetPointer(arg);
1690 :
1691 : /*
1692 : * We need to do slots cleanup here just like WalSndErrorCleanup() does.
1693 : *
1694 : * The startup process during promotion invokes ShutDownSlotSync() which
1695 : * waits for slot sync to finish and it does that by checking the
1696 : * 'syncing' flag. Thus the SQL function must be done with slots' release
1697 : * and cleanup to avoid any dangling temporary slots or active slots
1698 : * before it marks itself as finished syncing.
1699 : */
1700 :
1701 : /* Make sure active replication slots are released */
1702 2 : if (MyReplicationSlot != NULL)
1703 0 : ReplicationSlotRelease();
1704 :
1705 : /* Also cleanup the synced temporary slots. */
1706 2 : ReplicationSlotCleanup(true);
1707 :
1708 : /*
1709 : * The set syncing_slots indicates that the process errored out without
1710 : * resetting the flag. So, we need to clean up shared memory and reset the
1711 : * flag here.
1712 : */
1713 2 : if (syncing_slots)
1714 2 : reset_syncing_flag();
1715 :
1716 2 : walrcv_disconnect(wrconn);
1717 2 : }
1718 :
1719 : /*
1720 : * Synchronize the failover enabled replication slots using the specified
1721 : * primary server connection.
1722 : */
1723 : void
1724 16 : SyncReplicationSlots(WalReceiverConn *wrconn)
1725 : {
1726 16 : PG_ENSURE_ERROR_CLEANUP(slotsync_failure_callback, PointerGetDatum(wrconn));
1727 : {
1728 16 : check_and_set_sync_info(InvalidPid);
1729 :
1730 16 : validate_remote_info(wrconn);
1731 :
1732 14 : synchronize_slots(wrconn);
1733 :
1734 : /* Cleanup the synced temporary slots */
1735 14 : ReplicationSlotCleanup(true);
1736 :
1737 : /* We are done with sync, so reset sync flag */
1738 14 : reset_syncing_flag();
1739 : }
1740 16 : PG_END_ENSURE_ERROR_CLEANUP(slotsync_failure_callback, PointerGetDatum(wrconn));
1741 14 : }
|