Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * standby.c
4 : * Misc functions used in Hot Standby mode.
5 : *
6 : * All functions for handling RM_STANDBY_ID, which relate to
7 : * AccessExclusiveLocks and starting snapshots for Hot Standby mode.
8 : * Plus conflict recovery processing.
9 : *
10 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
11 : * Portions Copyright (c) 1994, Regents of the University of California
12 : *
13 : * IDENTIFICATION
14 : * src/backend/storage/ipc/standby.c
15 : *
16 : *-------------------------------------------------------------------------
17 : */
18 : #include "postgres.h"
19 : #include "access/transam.h"
20 : #include "access/twophase.h"
21 : #include "access/xact.h"
22 : #include "access/xloginsert.h"
23 : #include "access/xlogrecovery.h"
24 : #include "access/xlogutils.h"
25 : #include "miscadmin.h"
26 : #include "pgstat.h"
27 : #include "replication/slot.h"
28 : #include "storage/bufmgr.h"
29 : #include "storage/proc.h"
30 : #include "storage/procarray.h"
31 : #include "storage/sinvaladt.h"
32 : #include "storage/standby.h"
33 : #include "utils/hsearch.h"
34 : #include "utils/injection_point.h"
35 : #include "utils/ps_status.h"
36 : #include "utils/timeout.h"
37 : #include "utils/timestamp.h"
38 :
/* User-settable GUC parameters */

/*
 * Maximum time, in milliseconds, that recovery waits before canceling
 * conflicting standby queries.  GetStandbyLimitTime() uses the archive
 * setting when the last WAL was not received by streaming and the streaming
 * setting otherwise; a negative value means "wait forever".
 */
int			max_standby_archive_delay = 30 * 1000;
int			max_standby_streaming_delay = 30 * 1000;

/* If true, recovery conflict waits are eligible to be reported in the log. */
bool		log_recovery_conflict_waits = false;
43 :
44 : /*
45 : * Keep track of all the exclusive locks owned by original transactions.
46 : * For each known exclusive lock, there is a RecoveryLockEntry in the
47 : * RecoveryLockHash hash table. All RecoveryLockEntrys belonging to a
48 : * given XID are chained together so that we can find them easily.
49 : * For each original transaction that is known to have any such locks,
50 : * there is a RecoveryLockXidEntry in the RecoveryLockXidHash hash table,
51 : * which stores the head of the chain of its locks.
52 : */
/* Entry type of RecoveryLockHash: one tracked lock, keyed by the whole key. */
typedef struct RecoveryLockEntry
{
	xl_standby_lock key;		/* hash key: xid, dbOid, relOid */
	struct RecoveryLockEntry *next; /* chain link: next lock of the same XID */
} RecoveryLockEntry;
58 :
/* Entry type of RecoveryLockXidHash: per-XID head of its lock chain. */
typedef struct RecoveryLockXidEntry
{
	TransactionId xid;			/* hash key -- must be first */
	struct RecoveryLockEntry *head; /* chain head: first lock of this XID */
} RecoveryLockXidEntry;
64 :
/*
 * Lock-tracking hash tables.  Both are NULL until
 * InitRecoveryTransactionEnvironment() creates them, and are reset to NULL by
 * ShutdownRecoveryTransactionEnvironment().
 */
static HTAB *RecoveryLockHash = NULL;
static HTAB *RecoveryLockXidHash = NULL;

/*
 * Flags set by timeout handlers.  They are examined and cleared by the
 * ResolveRecoveryConflictWith{Lock,BufferPin}() functions in this file.
 */
static volatile sig_atomic_t got_standby_deadlock_timeout = false;
static volatile sig_atomic_t got_standby_delay_timeout = false;
static volatile sig_atomic_t got_standby_lock_timeout = false;

/* Prototypes for file-local functions */
static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
												   ProcSignalReason reason,
												   uint32 wait_event_info,
												   bool report_waiting);
static void SendRecoveryConflictWithBufferPin(ProcSignalReason reason);
static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts);
static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);
static const char *get_recovery_conflict_desc(ProcSignalReason reason);
81 :
82 : /*
83 : * InitRecoveryTransactionEnvironment
84 : * Initialize tracking of our primary's in-progress transactions.
85 : *
86 : * We need to issue shared invalidations and hold locks. Holding locks
87 : * means others may want to wait on us, so we need to make a lock table
88 : * vxact entry like a real transaction. We could create and delete
89 : * lock table entries for each transaction but its simpler just to create
90 : * one permanent entry and leave it there all the time. Locks are then
91 : * acquired and released as needed. Yes, this means you can see the
92 : * Startup process in pg_locks once we have run this.
93 : */
94 : void
95 208 : InitRecoveryTransactionEnvironment(void)
96 : {
97 : VirtualTransactionId vxid;
98 : HASHCTL hash_ctl;
99 :
100 : Assert(RecoveryLockHash == NULL); /* don't run this twice */
101 :
102 : /*
103 : * Initialize the hash tables for tracking the locks held by each
104 : * transaction.
105 : */
106 208 : hash_ctl.keysize = sizeof(xl_standby_lock);
107 208 : hash_ctl.entrysize = sizeof(RecoveryLockEntry);
108 208 : RecoveryLockHash = hash_create("RecoveryLockHash",
109 : 64,
110 : &hash_ctl,
111 : HASH_ELEM | HASH_BLOBS);
112 208 : hash_ctl.keysize = sizeof(TransactionId);
113 208 : hash_ctl.entrysize = sizeof(RecoveryLockXidEntry);
114 208 : RecoveryLockXidHash = hash_create("RecoveryLockXidHash",
115 : 64,
116 : &hash_ctl,
117 : HASH_ELEM | HASH_BLOBS);
118 :
119 : /*
120 : * Initialize shared invalidation management for Startup process, being
121 : * careful to register ourselves as a sendOnly process so we don't need to
122 : * read messages, nor will we get signaled when the queue starts filling
123 : * up.
124 : */
125 208 : SharedInvalBackendInit(true);
126 :
127 : /*
128 : * Lock a virtual transaction id for Startup process.
129 : *
130 : * We need to do GetNextLocalTransactionId() because
131 : * SharedInvalBackendInit() leaves localTransactionId invalid and the lock
132 : * manager doesn't like that at all.
133 : *
134 : * Note that we don't need to run XactLockTableInsert() because nobody
135 : * needs to wait on xids. That sounds a little strange, but table locks
136 : * are held by vxids and row level locks are held by xids. All queries
137 : * hold AccessShareLocks so never block while we write or lock new rows.
138 : */
139 208 : MyProc->vxid.procNumber = MyProcNumber;
140 208 : vxid.procNumber = MyProcNumber;
141 208 : vxid.localTransactionId = GetNextLocalTransactionId();
142 208 : VirtualXactLockTableInsert(vxid);
143 :
144 208 : standbyState = STANDBY_INITIALIZED;
145 208 : }
146 :
147 : /*
148 : * ShutdownRecoveryTransactionEnvironment
149 : * Shut down transaction tracking
150 : *
151 : * Prepare to switch from hot standby mode to normal operation. Shut down
152 : * recovery-time transaction tracking.
153 : *
154 : * This must be called even in shutdown of startup process if transaction
155 : * tracking has been initialized. Otherwise some locks the tracked
156 : * transactions were holding will not be released and may interfere with
157 : * the processes still running (but will exit soon later) at the exit of
158 : * startup process.
159 : */
160 : void
161 306 : ShutdownRecoveryTransactionEnvironment(void)
162 : {
163 : /*
164 : * Do nothing if RecoveryLockHash is NULL because that means that
165 : * transaction tracking has not yet been initialized or has already been
166 : * shut down. This makes it safe to have possibly-redundant calls of this
167 : * function during process exit.
168 : */
169 306 : if (RecoveryLockHash == NULL)
170 98 : return;
171 :
172 : /* Mark all tracked in-progress transactions as finished. */
173 208 : ExpireAllKnownAssignedTransactionIds();
174 :
175 : /* Release all locks the tracked transactions were holding */
176 208 : StandbyReleaseAllLocks();
177 :
178 : /* Destroy the lock hash tables. */
179 208 : hash_destroy(RecoveryLockHash);
180 208 : hash_destroy(RecoveryLockXidHash);
181 208 : RecoveryLockHash = NULL;
182 208 : RecoveryLockXidHash = NULL;
183 :
184 : /* Cleanup our VirtualTransaction */
185 208 : VirtualXactLockTableCleanup();
186 : }
187 :
188 :
189 : /*
190 : * -----------------------------------------------------
191 : * Standby wait timers and backend cancel logic
192 : * -----------------------------------------------------
193 : */
194 :
195 : /*
196 : * Determine the cutoff time at which we want to start canceling conflicting
197 : * transactions. Returns zero (a time safely in the past) if we are willing
198 : * to wait forever.
199 : */
200 : static TimestampTz
201 58 : GetStandbyLimitTime(void)
202 : {
203 : TimestampTz rtime;
204 : bool fromStream;
205 :
206 : /*
207 : * The cutoff time is the last WAL data receipt time plus the appropriate
208 : * delay variable. Delay of -1 means wait forever.
209 : */
210 58 : GetXLogReceiptTime(&rtime, &fromStream);
211 58 : if (fromStream)
212 : {
213 58 : if (max_standby_streaming_delay < 0)
214 0 : return 0; /* wait forever */
215 58 : return TimestampTzPlusMilliseconds(rtime, max_standby_streaming_delay);
216 : }
217 : else
218 : {
219 0 : if (max_standby_archive_delay < 0)
220 0 : return 0; /* wait forever */
221 0 : return TimestampTzPlusMilliseconds(rtime, max_standby_archive_delay);
222 : }
223 : }
224 :
/* First sleep length, in microseconds, used when waiting on a vxid */
#define STANDBY_INITIAL_WAIT_US  1000
/*
 * Current sleep length handed to pg_usleep() by WaitExceedsMaxStandbyDelay();
 * that function doubles it after each sleep (capped at 1s), and
 * ResolveRecoveryConflictWithVirtualXIDs() resets it for each vxid.
 */
static int	standbyWait_us = STANDBY_INITIAL_WAIT_US;
227 :
228 : /*
229 : * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs.
230 : * We wait here for a while then return. If we decide we can't wait any
231 : * more then we return true, if we can wait some more return false.
232 : */
233 : static bool
234 30 : WaitExceedsMaxStandbyDelay(uint32 wait_event_info)
235 : {
236 : TimestampTz ltime;
237 :
238 30 : CHECK_FOR_INTERRUPTS();
239 :
240 : /* Are we past the limit time? */
241 30 : ltime = GetStandbyLimitTime();
242 30 : if (ltime && GetCurrentTimestamp() >= ltime)
243 6 : return true;
244 :
245 : /*
246 : * Sleep a bit (this is essential to avoid busy-waiting).
247 : */
248 24 : pgstat_report_wait_start(wait_event_info);
249 24 : pg_usleep(standbyWait_us);
250 24 : pgstat_report_wait_end();
251 :
252 : /*
253 : * Progressively increase the sleep times, but not to more than 1s, since
254 : * pg_usleep isn't interruptible on some platforms.
255 : */
256 24 : standbyWait_us *= 2;
257 24 : if (standbyWait_us > 1000000)
258 0 : standbyWait_us = 1000000;
259 :
260 24 : return false;
261 : }
262 :
263 : /*
264 : * Log the recovery conflict.
265 : *
266 : * wait_start is the timestamp when the caller started to wait.
267 : * now is the timestamp when this function has been called.
268 : * wait_list is the list of virtual transaction ids assigned to
269 : * conflicting processes. still_waiting indicates whether
270 : * the startup process is still waiting for the recovery conflict
271 : * to be resolved or not.
272 : */
273 : void
274 20 : LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start,
275 : TimestampTz now, VirtualTransactionId *wait_list,
276 : bool still_waiting)
277 : {
278 : long secs;
279 : int usecs;
280 : long msecs;
281 : StringInfoData buf;
282 20 : int nprocs = 0;
283 :
284 : /*
285 : * There must be no conflicting processes when the recovery conflict has
286 : * already been resolved.
287 : */
288 : Assert(still_waiting || wait_list == NULL);
289 :
290 20 : TimestampDifference(wait_start, now, &secs, &usecs);
291 20 : msecs = secs * 1000 + usecs / 1000;
292 20 : usecs = usecs % 1000;
293 :
294 20 : if (wait_list)
295 : {
296 : VirtualTransactionId *vxids;
297 :
298 : /* Construct a string of list of the conflicting processes */
299 6 : vxids = wait_list;
300 12 : while (VirtualTransactionIdIsValid(*vxids))
301 : {
302 6 : PGPROC *proc = ProcNumberGetProc(vxids->procNumber);
303 :
304 : /* proc can be NULL if the target backend is not active */
305 6 : if (proc)
306 : {
307 6 : if (nprocs == 0)
308 : {
309 6 : initStringInfo(&buf);
310 6 : appendStringInfo(&buf, "%d", proc->pid);
311 : }
312 : else
313 0 : appendStringInfo(&buf, ", %d", proc->pid);
314 :
315 6 : nprocs++;
316 : }
317 :
318 6 : vxids++;
319 : }
320 : }
321 :
322 : /*
323 : * If wait_list is specified, report the list of PIDs of active
324 : * conflicting backends in a detail message. Note that if all the backends
325 : * in the list are not active, no detail message is logged.
326 : */
327 20 : if (still_waiting)
328 : {
329 10 : ereport(LOG,
330 : errmsg("recovery still waiting after %ld.%03d ms: %s",
331 : msecs, usecs, get_recovery_conflict_desc(reason)),
332 : nprocs > 0 ? errdetail_log_plural("Conflicting process: %s.",
333 : "Conflicting processes: %s.",
334 : nprocs, buf.data) : 0);
335 : }
336 : else
337 : {
338 10 : ereport(LOG,
339 : errmsg("recovery finished waiting after %ld.%03d ms: %s",
340 : msecs, usecs, get_recovery_conflict_desc(reason)));
341 : }
342 :
343 20 : if (nprocs > 0)
344 6 : pfree(buf.data);
345 20 : }
346 :
347 : /*
348 : * This is the main executioner for any query backend that conflicts with
349 : * recovery processing. Judgement has already been passed on it within
350 : * a specific rmgr. Here we just issue the orders to the procs. The procs
351 : * then throw the required error as instructed.
352 : *
353 : * If report_waiting is true, "waiting" is reported in PS display and the
354 : * wait for recovery conflict is reported in the log, if necessary. If
355 : * the caller is responsible for reporting them, report_waiting should be
356 : * false. Otherwise, both the caller and this function report the same
357 : * thing unexpectedly.
358 : */
359 : static void
360 21568 : ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
361 : ProcSignalReason reason, uint32 wait_event_info,
362 : bool report_waiting)
363 : {
364 21568 : TimestampTz waitStart = 0;
365 21568 : bool waiting = false;
366 21568 : bool logged_recovery_conflict = false;
367 :
368 : /* Fast exit, to avoid a kernel call if there's no work to be done. */
369 21568 : if (!VirtualTransactionIdIsValid(*waitlist))
370 21562 : return;
371 :
372 : /* Set the wait start timestamp for reporting */
373 6 : if (report_waiting && (log_recovery_conflict_waits || update_process_title))
374 4 : waitStart = GetCurrentTimestamp();
375 :
376 12 : while (VirtualTransactionIdIsValid(*waitlist))
377 : {
378 : /* reset standbyWait_us for each xact we wait for */
379 6 : standbyWait_us = STANDBY_INITIAL_WAIT_US;
380 :
381 : /* wait until the virtual xid is gone */
382 36 : while (!VirtualXactLock(*waitlist, false))
383 : {
384 : /* Is it time to kill it? */
385 30 : if (WaitExceedsMaxStandbyDelay(wait_event_info))
386 : {
387 : pid_t pid;
388 :
389 : /*
390 : * Now find out who to throw out of the balloon.
391 : */
392 : Assert(VirtualTransactionIdIsValid(*waitlist));
393 6 : pid = CancelVirtualTransaction(*waitlist, reason);
394 :
395 : /*
396 : * Wait a little bit for it to die so that we avoid flooding
397 : * an unresponsive backend when system is heavily loaded.
398 : */
399 6 : if (pid != 0)
400 6 : pg_usleep(5000L);
401 : }
402 :
403 30 : if (waitStart != 0 && (!logged_recovery_conflict || !waiting))
404 : {
405 28 : TimestampTz now = 0;
406 : bool maybe_log_conflict;
407 : bool maybe_update_title;
408 :
409 28 : maybe_log_conflict = (log_recovery_conflict_waits && !logged_recovery_conflict);
410 28 : maybe_update_title = (update_process_title && !waiting);
411 :
412 : /* Get the current timestamp if not report yet */
413 28 : if (maybe_log_conflict || maybe_update_title)
414 28 : now = GetCurrentTimestamp();
415 :
416 : /*
417 : * Report via ps if we have been waiting for more than 500
418 : * msec (should that be configurable?)
419 : */
420 56 : if (maybe_update_title &&
421 28 : TimestampDifferenceExceeds(waitStart, now, 500))
422 : {
423 0 : set_ps_display_suffix("waiting");
424 0 : waiting = true;
425 : }
426 :
427 : /*
428 : * Emit the log message if the startup process is waiting
429 : * longer than deadlock_timeout for recovery conflict.
430 : */
431 44 : if (maybe_log_conflict &&
432 16 : TimestampDifferenceExceeds(waitStart, now, DeadlockTimeout))
433 : {
434 4 : LogRecoveryConflict(reason, waitStart, now, waitlist, true);
435 4 : logged_recovery_conflict = true;
436 : }
437 : }
438 : }
439 :
440 : /* The virtual transaction is gone now, wait for the next one */
441 6 : waitlist++;
442 : }
443 :
444 : /*
445 : * Emit the log message if recovery conflict was resolved but the startup
446 : * process waited longer than deadlock_timeout for it.
447 : */
448 6 : if (logged_recovery_conflict)
449 4 : LogRecoveryConflict(reason, waitStart, GetCurrentTimestamp(),
450 : NULL, false);
451 :
452 : /* reset ps display to remove the suffix if we added one */
453 6 : if (waiting)
454 0 : set_ps_display_remove_suffix();
455 :
456 : }
457 :
458 : /*
459 : * Generate whatever recovery conflicts are needed to eliminate snapshots that
460 : * might see XIDs <= snapshotConflictHorizon as still running.
461 : *
462 : * snapshotConflictHorizon cutoffs are our standard approach to generating
463 : * granular recovery conflicts. Note that InvalidTransactionId values are
464 : * interpreted as "definitely don't need any conflicts" here, which is a
465 : * general convention that WAL records can (and often do) depend on.
466 : */
467 : void
468 28568 : ResolveRecoveryConflictWithSnapshot(TransactionId snapshotConflictHorizon,
469 : bool isCatalogRel,
470 : RelFileLocator locator)
471 : {
472 : VirtualTransactionId *backends;
473 :
474 : /*
475 : * If we get passed InvalidTransactionId then we do nothing (no conflict).
476 : *
477 : * This can happen when replaying already-applied WAL records after a
478 : * standby crash or restart, or when replaying an XLOG_HEAP2_VISIBLE
479 : * record that marks as frozen a page which was already all-visible. It's
480 : * also quite common with records generated during index deletion
481 : * (original execution of the deletion can reason that a recovery conflict
482 : * which is sufficient for the deletion operation must take place before
483 : * replay of the deletion record itself).
484 : */
485 28568 : if (!TransactionIdIsValid(snapshotConflictHorizon))
486 7004 : return;
487 :
488 : Assert(TransactionIdIsNormal(snapshotConflictHorizon));
489 21564 : backends = GetConflictingVirtualXIDs(snapshotConflictHorizon,
490 : locator.dbOid);
491 21564 : ResolveRecoveryConflictWithVirtualXIDs(backends,
492 : PROCSIG_RECOVERY_CONFLICT_SNAPSHOT,
493 : WAIT_EVENT_RECOVERY_CONFLICT_SNAPSHOT,
494 : true);
495 :
496 : /*
497 : * Note that WaitExceedsMaxStandbyDelay() is not taken into account here
498 : * (as opposed to ResolveRecoveryConflictWithVirtualXIDs() above). That
499 : * seems OK, given that this kind of conflict should not normally be
500 : * reached, e.g. due to using a physical replication slot.
501 : */
502 21564 : if (wal_level >= WAL_LEVEL_LOGICAL && isCatalogRel)
503 36 : InvalidateObsoleteReplicationSlots(RS_INVAL_HORIZON, 0, locator.dbOid,
504 : snapshotConflictHorizon);
505 : }
506 :
507 : /*
508 : * Variant of ResolveRecoveryConflictWithSnapshot that works with
509 : * FullTransactionId values
510 : */
511 : void
512 0 : ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId snapshotConflictHorizon,
513 : bool isCatalogRel,
514 : RelFileLocator locator)
515 : {
516 : /*
517 : * ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds,
518 : * so truncate the logged FullTransactionId. If the logged value is very
519 : * old, so that XID wrap-around already happened on it, there can't be any
520 : * snapshots that still see it.
521 : */
522 0 : FullTransactionId nextXid = ReadNextFullTransactionId();
523 : uint64 diff;
524 :
525 0 : diff = U64FromFullTransactionId(nextXid) -
526 0 : U64FromFullTransactionId(snapshotConflictHorizon);
527 0 : if (diff < MaxTransactionId / 2)
528 : {
529 : TransactionId truncated;
530 :
531 0 : truncated = XidFromFullTransactionId(snapshotConflictHorizon);
532 0 : ResolveRecoveryConflictWithSnapshot(truncated,
533 : isCatalogRel,
534 : locator);
535 : }
536 0 : }
537 :
538 : void
539 2 : ResolveRecoveryConflictWithTablespace(Oid tsid)
540 : {
541 : VirtualTransactionId *temp_file_users;
542 :
543 : /*
544 : * Standby users may be currently using this tablespace for their
545 : * temporary files. We only care about current users because
546 : * temp_tablespace parameter will just ignore tablespaces that no longer
547 : * exist.
548 : *
549 : * Ask everybody to cancel their queries immediately so we can ensure no
550 : * temp files remain and we can remove the tablespace. Nuke the entire
551 : * site from orbit, it's the only way to be sure.
552 : *
553 : * XXX: We could work out the pids of active backends using this
554 : * tablespace by examining the temp filenames in the directory. We would
555 : * then convert the pids into VirtualXIDs before attempting to cancel
556 : * them.
557 : *
558 : * We don't wait for commit because drop tablespace is non-transactional.
559 : */
560 2 : temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId,
561 : InvalidOid);
562 2 : ResolveRecoveryConflictWithVirtualXIDs(temp_file_users,
563 : PROCSIG_RECOVERY_CONFLICT_TABLESPACE,
564 : WAIT_EVENT_RECOVERY_CONFLICT_TABLESPACE,
565 : true);
566 2 : }
567 :
568 : void
569 26 : ResolveRecoveryConflictWithDatabase(Oid dbid)
570 : {
571 : /*
572 : * We don't do ResolveRecoveryConflictWithVirtualXIDs() here since that
573 : * only waits for transactions and completely idle sessions would block
574 : * us. This is rare enough that we do this as simply as possible: no wait,
575 : * just force them off immediately.
576 : *
577 : * No locking is required here because we already acquired
578 : * AccessExclusiveLock. Anybody trying to connect while we do this will
579 : * block during InitPostgres() and then disconnect when they see the
580 : * database has been removed.
581 : */
582 30 : while (CountDBBackends(dbid) > 0)
583 : {
584 4 : CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE, true);
585 :
586 : /*
587 : * Wait awhile for them to die so that we avoid flooding an
588 : * unresponsive backend when system is heavily loaded.
589 : */
590 4 : pg_usleep(10000);
591 : }
592 26 : }
593 :
594 : /*
595 : * ResolveRecoveryConflictWithLock is called from ProcSleep()
596 : * to resolve conflicts with other backends holding relation locks.
597 : *
598 : * The WaitLatch sleep normally done in ProcSleep()
599 : * (when not InHotStandby) is performed here, for code clarity.
600 : *
601 : * We either resolve conflicts immediately or set a timeout to wake us at
602 : * the limit of our patience.
603 : *
604 : * Resolve conflicts by canceling to all backends holding a conflicting
605 : * lock. As we are already queued to be granted the lock, no new lock
606 : * requests conflicting with ours will be granted in the meantime.
607 : *
608 : * We also must check for deadlocks involving the Startup process and
609 : * hot-standby backend processes. If deadlock_timeout is reached in
610 : * this function, all the backends holding the conflicting locks are
611 : * requested to check themselves for deadlocks.
612 : *
613 : * logging_conflict should be true if the recovery conflict has not been
614 : * logged yet even though logging is enabled. After deadlock_timeout is
615 : * reached and the request for deadlock check is sent, we wait again to
616 : * be signaled by the release of the lock if logging_conflict is false.
617 : * Otherwise we return without waiting again so that the caller can report
618 : * the recovery conflict. In this case, then, this function is called again
619 : * with logging_conflict=false (because the recovery conflict has already
620 : * been logged) and we will wait again for the lock to be released.
621 : */
622 : void
623 8 : ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict)
624 : {
625 : TimestampTz ltime;
626 : TimestampTz now;
627 :
628 : Assert(InHotStandby);
629 :
630 8 : ltime = GetStandbyLimitTime();
631 8 : now = GetCurrentTimestamp();
632 :
633 : /*
634 : * Update waitStart if first time through after the startup process
635 : * started waiting for the lock. It should not be updated every time
636 : * ResolveRecoveryConflictWithLock() is called during the wait.
637 : *
638 : * Use the current time obtained for comparison with ltime as waitStart
639 : * (i.e., the time when this process started waiting for the lock). Since
640 : * getting the current time newly can cause overhead, we reuse the
641 : * already-obtained time to avoid that overhead.
642 : *
643 : * Note that waitStart is updated without holding the lock table's
644 : * partition lock, to avoid the overhead by additional lock acquisition.
645 : * This can cause "waitstart" in pg_locks to become NULL for a very short
646 : * period of time after the wait started even though "granted" is false.
647 : * This is OK in practice because we can assume that users are likely to
648 : * look at "waitstart" when waiting for the lock for a long time.
649 : */
650 8 : if (pg_atomic_read_u64(&MyProc->waitStart) == 0)
651 2 : pg_atomic_write_u64(&MyProc->waitStart, now);
652 :
653 8 : if (now >= ltime && ltime != 0)
654 2 : {
655 : /*
656 : * We're already behind, so clear a path as quickly as possible.
657 : */
658 : VirtualTransactionId *backends;
659 :
660 2 : backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
661 :
662 : /*
663 : * Prevent ResolveRecoveryConflictWithVirtualXIDs() from reporting
664 : * "waiting" in PS display by disabling its argument report_waiting
665 : * because the caller, WaitOnLock(), has already reported that.
666 : */
667 2 : ResolveRecoveryConflictWithVirtualXIDs(backends,
668 : PROCSIG_RECOVERY_CONFLICT_LOCK,
669 2 : PG_WAIT_LOCK | locktag.locktag_type,
670 : false);
671 : }
672 : else
673 : {
674 : /*
675 : * Wait (or wait again) until ltime, and check for deadlocks as well
676 : * if we will be waiting longer than deadlock_timeout
677 : */
678 : EnableTimeoutParams timeouts[2];
679 6 : int cnt = 0;
680 :
681 6 : if (ltime != 0)
682 : {
683 6 : got_standby_lock_timeout = false;
684 6 : timeouts[cnt].id = STANDBY_LOCK_TIMEOUT;
685 6 : timeouts[cnt].type = TMPARAM_AT;
686 6 : timeouts[cnt].fin_time = ltime;
687 6 : cnt++;
688 : }
689 :
690 6 : got_standby_deadlock_timeout = false;
691 6 : timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
692 6 : timeouts[cnt].type = TMPARAM_AFTER;
693 6 : timeouts[cnt].delay_ms = DeadlockTimeout;
694 6 : cnt++;
695 :
696 6 : enable_timeouts(timeouts, cnt);
697 : }
698 :
699 : /* Wait to be signaled by the release of the Relation Lock */
700 8 : ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
701 :
702 : /*
703 : * Exit if ltime is reached. Then all the backends holding conflicting
704 : * locks will be canceled in the next ResolveRecoveryConflictWithLock()
705 : * call.
706 : */
707 8 : if (got_standby_lock_timeout)
708 0 : goto cleanup;
709 :
710 8 : if (got_standby_deadlock_timeout)
711 : {
712 : VirtualTransactionId *backends;
713 :
714 4 : backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
715 :
716 : /* Quick exit if there's no work to be done */
717 4 : if (!VirtualTransactionIdIsValid(*backends))
718 0 : goto cleanup;
719 :
720 : /*
721 : * Send signals to all the backends holding the conflicting locks, to
722 : * ask them to check themselves for deadlocks.
723 : */
724 8 : while (VirtualTransactionIdIsValid(*backends))
725 : {
726 4 : SignalVirtualTransaction(*backends,
727 : PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK,
728 : false);
729 4 : backends++;
730 : }
731 :
732 : /*
733 : * Exit if the recovery conflict has not been logged yet even though
734 : * logging is enabled, so that the caller can log that. Then
735 : * RecoveryConflictWithLock() is called again and we will wait again
736 : * for the lock to be released.
737 : */
738 4 : if (logging_conflict)
739 2 : goto cleanup;
740 :
741 : /*
742 : * Wait again here to be signaled by the release of the Relation Lock,
743 : * to prevent the subsequent RecoveryConflictWithLock() from causing
744 : * deadlock_timeout and sending a request for deadlocks check again.
745 : * Otherwise the request continues to be sent every deadlock_timeout
746 : * until the relation locks are released or ltime is reached.
747 : */
748 2 : got_standby_deadlock_timeout = false;
749 2 : ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
750 : }
751 :
752 4 : cleanup:
753 :
754 : /*
755 : * Clear any timeout requests established above. We assume here that the
756 : * Startup process doesn't have any other outstanding timeouts than those
757 : * used by this function. If that stops being true, we could cancel the
758 : * timeouts individually, but that'd be slower.
759 : */
760 8 : disable_all_timeouts(false);
761 8 : got_standby_lock_timeout = false;
762 8 : got_standby_deadlock_timeout = false;
763 8 : }
764 :
765 : /*
766 : * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup()
767 : * to resolve conflicts with other backends holding buffer pins.
768 : *
769 : * The ProcWaitForSignal() sleep normally done in LockBufferForCleanup()
770 : * (when not InHotStandby) is performed here, for code clarity.
771 : *
772 : * We either resolve conflicts immediately or set a timeout to wake us at
773 : * the limit of our patience.
774 : *
775 : * Resolve conflicts by sending a PROCSIG signal to all backends to check if
776 : * they hold one of the buffer pins that is blocking Startup process. If so,
777 : * those backends will take an appropriate error action, ERROR or FATAL.
778 : *
779 : * We also must check for deadlocks. Deadlocks occur because if queries
780 : * wait on a lock, that must be behind an AccessExclusiveLock, which can only
781 : * be cleared if the Startup process replays a transaction completion record.
782 : * If Startup process is also waiting then that is a deadlock. The deadlock
783 : * can occur if the query is waiting and then the Startup sleeps, or if
784 : * Startup is sleeping and the query waits on a lock. We protect against
785 : * only the former sequence here, the latter sequence is checked prior to
786 : * the query sleeping, in CheckRecoveryConflictDeadlock().
787 : *
788 : * Deadlocks are extremely rare, and relatively expensive to check for,
789 : * so we don't do a deadlock check right away ... only if we have had to wait
790 : * at least deadlock_timeout.
791 : */
792 : void
793 20 : ResolveRecoveryConflictWithBufferPin(void)
794 : {
795 : TimestampTz ltime;
796 :
797 : Assert(InHotStandby);
798 :
799 20 : ltime = GetStandbyLimitTime();
800 :
801 20 : if (GetCurrentTimestamp() >= ltime && ltime != 0)
802 : {
803 : /*
804 : * We're already behind, so clear a path as quickly as possible.
805 : */
806 2 : SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
807 : }
808 : else
809 : {
810 : /*
811 : * Wake up at ltime, and check for deadlocks as well if we will be
812 : * waiting longer than deadlock_timeout
813 : */
814 : EnableTimeoutParams timeouts[2];
815 18 : int cnt = 0;
816 :
817 18 : if (ltime != 0)
818 : {
819 18 : timeouts[cnt].id = STANDBY_TIMEOUT;
820 18 : timeouts[cnt].type = TMPARAM_AT;
821 18 : timeouts[cnt].fin_time = ltime;
822 18 : cnt++;
823 : }
824 :
825 18 : got_standby_deadlock_timeout = false;
826 18 : timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
827 18 : timeouts[cnt].type = TMPARAM_AFTER;
828 18 : timeouts[cnt].delay_ms = DeadlockTimeout;
829 18 : cnt++;
830 :
831 18 : enable_timeouts(timeouts, cnt);
832 : }
833 :
834 : /*
835 : * Wait to be signaled by UnpinBuffer() or for the wait to be interrupted
836 : * by one of the timeouts established above.
837 : *
838 : * We assume that only UnpinBuffer() and the timeout requests established
839 : * above can wake us up here. WakeupRecovery() called by walreceiver or
840 : * SIGHUP signal handler, etc cannot do that because it uses the different
841 : * latch from that ProcWaitForSignal() waits on.
842 : */
843 20 : ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
844 :
845 20 : if (got_standby_delay_timeout)
846 2 : SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
847 18 : else if (got_standby_deadlock_timeout)
848 : {
849 : /*
850 : * Send out a request for hot-standby backends to check themselves for
851 : * deadlocks.
852 : *
853 : * XXX The subsequent ResolveRecoveryConflictWithBufferPin() will wait
854 : * to be signaled by UnpinBuffer() again and send a request for
855 : * deadlocks check if deadlock_timeout happens. This causes the
856 : * request to continue to be sent every deadlock_timeout until the
857 : * buffer is unpinned or ltime is reached. This would increase the
858 : * workload in the startup process and backends. In practice it may
859 : * not be so harmful because the period that the buffer is kept pinned
860 : * is basically not so long. But we should fix this?
861 : */
862 12 : SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
863 : }
864 :
865 : /*
866 : * Clear any timeout requests established above. We assume here that the
867 : * Startup process doesn't have any other timeouts than what this function
868 : * uses. If that stops being true, we could cancel the timeouts
869 : * individually, but that'd be slower.
870 : */
871 20 : disable_all_timeouts(false);
872 20 : got_standby_delay_timeout = false;
873 20 : got_standby_deadlock_timeout = false;
874 20 : }
875 :
876 : static void
877 16 : SendRecoveryConflictWithBufferPin(ProcSignalReason reason)
878 : {
879 : Assert(reason == PROCSIG_RECOVERY_CONFLICT_BUFFERPIN ||
880 : reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
881 :
882 : /*
883 : * We send signal to all backends to ask them if they are holding the
884 : * buffer pin which is delaying the Startup process. We must not set the
885 : * conflict flag yet, since most backends will be innocent. Let the
886 : * SIGUSR1 handling in each backend decide their own fate.
887 : */
888 16 : CancelDBBackends(InvalidOid, reason, false);
889 16 : }
890 :
891 : /*
892 : * In Hot Standby perform early deadlock detection. We abort the lock
893 : * wait if we are about to sleep while holding the buffer pin that Startup
894 : * process is waiting for.
895 : *
896 : * Note: this code is pessimistic, because there is no way for it to
897 : * determine whether an actual deadlock condition is present: the lock we
898 : * need to wait for might be unrelated to any held by the Startup process.
899 : * Sooner or later, this mechanism should get ripped out in favor of somehow
900 : * accounting for buffer locks in DeadLockCheck(). However, errors here
901 : * seem to be very low-probability in practice, so for now it's not worth
902 : * the trouble.
903 : */
904 : void
905 2 : CheckRecoveryConflictDeadlock(void)
906 : {
907 : Assert(!InRecovery); /* do not call in Startup process */
908 :
909 2 : if (!HoldingBufferPinThatDelaysRecovery())
910 2 : return;
911 :
912 : /*
913 : * Error message should match ProcessInterrupts() but we avoid calling
914 : * that because we aren't handling an interrupt at this point. Note that
915 : * we only cancel the current transaction here, so if we are in a
916 : * subtransaction and the pin is held by a parent, then the Startup
917 : * process will continue to wait even though we have avoided deadlock.
918 : */
919 0 : ereport(ERROR,
920 : (errcode(ERRCODE_T_R_DEADLOCK_DETECTED),
921 : errmsg("canceling statement due to conflict with recovery"),
922 : errdetail("User transaction caused buffer deadlock with recovery.")));
923 : }
924 :
925 :
926 : /* --------------------------------
927 : * timeout handler routines
928 : * --------------------------------
929 : */
930 :
931 : /*
932 : * StandbyDeadLockHandler() will be called if STANDBY_DEADLOCK_TIMEOUT is
933 : * exceeded.
934 : */
935 : void
936 16 : StandbyDeadLockHandler(void)
937 : {
938 16 : got_standby_deadlock_timeout = true;
939 16 : }
940 :
941 : /*
942 : * StandbyTimeoutHandler() will be called if STANDBY_TIMEOUT is exceeded.
943 : */
944 : void
945 2 : StandbyTimeoutHandler(void)
946 : {
947 2 : got_standby_delay_timeout = true;
948 2 : }
949 :
950 : /*
951 : * StandbyLockTimeoutHandler() will be called if STANDBY_LOCK_TIMEOUT is exceeded.
952 : */
953 : void
954 2 : StandbyLockTimeoutHandler(void)
955 : {
956 2 : got_standby_lock_timeout = true;
957 2 : }
958 :
959 : /*
960 : * -----------------------------------------------------
961 : * Locking in Recovery Mode
962 : * -----------------------------------------------------
963 : *
964 : * All locks are held by the Startup process using a single virtual
965 : * transaction. This implementation is both simpler and in some senses,
966 : * more correct. The locks held mean "some original transaction held
967 : * this lock, so query access is not allowed at this time". So the Startup
968 : * process is the proxy by which the original locks are implemented.
969 : *
970 : * We only keep track of AccessExclusiveLocks, which are only ever held by
971 : * one transaction on one relation.
972 : *
973 : * We keep a table of known locks in the RecoveryLockHash hash table.
974 : * The point of that table is to let us efficiently de-duplicate locks,
975 : * which is important because checkpoints will re-report the same locks
976 : * already held. There is also a RecoveryLockXidHash table with one entry
977 : * per xid, which allows us to efficiently find all the locks held by a
978 : * given original transaction.
979 : *
980 : * We use session locks rather than normal locks so we don't need
981 : * ResourceOwners.
982 : */
983 :
984 :
985 : void
986 50448 : StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
987 : {
988 : RecoveryLockXidEntry *xidentry;
989 : RecoveryLockEntry *lockentry;
990 : xl_standby_lock key;
991 : LOCKTAG locktag;
992 : bool found;
993 :
994 : /* Already processed? */
995 100896 : if (!TransactionIdIsValid(xid) ||
996 100890 : TransactionIdDidCommit(xid) ||
997 50442 : TransactionIdDidAbort(xid))
998 6 : return;
999 :
1000 50442 : elog(DEBUG4, "adding recovery lock: db %u rel %u", dbOid, relOid);
1001 :
1002 : /* dbOid is InvalidOid when we are locking a shared relation. */
1003 : Assert(OidIsValid(relOid));
1004 :
1005 : /* Create a hash entry for this xid, if we don't have one already. */
1006 50442 : xidentry = hash_search(RecoveryLockXidHash, &xid, HASH_ENTER, &found);
1007 50442 : if (!found)
1008 : {
1009 : Assert(xidentry->xid == xid); /* dynahash should have set this */
1010 20622 : xidentry->head = NULL;
1011 : }
1012 :
1013 : /* Create a hash entry for this lock, unless we have one already. */
1014 50442 : key.xid = xid;
1015 50442 : key.dbOid = dbOid;
1016 50442 : key.relOid = relOid;
1017 50442 : lockentry = hash_search(RecoveryLockHash, &key, HASH_ENTER, &found);
1018 50442 : if (!found)
1019 : {
1020 : /* It's new, so link it into the XID's list ... */
1021 48106 : lockentry->next = xidentry->head;
1022 48106 : xidentry->head = lockentry;
1023 :
1024 : /* ... and acquire the lock locally. */
1025 48106 : SET_LOCKTAG_RELATION(locktag, dbOid, relOid);
1026 :
1027 48106 : (void) LockAcquire(&locktag, AccessExclusiveLock, true, false);
1028 : }
1029 : }
1030 :
1031 : /*
1032 : * Release all the locks associated with this RecoveryLockXidEntry.
1033 : */
1034 : static void
1035 20622 : StandbyReleaseXidEntryLocks(RecoveryLockXidEntry *xidentry)
1036 : {
1037 : RecoveryLockEntry *entry;
1038 : RecoveryLockEntry *next;
1039 :
1040 68728 : for (entry = xidentry->head; entry != NULL; entry = next)
1041 : {
1042 : LOCKTAG locktag;
1043 :
1044 48106 : elog(DEBUG4,
1045 : "releasing recovery lock: xid %u db %u rel %u",
1046 : entry->key.xid, entry->key.dbOid, entry->key.relOid);
1047 : /* Release the lock ... */
1048 48106 : SET_LOCKTAG_RELATION(locktag, entry->key.dbOid, entry->key.relOid);
1049 48106 : if (!LockRelease(&locktag, AccessExclusiveLock, true))
1050 : {
1051 0 : elog(LOG,
1052 : "RecoveryLockHash contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
1053 : entry->key.xid, entry->key.dbOid, entry->key.relOid);
1054 : Assert(false);
1055 : }
1056 : /* ... and remove the per-lock hash entry */
1057 48106 : next = entry->next;
1058 48106 : hash_search(RecoveryLockHash, entry, HASH_REMOVE, NULL);
1059 : }
1060 :
1061 20622 : xidentry->head = NULL; /* just for paranoia */
1062 20622 : }
1063 :
1064 : /*
1065 : * Release locks for specific XID, or all locks if it's InvalidXid.
1066 : */
1067 : static void
1068 21978 : StandbyReleaseLocks(TransactionId xid)
1069 : {
1070 : RecoveryLockXidEntry *entry;
1071 :
1072 21978 : if (TransactionIdIsValid(xid))
1073 : {
1074 21978 : if ((entry = hash_search(RecoveryLockXidHash, &xid, HASH_FIND, NULL)))
1075 : {
1076 20622 : StandbyReleaseXidEntryLocks(entry);
1077 20622 : hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
1078 : }
1079 : }
1080 : else
1081 0 : StandbyReleaseAllLocks();
1082 21978 : }
1083 :
1084 : /*
1085 : * Release locks for a transaction tree, starting at xid down, from
1086 : * RecoveryLockXidHash.
1087 : *
1088 : * Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode,
1089 : * to remove any AccessExclusiveLocks requested by a transaction.
1090 : */
1091 : void
1092 20976 : StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids)
1093 : {
1094 : int i;
1095 :
1096 20976 : StandbyReleaseLocks(xid);
1097 :
1098 21978 : for (i = 0; i < nsubxids; i++)
1099 1002 : StandbyReleaseLocks(subxids[i]);
1100 20976 : }
1101 :
1102 : /*
1103 : * Called at end of recovery and when we see a shutdown checkpoint.
1104 : */
1105 : void
1106 208 : StandbyReleaseAllLocks(void)
1107 : {
1108 : HASH_SEQ_STATUS status;
1109 : RecoveryLockXidEntry *entry;
1110 :
1111 208 : elog(DEBUG2, "release all standby locks");
1112 :
1113 208 : hash_seq_init(&status, RecoveryLockXidHash);
1114 208 : while ((entry = hash_seq_search(&status)))
1115 : {
1116 0 : StandbyReleaseXidEntryLocks(entry);
1117 0 : hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
1118 : }
1119 208 : }
1120 :
1121 : /*
1122 : * StandbyReleaseOldLocks
1123 : * Release standby locks held by top-level XIDs that aren't running,
1124 : * as long as they're not prepared transactions.
1125 : *
1126 : * This is needed to prune the locks of crashed transactions, which didn't
1127 : * write an ABORT/COMMIT record.
1128 : */
1129 : void
1130 1540 : StandbyReleaseOldLocks(TransactionId oldxid)
1131 : {
1132 : HASH_SEQ_STATUS status;
1133 : RecoveryLockXidEntry *entry;
1134 :
1135 1540 : hash_seq_init(&status, RecoveryLockXidHash);
1136 2122 : while ((entry = hash_seq_search(&status)))
1137 : {
1138 : Assert(TransactionIdIsValid(entry->xid));
1139 :
1140 : /* Skip if prepared transaction. */
1141 582 : if (StandbyTransactionIdIsPrepared(entry->xid))
1142 0 : continue;
1143 :
1144 : /* Skip if >= oldxid. */
1145 582 : if (!TransactionIdPrecedes(entry->xid, oldxid))
1146 582 : continue;
1147 :
1148 : /* Remove all locks and hash table entry. */
1149 0 : StandbyReleaseXidEntryLocks(entry);
1150 0 : hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
1151 : }
1152 1540 : }
1153 :
1154 : /*
1155 : * --------------------------------------------------------------------
1156 : * Recovery handling for Rmgr RM_STANDBY_ID
1157 : *
1158 : * These record types will only be created if XLogStandbyInfoActive()
1159 : * --------------------------------------------------------------------
1160 : */
1161 :
1162 : void
1163 51302 : standby_redo(XLogReaderState *record)
1164 : {
1165 51302 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
1166 :
1167 : /* Backup blocks are not used in standby records */
1168 : Assert(!XLogRecHasAnyBlockRefs(record));
1169 :
1170 : /* Do nothing if we're not in hot standby mode */
1171 51302 : if (standbyState == STANDBY_DISABLED)
1172 302 : return;
1173 :
1174 51000 : if (info == XLOG_STANDBY_LOCK)
1175 : {
1176 48420 : xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record);
1177 : int i;
1178 :
1179 98868 : for (i = 0; i < xlrec->nlocks; i++)
1180 50448 : StandbyAcquireAccessExclusiveLock(xlrec->locks[i].xid,
1181 : xlrec->locks[i].dbOid,
1182 : xlrec->locks[i].relOid);
1183 : }
1184 2580 : else if (info == XLOG_RUNNING_XACTS)
1185 : {
1186 1426 : xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record);
1187 : RunningTransactionsData running;
1188 :
1189 1426 : running.xcnt = xlrec->xcnt;
1190 1426 : running.subxcnt = xlrec->subxcnt;
1191 1426 : running.subxid_status = xlrec->subxid_overflow ? SUBXIDS_MISSING : SUBXIDS_IN_ARRAY;
1192 1426 : running.nextXid = xlrec->nextXid;
1193 1426 : running.latestCompletedXid = xlrec->latestCompletedXid;
1194 1426 : running.oldestRunningXid = xlrec->oldestRunningXid;
1195 1426 : running.xids = xlrec->xids;
1196 :
1197 1426 : ProcArrayApplyRecoveryInfo(&running);
1198 :
1199 : /*
1200 : * The startup process currently has no convenient way to schedule
1201 : * stats to be reported. XLOG_RUNNING_XACTS records issued at a
1202 : * regular cadence, making this a convenient location to report stats.
1203 : * While these records aren't generated with wal_level=minimal, stats
1204 : * also cannot be accessed during WAL replay.
1205 : */
1206 1426 : pgstat_report_stat(true);
1207 : }
1208 1154 : else if (info == XLOG_INVALIDATIONS)
1209 : {
1210 1154 : xl_invalidations *xlrec = (xl_invalidations *) XLogRecGetData(record);
1211 :
1212 1154 : ProcessCommittedInvalidationMessages(xlrec->msgs,
1213 : xlrec->nmsgs,
1214 1154 : xlrec->relcacheInitFileInval,
1215 : xlrec->dbId,
1216 : xlrec->tsId);
1217 : }
1218 : else
1219 0 : elog(PANIC, "standby_redo: unknown op code %u", info);
1220 : }
1221 :
1222 : /*
1223 : * Log details of the current snapshot to WAL. This allows the snapshot state
1224 : * to be reconstructed on the standby and for logical decoding.
1225 : *
1226 : * This is used for Hot Standby as follows:
1227 : *
1228 : * We can move directly to STANDBY_SNAPSHOT_READY at startup if we
1229 : * start from a shutdown checkpoint because we know nothing was running
1230 : * at that time and our recovery snapshot is known empty. In the more
1231 : * typical case of an online checkpoint we need to jump through a few
1232 : * hoops to get a correct recovery snapshot and this requires a two or
1233 : * sometimes a three stage process.
1234 : *
1235 : * The initial snapshot must contain all running xids and all current
1236 : * AccessExclusiveLocks at a point in time on the standby. Assembling
1237 : * that information while the server is running requires many and
1238 : * various LWLocks, so we choose to derive that information piece by
1239 : * piece and then re-assemble that info on the standby. When that
1240 : * information is fully assembled we move to STANDBY_SNAPSHOT_READY.
1241 : *
1242 : * Since locking on the primary when we derive the information is not
1243 : * strict, we note that there is a time window between the derivation and
1244 : * writing to WAL of the derived information. That allows race conditions
1245 : * that we must resolve, since xids and locks may enter or leave the
1246 : * snapshot during that window. This creates the issue that an xid or
1247 : * lock may start *after* the snapshot has been derived yet *before* the
1248 : * snapshot is logged in the running xacts WAL record. We resolve this by
1249 : * starting to accumulate changes at a point just prior to when we derive
1250 : * the snapshot on the primary, then ignore duplicates when we later apply
1251 : * the snapshot from the running xacts record. This is implemented during
1252 : * CreateCheckPoint() where we use the logical checkpoint location as
1253 : * our starting point and then write the running xacts record immediately
1254 : * before writing the main checkpoint WAL record. Since we always start
1255 : * up from a checkpoint and are immediately at our starting point, we
1256 : * unconditionally move to STANDBY_INITIALIZED. After this point we
1257 : * must do 4 things:
1258 : * * move shared nextXid forwards as we see new xids
1259 : * * extend the clog and subtrans with each new xid
1260 : * * keep track of uncommitted known assigned xids
1261 : * * keep track of uncommitted AccessExclusiveLocks
1262 : *
1263 : * When we see a commit/abort we must remove known assigned xids and locks
1264 : * from the completing transaction. Attempted removals that cannot locate
1265 : * an entry are expected and must not cause an error when we are in state
1266 : * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and
1267 : * KnownAssignedXidsRemove().
1268 : *
1269 : * Later, when we apply the running xact data we must be careful to ignore
1270 : * transactions already committed, since those commits raced ahead when
1271 : * making WAL entries.
1272 : *
1273 : * For logical decoding only the running xacts information is needed;
1274 : * there's no need to look at the locking information, but it's logged anyway,
1275 : * as there's no independent knob to just enable logical decoding. For
1276 : * details of how this is used, check snapbuild.c's introductory comment.
1277 : *
1278 : *
1279 : * Returns the RecPtr of the last inserted record.
1280 : */
1281 : XLogRecPtr
1282 2650 : LogStandbySnapshot(void)
1283 : {
1284 : XLogRecPtr recptr;
1285 : RunningTransactions running;
1286 : xl_standby_lock *locks;
1287 : int nlocks;
1288 :
1289 : Assert(XLogStandbyInfoActive());
1290 :
1291 : #ifdef USE_INJECTION_POINTS
1292 2650 : if (IS_INJECTION_POINT_ATTACHED("skip-log-running-xacts"))
1293 : {
1294 : /*
1295 : * This record could move slot's xmin forward during decoding, leading
1296 : * to unpredictable results, so skip it when requested by the test.
1297 : */
1298 0 : return GetInsertRecPtr();
1299 : }
1300 : #endif
1301 :
1302 : /*
1303 : * Get details of any AccessExclusiveLocks being held at the moment.
1304 : */
1305 2650 : locks = GetRunningTransactionLocks(&nlocks);
1306 2650 : if (nlocks > 0)
1307 282 : LogAccessExclusiveLocks(nlocks, locks);
1308 2650 : pfree(locks);
1309 :
1310 : /*
1311 : * Log details of all in-progress transactions. This should be the last
1312 : * record we write, because standby will open up when it sees this.
1313 : */
1314 2650 : running = GetRunningTransactionData();
1315 :
1316 : /*
1317 : * GetRunningTransactionData() acquired ProcArrayLock, we must release it.
1318 : * For Hot Standby this can be done before inserting the WAL record
1319 : * because ProcArrayApplyRecoveryInfo() rechecks the commit status using
1320 : * the clog. For logical decoding, though, the lock can't be released
1321 : * early because the clog might be "in the future" from the POV of the
1322 : * historic snapshot. This would allow for situations where we're waiting
1323 : * for the end of a transaction listed in the xl_running_xacts record
1324 : * which, according to the WAL, has committed before the xl_running_xacts
1325 : * record. Fortunately this routine isn't executed frequently, and it's
1326 : * only a shared lock.
1327 : */
1328 2650 : if (wal_level < WAL_LEVEL_LOGICAL)
1329 1676 : LWLockRelease(ProcArrayLock);
1330 :
1331 2650 : recptr = LogCurrentRunningXacts(running);
1332 :
1333 : /* Release lock if we kept it longer ... */
1334 2650 : if (wal_level >= WAL_LEVEL_LOGICAL)
1335 974 : LWLockRelease(ProcArrayLock);
1336 :
1337 : /* GetRunningTransactionData() acquired XidGenLock, we must release it */
1338 2650 : LWLockRelease(XidGenLock);
1339 :
1340 2650 : return recptr;
1341 : }
1342 :
1343 : /*
1344 : * Record an enhanced snapshot of running transactions into WAL.
1345 : *
1346 : * The definitions of RunningTransactionsData and xl_running_xacts are
1347 : * similar. We keep them separate because xl_running_xacts is a contiguous
1348 : * chunk of memory and never exists fully until it is assembled in WAL.
1349 : * The inserted records are marked as not being important for durability,
1350 : * to avoid triggering superfluous checkpoint / archiving activity.
1351 : */
1352 : static XLogRecPtr
1353 2650 : LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
1354 : {
1355 : xl_running_xacts xlrec;
1356 : XLogRecPtr recptr;
1357 :
1358 2650 : xlrec.xcnt = CurrRunningXacts->xcnt;
1359 2650 : xlrec.subxcnt = CurrRunningXacts->subxcnt;
1360 2650 : xlrec.subxid_overflow = (CurrRunningXacts->subxid_status != SUBXIDS_IN_ARRAY);
1361 2650 : xlrec.nextXid = CurrRunningXacts->nextXid;
1362 2650 : xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
1363 2650 : xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid;
1364 :
1365 : /* Header */
1366 2650 : XLogBeginInsert();
1367 2650 : XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
1368 2650 : XLogRegisterData(&xlrec, MinSizeOfXactRunningXacts);
1369 :
1370 : /* array of TransactionIds */
1371 2650 : if (xlrec.xcnt > 0)
1372 840 : XLogRegisterData(CurrRunningXacts->xids,
1373 840 : (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId));
1374 :
1375 2650 : recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS);
1376 :
1377 2650 : if (xlrec.subxid_overflow)
1378 2 : elog(DEBUG2,
1379 : "snapshot of %d running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
1380 : CurrRunningXacts->xcnt,
1381 : LSN_FORMAT_ARGS(recptr),
1382 : CurrRunningXacts->oldestRunningXid,
1383 : CurrRunningXacts->latestCompletedXid,
1384 : CurrRunningXacts->nextXid);
1385 : else
1386 2648 : elog(DEBUG2,
1387 : "snapshot of %d+%d running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
1388 : CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt,
1389 : LSN_FORMAT_ARGS(recptr),
1390 : CurrRunningXacts->oldestRunningXid,
1391 : CurrRunningXacts->latestCompletedXid,
1392 : CurrRunningXacts->nextXid);
1393 :
1394 : /*
1395 : * Ensure running_xacts information is synced to disk not too far in the
1396 : * future. We don't want to stall anything though (i.e. use XLogFlush()),
1397 : * so we let the wal writer do it during normal operation.
1398 : * XLogSetAsyncXactLSN() conveniently will mark the LSN as to-be-synced
1399 : * and nudge the WALWriter into action if sleeping. Check
1400 : * XLogBackgroundFlush() for details why a record might not be flushed
1401 : * without it.
1402 : */
1403 2650 : XLogSetAsyncXactLSN(recptr);
1404 :
1405 2650 : return recptr;
1406 : }
1407 :
1408 : /*
1409 : * Wholesale logging of AccessExclusiveLocks. Other lock types need not be
1410 : * logged, as described in backend/storage/lmgr/README.
1411 : */
1412 : static void
1413 200378 : LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks)
1414 : {
1415 : xl_standby_locks xlrec;
1416 :
1417 200378 : xlrec.nlocks = nlocks;
1418 :
1419 200378 : XLogBeginInsert();
1420 200378 : XLogRegisterData(&xlrec, offsetof(xl_standby_locks, locks));
1421 200378 : XLogRegisterData(locks, nlocks * sizeof(xl_standby_lock));
1422 200378 : XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
1423 :
1424 200378 : (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK);
1425 200378 : }
1426 :
1427 : /*
1428 : * Individual logging of AccessExclusiveLocks for use during LockAcquire()
1429 : */
1430 : void
1431 200096 : LogAccessExclusiveLock(Oid dbOid, Oid relOid)
1432 : {
1433 : xl_standby_lock xlrec;
1434 :
1435 200096 : xlrec.xid = GetCurrentTransactionId();
1436 :
1437 200096 : xlrec.dbOid = dbOid;
1438 200096 : xlrec.relOid = relOid;
1439 :
1440 200096 : LogAccessExclusiveLocks(1, &xlrec);
1441 200096 : MyXactFlags |= XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK;
1442 200096 : }
1443 :
/*
 * LogAccessExclusiveLockPrepare
 *		Prepare to log an AccessExclusiveLock; for use during LockAcquire().
 *
 * All this does is make sure a TransactionId has been assigned to the
 * current transaction.  There are two reasons, both related to lock release
 * on the standby:
 *
 * 1. With an xid assigned, RecordTransactionCommit() and
 *    RecordTransactionAbort() do not optimise away the transaction
 *    completion record, which recovery relies upon to release locks.  It's
 *    a hack, but for a corner case not worth adding code for into the main
 *    commit path.
 *
 * 2. The xid must be assigned before the lock is recorded in shared memory.
 *    Otherwise a concurrently executing GetRunningTransactionLocks() might
 *    see a lock associated with an InvalidTransactionId, which we later
 *    assert cannot happen.
 */
void
LogAccessExclusiveLockPrepare(void)
{
	/* called only for its side effect of assigning an xid */
	(void) GetCurrentTransactionId();
}
1464 :
1465 : /*
1466 : * Emit WAL for invalidations. This currently is only used for commits without
1467 : * an xid but which contain invalidations.
1468 : */
1469 : void
1470 16928 : LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs,
1471 : bool relcacheInitFileInval)
1472 : {
1473 : xl_invalidations xlrec;
1474 :
1475 : /* prepare record */
1476 16928 : memset(&xlrec, 0, sizeof(xlrec));
1477 16928 : xlrec.dbId = MyDatabaseId;
1478 16928 : xlrec.tsId = MyDatabaseTableSpace;
1479 16928 : xlrec.relcacheInitFileInval = relcacheInitFileInval;
1480 16928 : xlrec.nmsgs = nmsgs;
1481 :
1482 : /* perform insertion */
1483 16928 : XLogBeginInsert();
1484 16928 : XLogRegisterData(&xlrec, MinSizeOfInvalidations);
1485 16928 : XLogRegisterData(msgs,
1486 : nmsgs * sizeof(SharedInvalidationMessage));
1487 16928 : XLogInsert(RM_STANDBY_ID, XLOG_INVALIDATIONS);
1488 16928 : }
1489 :
1490 : /* Return the description of recovery conflict */
1491 : static const char *
1492 20 : get_recovery_conflict_desc(ProcSignalReason reason)
1493 : {
1494 20 : const char *reasonDesc = _("unknown reason");
1495 :
1496 20 : switch (reason)
1497 : {
1498 8 : case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
1499 8 : reasonDesc = _("recovery conflict on buffer pin");
1500 8 : break;
1501 4 : case PROCSIG_RECOVERY_CONFLICT_LOCK:
1502 4 : reasonDesc = _("recovery conflict on lock");
1503 4 : break;
1504 4 : case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
1505 4 : reasonDesc = _("recovery conflict on tablespace");
1506 4 : break;
1507 4 : case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
1508 4 : reasonDesc = _("recovery conflict on snapshot");
1509 4 : break;
1510 0 : case PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT:
1511 0 : reasonDesc = _("recovery conflict on replication slot");
1512 0 : break;
1513 0 : case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
1514 0 : reasonDesc = _("recovery conflict on buffer deadlock");
1515 0 : break;
1516 0 : case PROCSIG_RECOVERY_CONFLICT_DATABASE:
1517 0 : reasonDesc = _("recovery conflict on database");
1518 0 : break;
1519 0 : default:
1520 0 : break;
1521 : }
1522 :
1523 20 : return reasonDesc;
1524 : }
|