Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * standby.c
4 : * Misc functions used in Hot Standby mode.
5 : *
6 : * All functions for handling RM_STANDBY_ID, which relate to
7 : * AccessExclusiveLocks and starting snapshots for Hot Standby mode.
8 : * Plus conflict recovery processing.
9 : *
10 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
11 : * Portions Copyright (c) 1994, Regents of the University of California
12 : *
13 : * IDENTIFICATION
14 : * src/backend/storage/ipc/standby.c
15 : *
16 : *-------------------------------------------------------------------------
17 : */
18 : #include "postgres.h"
19 : #include "access/transam.h"
20 : #include "access/twophase.h"
21 : #include "access/xact.h"
22 : #include "access/xloginsert.h"
23 : #include "access/xlogrecovery.h"
24 : #include "access/xlogutils.h"
25 : #include "miscadmin.h"
26 : #include "pgstat.h"
27 : #include "replication/slot.h"
28 : #include "storage/bufmgr.h"
29 : #include "storage/proc.h"
30 : #include "storage/procarray.h"
31 : #include "storage/sinvaladt.h"
32 : #include "storage/standby.h"
33 : #include "utils/hsearch.h"
34 : #include "utils/injection_point.h"
35 : #include "utils/ps_status.h"
36 : #include "utils/timeout.h"
37 : #include "utils/timestamp.h"
38 : #include "utils/wait_event.h"
39 :
/*
 * User-settable GUC parameters
 *
 * The two delay settings are consumed by GetStandbyLimitTime(), which adds
 * them as milliseconds to the last WAL receipt time; a negative value there
 * means "wait forever".  log_recovery_conflict_waits gates the conflict
 * log messages emitted by ResolveRecoveryConflictWithVirtualXIDs().
 */
int max_standby_archive_delay = 30 * 1000;
int max_standby_streaming_delay = 30 * 1000;
bool log_recovery_conflict_waits = false;
44 :
/*
 * Keep track of all the exclusive locks owned by original transactions.
 * For each known exclusive lock, there is a RecoveryLockEntry in the
 * RecoveryLockHash hash table.  All RecoveryLockEntrys belonging to a
 * given XID are chained together so that we can find them easily.
 * For each original transaction that is known to have any such locks,
 * there is a RecoveryLockXidEntry in the RecoveryLockXidHash hash table,
 * which stores the head of the chain of its locks.
 *
 * RecoveryLockEntry is the per-lock element: the whole xl_standby_lock is
 * the hash key, and "next" links it to the next lock recorded for the same
 * XID.
 */
typedef struct RecoveryLockEntry
{
	xl_standby_lock key; /* hash key: xid, dbOid, relOid */
	struct RecoveryLockEntry *next; /* chain link */
} RecoveryLockEntry;
59 :
/*
 * Per-XID element of RecoveryLockXidHash: maps a transaction id to the head
 * of its chain of RecoveryLockEntry structs.
 */
typedef struct RecoveryLockXidEntry
{
	TransactionId xid; /* hash key -- must be first */
	struct RecoveryLockEntry *head; /* chain head */
} RecoveryLockXidEntry;
65 :
/*
 * The two lock-tracking hash tables.  Both are NULL until
 * InitRecoveryTransactionEnvironment() creates them, and are reset to NULL
 * by ShutdownRecoveryTransactionEnvironment(); RecoveryLockHash == NULL is
 * used as the "not initialized" test.
 */
static HTAB *RecoveryLockHash = NULL;
static HTAB *RecoveryLockXidHash = NULL;

/*
 * Flags set by timeout handlers.  They are cleared again by the conflict
 * resolution functions in this file that arm the corresponding timeouts.
 */
static volatile sig_atomic_t got_standby_deadlock_timeout = false;
static volatile sig_atomic_t got_standby_delay_timeout = false;
static volatile sig_atomic_t got_standby_lock_timeout = false;
73 :
/* Prototypes for functions private to this file */
static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
												   RecoveryConflictReason reason,
												   uint32 wait_event_info,
												   bool report_waiting);
static void SendRecoveryConflictWithBufferPin(RecoveryConflictReason reason);
static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts);
static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);
static const char *get_recovery_conflict_desc(RecoveryConflictReason reason);
82 :
83 : /*
84 : * InitRecoveryTransactionEnvironment
85 : * Initialize tracking of our primary's in-progress transactions.
86 : *
87 : * We need to issue shared invalidations and hold locks. Holding locks
88 : * means others may want to wait on us, so we need to make a lock table
89 : * vxact entry like a real transaction. We could create and delete
90 : * lock table entries for each transaction but its simpler just to create
91 : * one permanent entry and leave it there all the time. Locks are then
92 : * acquired and released as needed. Yes, this means you can see the
93 : * Startup process in pg_locks once we have run this.
94 : */
95 : void
96 115 : InitRecoveryTransactionEnvironment(void)
97 : {
98 : VirtualTransactionId vxid;
99 : HASHCTL hash_ctl;
100 :
101 : Assert(RecoveryLockHash == NULL); /* don't run this twice */
102 :
103 : /*
104 : * Initialize the hash tables for tracking the locks held by each
105 : * transaction.
106 : */
107 115 : hash_ctl.keysize = sizeof(xl_standby_lock);
108 115 : hash_ctl.entrysize = sizeof(RecoveryLockEntry);
109 115 : RecoveryLockHash = hash_create("RecoveryLockHash",
110 : 64,
111 : &hash_ctl,
112 : HASH_ELEM | HASH_BLOBS);
113 115 : hash_ctl.keysize = sizeof(TransactionId);
114 115 : hash_ctl.entrysize = sizeof(RecoveryLockXidEntry);
115 115 : RecoveryLockXidHash = hash_create("RecoveryLockXidHash",
116 : 64,
117 : &hash_ctl,
118 : HASH_ELEM | HASH_BLOBS);
119 :
120 : /*
121 : * Initialize shared invalidation management for Startup process, being
122 : * careful to register ourselves as a sendOnly process so we don't need to
123 : * read messages, nor will we get signaled when the queue starts filling
124 : * up.
125 : */
126 115 : SharedInvalBackendInit(true);
127 :
128 : /*
129 : * Lock a virtual transaction id for Startup process.
130 : *
131 : * We need to do GetNextLocalTransactionId() because
132 : * SharedInvalBackendInit() leaves localTransactionId invalid and the lock
133 : * manager doesn't like that at all.
134 : *
135 : * Note that we don't need to run XactLockTableInsert() because nobody
136 : * needs to wait on xids. That sounds a little strange, but table locks
137 : * are held by vxids and row level locks are held by xids. All queries
138 : * hold AccessShareLocks so never block while we write or lock new rows.
139 : */
140 115 : MyProc->vxid.procNumber = MyProcNumber;
141 115 : vxid.procNumber = MyProcNumber;
142 115 : vxid.localTransactionId = GetNextLocalTransactionId();
143 115 : VirtualXactLockTableInsert(vxid);
144 :
145 115 : standbyState = STANDBY_INITIALIZED;
146 115 : }
147 :
148 : /*
149 : * ShutdownRecoveryTransactionEnvironment
150 : * Shut down transaction tracking
151 : *
152 : * Prepare to switch from hot standby mode to normal operation. Shut down
153 : * recovery-time transaction tracking.
154 : *
155 : * This must be called even in shutdown of startup process if transaction
156 : * tracking has been initialized. Otherwise some locks the tracked
157 : * transactions were holding will not be released and may interfere with
158 : * the processes still running (but will exit soon later) at the exit of
159 : * startup process.
160 : */
161 : void
162 170 : ShutdownRecoveryTransactionEnvironment(void)
163 : {
164 : /*
165 : * Do nothing if RecoveryLockHash is NULL because that means that
166 : * transaction tracking has not yet been initialized or has already been
167 : * shut down. This makes it safe to have possibly-redundant calls of this
168 : * function during process exit.
169 : */
170 170 : if (RecoveryLockHash == NULL)
171 55 : return;
172 :
173 : /* Mark all tracked in-progress transactions as finished. */
174 115 : ExpireAllKnownAssignedTransactionIds();
175 :
176 : /* Release all locks the tracked transactions were holding */
177 115 : StandbyReleaseAllLocks();
178 :
179 : /* Destroy the lock hash tables. */
180 115 : hash_destroy(RecoveryLockHash);
181 115 : hash_destroy(RecoveryLockXidHash);
182 115 : RecoveryLockHash = NULL;
183 115 : RecoveryLockXidHash = NULL;
184 :
185 : /* Cleanup our VirtualTransaction */
186 115 : VirtualXactLockTableCleanup();
187 : }
188 :
189 :
190 : /*
191 : * -----------------------------------------------------
192 : * Standby wait timers and backend cancel logic
193 : * -----------------------------------------------------
194 : */
195 :
196 : /*
197 : * Determine the cutoff time at which we want to start canceling conflicting
198 : * transactions. Returns zero (a time safely in the past) if we are willing
199 : * to wait forever.
200 : */
201 : static TimestampTz
202 28 : GetStandbyLimitTime(void)
203 : {
204 : TimestampTz rtime;
205 : bool fromStream;
206 :
207 : /*
208 : * The cutoff time is the last WAL data receipt time plus the appropriate
209 : * delay variable. Delay of -1 means wait forever.
210 : */
211 28 : GetXLogReceiptTime(&rtime, &fromStream);
212 28 : if (fromStream)
213 : {
214 28 : if (max_standby_streaming_delay < 0)
215 0 : return 0; /* wait forever */
216 28 : return TimestampTzPlusMilliseconds(rtime, max_standby_streaming_delay);
217 : }
218 : else
219 : {
220 0 : if (max_standby_archive_delay < 0)
221 0 : return 0; /* wait forever */
222 0 : return TimestampTzPlusMilliseconds(rtime, max_standby_archive_delay);
223 : }
224 : }
225 :
226 : #define STANDBY_INITIAL_WAIT_US 1000
227 : static int standbyWait_us = STANDBY_INITIAL_WAIT_US;
228 :
229 : /*
230 : * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs.
231 : * We wait here for a while then return. If we decide we can't wait any
232 : * more then we return true, if we can wait some more return false.
233 : */
234 : static bool
235 16 : WaitExceedsMaxStandbyDelay(uint32 wait_event_info)
236 : {
237 : TimestampTz ltime;
238 :
239 16 : CHECK_FOR_INTERRUPTS();
240 :
241 : /* Are we past the limit time? */
242 16 : ltime = GetStandbyLimitTime();
243 16 : if (ltime && GetCurrentTimestamp() >= ltime)
244 4 : return true;
245 :
246 : /*
247 : * Sleep a bit (this is essential to avoid busy-waiting).
248 : */
249 12 : pgstat_report_wait_start(wait_event_info);
250 12 : pg_usleep(standbyWait_us);
251 12 : pgstat_report_wait_end();
252 :
253 : /*
254 : * Progressively increase the sleep times, but not to more than 1s, since
255 : * pg_usleep isn't interruptible on some platforms.
256 : */
257 12 : standbyWait_us *= 2;
258 12 : if (standbyWait_us > 1000000)
259 0 : standbyWait_us = 1000000;
260 :
261 12 : return false;
262 : }
263 :
264 : /*
265 : * Log the recovery conflict.
266 : *
267 : * wait_start is the timestamp when the caller started to wait.
268 : * now is the timestamp when this function has been called.
269 : * wait_list is the list of virtual transaction ids assigned to
270 : * conflicting processes. still_waiting indicates whether
271 : * the startup process is still waiting for the recovery conflict
272 : * to be resolved or not.
273 : */
274 : void
275 10 : LogRecoveryConflict(RecoveryConflictReason reason, TimestampTz wait_start,
276 : TimestampTz now, VirtualTransactionId *wait_list,
277 : bool still_waiting)
278 : {
279 : long secs;
280 : int usecs;
281 : long msecs;
282 : StringInfoData buf;
283 10 : int nprocs = 0;
284 :
285 : /*
286 : * There must be no conflicting processes when the recovery conflict has
287 : * already been resolved.
288 : */
289 : Assert(still_waiting || wait_list == NULL);
290 :
291 10 : TimestampDifference(wait_start, now, &secs, &usecs);
292 10 : msecs = secs * 1000 + usecs / 1000;
293 10 : usecs = usecs % 1000;
294 :
295 10 : if (wait_list)
296 : {
297 : VirtualTransactionId *vxids;
298 :
299 : /* Construct a string of list of the conflicting processes */
300 3 : vxids = wait_list;
301 6 : while (VirtualTransactionIdIsValid(*vxids))
302 : {
303 3 : PGPROC *proc = ProcNumberGetProc(vxids->procNumber);
304 :
305 : /* proc can be NULL if the target backend is not active */
306 3 : if (proc)
307 : {
308 3 : if (nprocs == 0)
309 : {
310 3 : initStringInfo(&buf);
311 3 : appendStringInfo(&buf, "%d", proc->pid);
312 : }
313 : else
314 0 : appendStringInfo(&buf, ", %d", proc->pid);
315 :
316 3 : nprocs++;
317 : }
318 :
319 3 : vxids++;
320 : }
321 : }
322 :
323 : /*
324 : * If wait_list is specified, report the list of PIDs of active
325 : * conflicting backends in a detail message. Note that if all the backends
326 : * in the list are not active, no detail message is logged.
327 : */
328 10 : if (still_waiting)
329 : {
330 5 : ereport(LOG,
331 : errmsg("recovery still waiting after %ld.%03d ms: %s",
332 : msecs, usecs, get_recovery_conflict_desc(reason)),
333 : nprocs > 0 ? errdetail_log_plural("Conflicting process: %s.",
334 : "Conflicting processes: %s.",
335 : nprocs, buf.data) : 0);
336 : }
337 : else
338 : {
339 5 : ereport(LOG,
340 : errmsg("recovery finished waiting after %ld.%03d ms: %s",
341 : msecs, usecs, get_recovery_conflict_desc(reason)));
342 : }
343 :
344 10 : if (nprocs > 0)
345 3 : pfree(buf.data);
346 10 : }
347 :
348 : /*
349 : * This is the main executioner for any query backend that conflicts with
350 : * recovery processing. Judgement has already been passed on it within
351 : * a specific rmgr. Here we just issue the orders to the procs. The procs
352 : * then throw the required error as instructed.
353 : *
354 : * If report_waiting is true, "waiting" is reported in PS display and the
355 : * wait for recovery conflict is reported in the log, if necessary. If
356 : * the caller is responsible for reporting them, report_waiting should be
357 : * false. Otherwise, both the caller and this function report the same
358 : * thing unexpectedly.
359 : */
360 : static void
361 15044 : ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
362 : RecoveryConflictReason reason,
363 : uint32 wait_event_info,
364 : bool report_waiting)
365 : {
366 15044 : TimestampTz waitStart = 0;
367 15044 : bool waiting = false;
368 15044 : bool logged_recovery_conflict = false;
369 :
370 : /* Fast exit, to avoid a kernel call if there's no work to be done. */
371 15044 : if (!VirtualTransactionIdIsValid(*waitlist))
372 15041 : return;
373 :
374 : /* Set the wait start timestamp for reporting */
375 3 : if (report_waiting && (log_recovery_conflict_waits || update_process_title))
376 2 : waitStart = GetCurrentTimestamp();
377 :
378 6 : while (VirtualTransactionIdIsValid(*waitlist))
379 : {
380 : /* reset standbyWait_us for each xact we wait for */
381 3 : standbyWait_us = STANDBY_INITIAL_WAIT_US;
382 :
383 : /* wait until the virtual xid is gone */
384 19 : while (!VirtualXactLock(*waitlist, false))
385 : {
386 : /* Is it time to kill it? */
387 16 : if (WaitExceedsMaxStandbyDelay(wait_event_info))
388 : {
389 : bool signaled;
390 :
391 : /*
392 : * Now find out who to throw out of the balloon.
393 : */
394 : Assert(VirtualTransactionIdIsValid(*waitlist));
395 4 : signaled = SignalRecoveryConflictWithVirtualXID(*waitlist, reason);
396 :
397 : /*
398 : * Wait a little bit for it to die so that we avoid flooding
399 : * an unresponsive backend when system is heavily loaded.
400 : */
401 4 : if (signaled)
402 4 : pg_usleep(5000L);
403 : }
404 :
405 16 : if (waitStart != 0 && (!logged_recovery_conflict || !waiting))
406 : {
407 15 : TimestampTz now = 0;
408 : bool maybe_log_conflict;
409 : bool maybe_update_title;
410 :
411 15 : maybe_log_conflict = (log_recovery_conflict_waits && !logged_recovery_conflict);
412 15 : maybe_update_title = (update_process_title && !waiting);
413 :
414 : /* Get the current timestamp if not report yet */
415 15 : if (maybe_log_conflict || maybe_update_title)
416 15 : now = GetCurrentTimestamp();
417 :
418 : /*
419 : * Report via ps if we have been waiting for more than 500
420 : * msec (should that be configurable?)
421 : */
422 30 : if (maybe_update_title &&
423 15 : TimestampDifferenceExceeds(waitStart, now, 500))
424 : {
425 0 : set_ps_display_suffix("waiting");
426 0 : waiting = true;
427 : }
428 :
429 : /*
430 : * Emit the log message if the startup process is waiting
431 : * longer than deadlock_timeout for recovery conflict.
432 : */
433 22 : if (maybe_log_conflict &&
434 7 : TimestampDifferenceExceeds(waitStart, now, DeadlockTimeout))
435 : {
436 2 : LogRecoveryConflict(reason, waitStart, now, waitlist, true);
437 2 : logged_recovery_conflict = true;
438 : }
439 : }
440 : }
441 :
442 : /* The virtual transaction is gone now, wait for the next one */
443 3 : waitlist++;
444 : }
445 :
446 : /*
447 : * Emit the log message if recovery conflict was resolved but the startup
448 : * process waited longer than deadlock_timeout for it.
449 : */
450 3 : if (logged_recovery_conflict)
451 2 : LogRecoveryConflict(reason, waitStart, GetCurrentTimestamp(),
452 : NULL, false);
453 :
454 : /* reset ps display to remove the suffix if we added one */
455 3 : if (waiting)
456 0 : set_ps_display_remove_suffix();
457 :
458 : }
459 :
460 : /*
461 : * Generate whatever recovery conflicts are needed to eliminate snapshots that
462 : * might see XIDs <= snapshotConflictHorizon as still running.
463 : *
464 : * snapshotConflictHorizon cutoffs are our standard approach to generating
465 : * granular recovery conflicts. Note that InvalidTransactionId values are
466 : * interpreted as "definitely don't need any conflicts" here, which is a
467 : * general convention that WAL records can (and often do) depend on.
468 : */
469 : void
470 18005 : ResolveRecoveryConflictWithSnapshot(TransactionId snapshotConflictHorizon,
471 : bool isCatalogRel,
472 : RelFileLocator locator)
473 : {
474 : VirtualTransactionId *backends;
475 :
476 : /*
477 : * If we get passed InvalidTransactionId then we do nothing (no conflict).
478 : *
479 : * This can happen when replaying already-applied WAL records after a
480 : * standby crash or restart, or when replaying an XLOG_HEAP2_VISIBLE
481 : * record that marks as frozen a page which was already all-visible. It's
482 : * also quite common with records generated during index deletion
483 : * (original execution of the deletion can reason that a recovery conflict
484 : * which is sufficient for the deletion operation must take place before
485 : * replay of the deletion record itself).
486 : */
487 18005 : if (!TransactionIdIsValid(snapshotConflictHorizon))
488 2963 : return;
489 :
490 : Assert(TransactionIdIsNormal(snapshotConflictHorizon));
491 15042 : backends = GetConflictingVirtualXIDs(snapshotConflictHorizon,
492 : locator.dbOid);
493 15042 : ResolveRecoveryConflictWithVirtualXIDs(backends,
494 : RECOVERY_CONFLICT_SNAPSHOT,
495 : WAIT_EVENT_RECOVERY_CONFLICT_SNAPSHOT,
496 : true);
497 :
498 : /*
499 : * Note that WaitExceedsMaxStandbyDelay() is not taken into account here
500 : * (as opposed to ResolveRecoveryConflictWithVirtualXIDs() above). That
501 : * seems OK, given that this kind of conflict should not normally be
502 : * reached, e.g. due to using a physical replication slot.
503 : */
504 15042 : if (IsLogicalDecodingEnabled() && isCatalogRel)
505 17 : InvalidateObsoleteReplicationSlots(RS_INVAL_HORIZON, 0, locator.dbOid,
506 : snapshotConflictHorizon);
507 : }
508 :
509 : /*
510 : * Variant of ResolveRecoveryConflictWithSnapshot that works with
511 : * FullTransactionId values
512 : */
513 : void
514 75 : ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId snapshotConflictHorizon,
515 : bool isCatalogRel,
516 : RelFileLocator locator)
517 : {
518 : /*
519 : * ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds,
520 : * so truncate the logged FullTransactionId. If the logged value is very
521 : * old, so that XID wrap-around already happened on it, there can't be any
522 : * snapshots that still see it.
523 : */
524 75 : FullTransactionId nextXid = ReadNextFullTransactionId();
525 : uint64 diff;
526 :
527 75 : diff = U64FromFullTransactionId(nextXid) -
528 75 : U64FromFullTransactionId(snapshotConflictHorizon);
529 75 : if (diff < MaxTransactionId / 2)
530 : {
531 : TransactionId truncated;
532 :
533 75 : truncated = XidFromFullTransactionId(snapshotConflictHorizon);
534 75 : ResolveRecoveryConflictWithSnapshot(truncated,
535 : isCatalogRel,
536 : locator);
537 : }
538 75 : }
539 :
540 : void
541 1 : ResolveRecoveryConflictWithTablespace(Oid tsid)
542 : {
543 : VirtualTransactionId *temp_file_users;
544 :
545 : /*
546 : * Standby users may be currently using this tablespace for their
547 : * temporary files. We only care about current users because
548 : * temp_tablespace parameter will just ignore tablespaces that no longer
549 : * exist.
550 : *
551 : * Ask everybody to cancel their queries immediately so we can ensure no
552 : * temp files remain and we can remove the tablespace. Nuke the entire
553 : * site from orbit, it's the only way to be sure.
554 : *
555 : * XXX: We could work out the pids of active backends using this
556 : * tablespace by examining the temp filenames in the directory. We would
557 : * then convert the pids into VirtualXIDs before attempting to cancel
558 : * them.
559 : *
560 : * We don't wait for commit because drop tablespace is non-transactional.
561 : */
562 1 : temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId,
563 : InvalidOid);
564 1 : ResolveRecoveryConflictWithVirtualXIDs(temp_file_users,
565 : RECOVERY_CONFLICT_TABLESPACE,
566 : WAIT_EVENT_RECOVERY_CONFLICT_TABLESPACE,
567 : true);
568 1 : }
569 :
570 : void
571 14 : ResolveRecoveryConflictWithDatabase(Oid dbid)
572 : {
573 : /*
574 : * We don't do ResolveRecoveryConflictWithVirtualXIDs() here since that
575 : * only waits for transactions and completely idle sessions would block
576 : * us. This is rare enough that we do this as simply as possible: no wait,
577 : * just force them off immediately.
578 : *
579 : * No locking is required here because we already acquired
580 : * AccessExclusiveLock. Anybody trying to connect while we do this will
581 : * block during InitPostgres() and then disconnect when they see the
582 : * database has been removed.
583 : */
584 16 : while (CountDBBackends(dbid) > 0)
585 : {
586 2 : SignalRecoveryConflictWithDatabase(dbid, RECOVERY_CONFLICT_DATABASE);
587 :
588 : /*
589 : * Wait awhile for them to die so that we avoid flooding an
590 : * unresponsive backend when system is heavily loaded.
591 : */
592 2 : pg_usleep(10000);
593 : }
594 14 : }
595 :
596 : /*
597 : * ResolveRecoveryConflictWithLock is called from ProcSleep()
598 : * to resolve conflicts with other backends holding relation locks.
599 : *
600 : * The WaitLatch sleep normally done in ProcSleep()
601 : * (when not InHotStandby) is performed here, for code clarity.
602 : *
603 : * We either resolve conflicts immediately or set a timeout to wake us at
604 : * the limit of our patience.
605 : *
606 : * Resolve conflicts by canceling to all backends holding a conflicting
607 : * lock. As we are already queued to be granted the lock, no new lock
608 : * requests conflicting with ours will be granted in the meantime.
609 : *
610 : * We also must check for deadlocks involving the Startup process and
611 : * hot-standby backend processes. If deadlock_timeout is reached in
612 : * this function, all the backends holding the conflicting locks are
613 : * requested to check themselves for deadlocks.
614 : *
615 : * logging_conflict should be true if the recovery conflict has not been
616 : * logged yet even though logging is enabled. After deadlock_timeout is
617 : * reached and the request for deadlock check is sent, we wait again to
618 : * be signaled by the release of the lock if logging_conflict is false.
619 : * Otherwise we return without waiting again so that the caller can report
620 : * the recovery conflict. In this case, then, this function is called again
621 : * with logging_conflict=false (because the recovery conflict has already
622 : * been logged) and we will wait again for the lock to be released.
623 : */
624 : void
625 3 : ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict)
626 : {
627 : TimestampTz ltime;
628 : TimestampTz now;
629 :
630 : Assert(InHotStandby);
631 :
632 3 : ltime = GetStandbyLimitTime();
633 3 : now = GetCurrentTimestamp();
634 :
635 : /*
636 : * Update waitStart if first time through after the startup process
637 : * started waiting for the lock. It should not be updated every time
638 : * ResolveRecoveryConflictWithLock() is called during the wait.
639 : *
640 : * Use the current time obtained for comparison with ltime as waitStart
641 : * (i.e., the time when this process started waiting for the lock). Since
642 : * getting the current time newly can cause overhead, we reuse the
643 : * already-obtained time to avoid that overhead.
644 : *
645 : * Note that waitStart is updated without holding the lock table's
646 : * partition lock, to avoid the overhead by additional lock acquisition.
647 : * This can cause "waitstart" in pg_locks to become NULL for a very short
648 : * period of time after the wait started even though "granted" is false.
649 : * This is OK in practice because we can assume that users are likely to
650 : * look at "waitstart" when waiting for the lock for a long time.
651 : */
652 3 : if (pg_atomic_read_u64(&MyProc->waitStart) == 0)
653 1 : pg_atomic_write_u64(&MyProc->waitStart, now);
654 :
655 3 : if (now >= ltime && ltime != 0)
656 1 : {
657 : /*
658 : * We're already behind, so clear a path as quickly as possible.
659 : */
660 : VirtualTransactionId *backends;
661 :
662 1 : backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
663 :
664 : /*
665 : * Prevent ResolveRecoveryConflictWithVirtualXIDs() from reporting
666 : * "waiting" in PS display by disabling its argument report_waiting
667 : * because the caller, WaitOnLock(), has already reported that.
668 : */
669 1 : ResolveRecoveryConflictWithVirtualXIDs(backends,
670 : RECOVERY_CONFLICT_LOCK,
671 1 : PG_WAIT_LOCK | locktag.locktag_type,
672 : false);
673 : }
674 : else
675 : {
676 : /*
677 : * Wait (or wait again) until ltime, and check for deadlocks as well
678 : * if we will be waiting longer than deadlock_timeout
679 : */
680 : EnableTimeoutParams timeouts[2];
681 2 : int cnt = 0;
682 :
683 2 : if (ltime != 0)
684 : {
685 2 : got_standby_lock_timeout = false;
686 2 : timeouts[cnt].id = STANDBY_LOCK_TIMEOUT;
687 2 : timeouts[cnt].type = TMPARAM_AT;
688 2 : timeouts[cnt].fin_time = ltime;
689 2 : cnt++;
690 : }
691 :
692 2 : got_standby_deadlock_timeout = false;
693 2 : timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
694 2 : timeouts[cnt].type = TMPARAM_AFTER;
695 2 : timeouts[cnt].delay_ms = DeadlockTimeout;
696 2 : cnt++;
697 :
698 2 : enable_timeouts(timeouts, cnt);
699 : }
700 :
701 : /* Wait to be signaled by the release of the Relation Lock */
702 3 : ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
703 :
704 : /*
705 : * Exit if ltime is reached. Then all the backends holding conflicting
706 : * locks will be canceled in the next ResolveRecoveryConflictWithLock()
707 : * call.
708 : */
709 3 : if (got_standby_lock_timeout)
710 0 : goto cleanup;
711 :
712 3 : if (got_standby_deadlock_timeout)
713 : {
714 : VirtualTransactionId *backends;
715 :
716 2 : backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
717 :
718 : /* Quick exit if there's no work to be done */
719 2 : if (!VirtualTransactionIdIsValid(*backends))
720 0 : goto cleanup;
721 :
722 : /*
723 : * Send signals to all the backends holding the conflicting locks, to
724 : * ask them to check themselves for deadlocks.
725 : */
726 4 : while (VirtualTransactionIdIsValid(*backends))
727 : {
728 2 : (void) SignalRecoveryConflictWithVirtualXID(*backends,
729 : RECOVERY_CONFLICT_STARTUP_DEADLOCK);
730 2 : backends++;
731 : }
732 :
733 : /*
734 : * Exit if the recovery conflict has not been logged yet even though
735 : * logging is enabled, so that the caller can log that. Then
736 : * RecoveryConflictWithLock() is called again and we will wait again
737 : * for the lock to be released.
738 : */
739 2 : if (logging_conflict)
740 1 : goto cleanup;
741 :
742 : /*
743 : * Wait again here to be signaled by the release of the Relation Lock,
744 : * to prevent the subsequent RecoveryConflictWithLock() from causing
745 : * deadlock_timeout and sending a request for deadlocks check again.
746 : * Otherwise the request continues to be sent every deadlock_timeout
747 : * until the relation locks are released or ltime is reached.
748 : */
749 1 : got_standby_deadlock_timeout = false;
750 1 : ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
751 : }
752 :
753 1 : cleanup:
754 :
755 : /*
756 : * Clear any timeout requests established above. We assume here that the
757 : * Startup process doesn't have any other outstanding timeouts than those
758 : * used by this function. If that stops being true, we could cancel the
759 : * timeouts individually, but that'd be slower.
760 : */
761 3 : disable_all_timeouts(false);
762 3 : got_standby_lock_timeout = false;
763 3 : got_standby_deadlock_timeout = false;
764 3 : }
765 :
766 : /*
767 : * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup()
768 : * to resolve conflicts with other backends holding buffer pins.
769 : *
770 : * The ProcWaitForSignal() sleep normally done in LockBufferForCleanup()
771 : * (when not InHotStandby) is performed here, for code clarity.
772 : *
773 : * We either resolve conflicts immediately or set a timeout to wake us at
774 : * the limit of our patience.
775 : *
776 : * Resolve conflicts by sending a PROCSIG signal to all backends to check if
777 : * they hold one of the buffer pins that is blocking Startup process. If so,
778 : * those backends will take an appropriate error action, ERROR or FATAL.
779 : *
780 : * We also must check for deadlocks. Deadlocks occur because if queries
781 : * wait on a lock, that must be behind an AccessExclusiveLock, which can only
782 : * be cleared if the Startup process replays a transaction completion record.
783 : * If Startup process is also waiting then that is a deadlock. The deadlock
784 : * can occur if the query is waiting and then the Startup sleeps, or if
785 : * Startup is sleeping and the query waits on a lock. We protect against
786 : * only the former sequence here, the latter sequence is checked prior to
787 : * the query sleeping, in CheckRecoveryConflictDeadlock().
788 : *
789 : * Deadlocks are extremely rare, and relatively expensive to check for,
790 : * so we don't do a deadlock check right away ... only if we have had to wait
791 : * at least deadlock_timeout.
792 : */
793 : void
794 9 : ResolveRecoveryConflictWithBufferPin(void)
795 : {
796 : TimestampTz ltime;
797 :
798 : Assert(InHotStandby);
799 :
800 9 : ltime = GetStandbyLimitTime();
801 :
802 9 : if (GetCurrentTimestamp() >= ltime && ltime != 0)
803 : {
804 : /*
805 : * We're already behind, so clear a path as quickly as possible.
806 : */
807 1 : SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN);
808 : }
809 : else
810 : {
811 : /*
812 : * Wake up at ltime, and check for deadlocks as well if we will be
813 : * waiting longer than deadlock_timeout
814 : */
815 : EnableTimeoutParams timeouts[2];
816 8 : int cnt = 0;
817 :
818 8 : if (ltime != 0)
819 : {
820 8 : timeouts[cnt].id = STANDBY_TIMEOUT;
821 8 : timeouts[cnt].type = TMPARAM_AT;
822 8 : timeouts[cnt].fin_time = ltime;
823 8 : cnt++;
824 : }
825 :
826 8 : got_standby_deadlock_timeout = false;
827 8 : timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
828 8 : timeouts[cnt].type = TMPARAM_AFTER;
829 8 : timeouts[cnt].delay_ms = DeadlockTimeout;
830 8 : cnt++;
831 :
832 8 : enable_timeouts(timeouts, cnt);
833 : }
834 :
835 : /*
836 : * Wait to be signaled by UnpinBuffer() or for the wait to be interrupted
837 : * by one of the timeouts established above.
838 : *
839 : * We assume that only UnpinBuffer() and the timeout requests established
840 : * above can wake us up here. WakeupRecovery() called by walreceiver or
841 : * SIGHUP signal handler, etc cannot do that because it uses the different
842 : * latch from that ProcWaitForSignal() waits on.
843 : */
844 9 : ProcWaitForSignal(WAIT_EVENT_BUFFER_CLEANUP);
845 :
846 9 : if (got_standby_delay_timeout)
847 1 : SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN);
848 8 : else if (got_standby_deadlock_timeout)
849 : {
850 : /*
851 : * Send out a request for hot-standby backends to check themselves for
852 : * deadlocks.
853 : *
854 : * XXX The subsequent ResolveRecoveryConflictWithBufferPin() will wait
855 : * to be signaled by UnpinBuffer() again and send a request for
856 : * deadlocks check if deadlock_timeout happens. This causes the
857 : * request to continue to be sent every deadlock_timeout until the
858 : * buffer is unpinned or ltime is reached. This would increase the
859 : * workload in the startup process and backends. In practice it may
860 : * not be so harmful because the period that the buffer is kept pinned
861 : * is basically not so long. But we should fix this?
862 : */
863 5 : SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK);
864 : }
865 :
866 : /*
867 : * Clear any timeout requests established above. We assume here that the
868 : * Startup process doesn't have any other timeouts than what this function
869 : * uses. If that stops being true, we could cancel the timeouts
870 : * individually, but that'd be slower.
871 : */
872 9 : disable_all_timeouts(false);
873 9 : got_standby_delay_timeout = false;
874 9 : got_standby_deadlock_timeout = false;
875 9 : }
876 :
877 : static void
878 7 : SendRecoveryConflictWithBufferPin(RecoveryConflictReason reason)
879 : {
880 : Assert(reason == RECOVERY_CONFLICT_BUFFERPIN ||
881 : reason == RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK);
882 :
883 : /*
884 : * We send signal to all backends to ask them if they are holding the
885 : * buffer pin which is delaying the Startup process. Most of them will be
886 : * innocent, but we let the SIGUSR1 handling in each backend decide their
887 : * own fate.
888 : */
889 7 : SignalRecoveryConflictWithDatabase(InvalidOid, reason);
890 7 : }
891 :
892 : /*
893 : * In Hot Standby perform early deadlock detection. We abort the lock
894 : * wait if we are about to sleep while holding the buffer pin that Startup
895 : * process is waiting for.
896 : *
897 : * Note: this code is pessimistic, because there is no way for it to
898 : * determine whether an actual deadlock condition is present: the lock we
899 : * need to wait for might be unrelated to any held by the Startup process.
900 : * Sooner or later, this mechanism should get ripped out in favor of somehow
901 : * accounting for buffer locks in DeadLockCheck(). However, errors here
902 : * seem to be very low-probability in practice, so for now it's not worth
903 : * the trouble.
904 : */
905 : void
906 1 : CheckRecoveryConflictDeadlock(void)
907 : {
908 : Assert(!InRecovery); /* do not call in Startup process */
909 :
910 1 : if (!HoldingBufferPinThatDelaysRecovery())
911 1 : return;
912 :
913 : /*
914 : * Error message should match ProcessInterrupts() but we avoid calling
915 : * that because we aren't handling an interrupt at this point. Note that
916 : * we only cancel the current transaction here, so if we are in a
917 : * subtransaction and the pin is held by a parent, then the Startup
918 : * process will continue to wait even though we have avoided deadlock.
919 : */
920 0 : ereport(ERROR,
921 : (errcode(ERRCODE_T_R_DEADLOCK_DETECTED),
922 : errmsg("canceling statement due to conflict with recovery"),
923 : errdetail("User transaction caused buffer deadlock with recovery.")));
924 : }
925 :
926 :
927 : /* --------------------------------
928 : * timeout handler routines
929 : * --------------------------------
930 : */
931 :
932 : /*
933 : * StandbyDeadLockHandler() will be called if STANDBY_DEADLOCK_TIMEOUT is
934 : * exceeded.
935 : */
936 : void
937 7 : StandbyDeadLockHandler(void)
938 : {
939 7 : got_standby_deadlock_timeout = true;
940 7 : }
941 :
942 : /*
943 : * StandbyTimeoutHandler() will be called if STANDBY_TIMEOUT is exceeded.
944 : */
945 : void
946 1 : StandbyTimeoutHandler(void)
947 : {
948 1 : got_standby_delay_timeout = true;
949 1 : }
950 :
951 : /*
952 : * StandbyLockTimeoutHandler() will be called if STANDBY_LOCK_TIMEOUT is exceeded.
953 : */
954 : void
955 1 : StandbyLockTimeoutHandler(void)
956 : {
957 1 : got_standby_lock_timeout = true;
958 1 : }
959 :
960 : /*
961 : * -----------------------------------------------------
962 : * Locking in Recovery Mode
963 : * -----------------------------------------------------
964 : *
965 : * All locks are held by the Startup process using a single virtual
966 : * transaction. This implementation is both simpler and in some senses,
967 : * more correct. The locks held mean "some original transaction held
968 : * this lock, so query access is not allowed at this time". So the Startup
969 : * process is the proxy by which the original locks are implemented.
970 : *
971 : * We only keep track of AccessExclusiveLocks, which are only ever held by
972 : * one transaction on one relation.
973 : *
974 : * We keep a table of known locks in the RecoveryLockHash hash table.
975 : * The point of that table is to let us efficiently de-duplicate locks,
976 : * which is important because checkpoints will re-report the same locks
977 : * already held. There is also a RecoveryLockXidHash table with one entry
978 : * per xid, which allows us to efficiently find all the locks held by a
979 : * given original transaction.
980 : *
981 : * We use session locks rather than normal locks so we don't need
982 : * ResourceOwners.
983 : */
984 :
985 :
986 : void
987 27567 : StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
988 : {
989 : RecoveryLockXidEntry *xidentry;
990 : RecoveryLockEntry *lockentry;
991 : xl_standby_lock key;
992 : LOCKTAG locktag;
993 : bool found;
994 :
995 : /* Already processed? */
996 55134 : if (!TransactionIdIsValid(xid) ||
997 55119 : TransactionIdDidCommit(xid) ||
998 27552 : TransactionIdDidAbort(xid))
999 15 : return;
1000 :
1001 27552 : elog(DEBUG4, "adding recovery lock: db %u rel %u", dbOid, relOid);
1002 :
1003 : /* dbOid is InvalidOid when we are locking a shared relation. */
1004 : Assert(OidIsValid(relOid));
1005 :
1006 : /* Create a hash entry for this xid, if we don't have one already. */
1007 27552 : xidentry = hash_search(RecoveryLockXidHash, &xid, HASH_ENTER, &found);
1008 27552 : if (!found)
1009 : {
1010 : Assert(xidentry->xid == xid); /* dynahash should have set this */
1011 11344 : xidentry->head = NULL;
1012 : }
1013 :
1014 : /* Create a hash entry for this lock, unless we have one already. */
1015 27552 : key.xid = xid;
1016 27552 : key.dbOid = dbOid;
1017 27552 : key.relOid = relOid;
1018 27552 : lockentry = hash_search(RecoveryLockHash, &key, HASH_ENTER, &found);
1019 27552 : if (!found)
1020 : {
1021 : /* It's new, so link it into the XID's list ... */
1022 26471 : lockentry->next = xidentry->head;
1023 26471 : xidentry->head = lockentry;
1024 :
1025 : /* ... and acquire the lock locally. */
1026 26471 : SET_LOCKTAG_RELATION(locktag, dbOid, relOid);
1027 :
1028 26471 : (void) LockAcquire(&locktag, AccessExclusiveLock, true, false);
1029 : }
1030 : }
1031 :
1032 : /*
1033 : * Release all the locks associated with this RecoveryLockXidEntry.
1034 : */
1035 : static void
1036 11344 : StandbyReleaseXidEntryLocks(RecoveryLockXidEntry *xidentry)
1037 : {
1038 : RecoveryLockEntry *entry;
1039 : RecoveryLockEntry *next;
1040 :
1041 37815 : for (entry = xidentry->head; entry != NULL; entry = next)
1042 : {
1043 : LOCKTAG locktag;
1044 :
1045 26471 : elog(DEBUG4,
1046 : "releasing recovery lock: xid %u db %u rel %u",
1047 : entry->key.xid, entry->key.dbOid, entry->key.relOid);
1048 : /* Release the lock ... */
1049 26471 : SET_LOCKTAG_RELATION(locktag, entry->key.dbOid, entry->key.relOid);
1050 26471 : if (!LockRelease(&locktag, AccessExclusiveLock, true))
1051 : {
1052 0 : elog(LOG,
1053 : "RecoveryLockHash contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
1054 : entry->key.xid, entry->key.dbOid, entry->key.relOid);
1055 : Assert(false);
1056 : }
1057 : /* ... and remove the per-lock hash entry */
1058 26471 : next = entry->next;
1059 26471 : hash_search(RecoveryLockHash, entry, HASH_REMOVE, NULL);
1060 : }
1061 :
1062 11344 : xidentry->head = NULL; /* just for paranoia */
1063 11344 : }
1064 :
1065 : /*
1066 : * Release locks for specific XID, or all locks if it's InvalidXid.
1067 : */
1068 : static void
1069 12024 : StandbyReleaseLocks(TransactionId xid)
1070 : {
1071 : RecoveryLockXidEntry *entry;
1072 :
1073 12024 : if (TransactionIdIsValid(xid))
1074 : {
1075 12024 : if ((entry = hash_search(RecoveryLockXidHash, &xid, HASH_FIND, NULL)))
1076 : {
1077 11344 : StandbyReleaseXidEntryLocks(entry);
1078 11344 : hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
1079 : }
1080 : }
1081 : else
1082 0 : StandbyReleaseAllLocks();
1083 12024 : }
1084 :
1085 : /*
1086 : * Release locks for a transaction tree, starting at xid down, from
1087 : * RecoveryLockXidHash.
1088 : *
1089 : * Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode,
1090 : * to remove any AccessExclusiveLocks requested by a transaction.
1091 : */
1092 : void
1093 11524 : StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids)
1094 : {
1095 : int i;
1096 :
1097 11524 : StandbyReleaseLocks(xid);
1098 :
1099 12024 : for (i = 0; i < nsubxids; i++)
1100 500 : StandbyReleaseLocks(subxids[i]);
1101 11524 : }
1102 :
1103 : /*
1104 : * Called at end of recovery and when we see a shutdown checkpoint.
1105 : */
1106 : void
1107 115 : StandbyReleaseAllLocks(void)
1108 : {
1109 : HASH_SEQ_STATUS status;
1110 : RecoveryLockXidEntry *entry;
1111 :
1112 115 : elog(DEBUG2, "release all standby locks");
1113 :
1114 115 : hash_seq_init(&status, RecoveryLockXidHash);
1115 115 : while ((entry = hash_seq_search(&status)))
1116 : {
1117 0 : StandbyReleaseXidEntryLocks(entry);
1118 0 : hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
1119 : }
1120 115 : }
1121 :
1122 : /*
1123 : * StandbyReleaseOldLocks
1124 : * Release standby locks held by top-level XIDs that aren't running,
1125 : * as long as they're not prepared transactions.
1126 : *
1127 : * This is needed to prune the locks of crashed transactions, which didn't
1128 : * write an ABORT/COMMIT record.
1129 : */
1130 : void
1131 834 : StandbyReleaseOldLocks(TransactionId oldxid)
1132 : {
1133 : HASH_SEQ_STATUS status;
1134 : RecoveryLockXidEntry *entry;
1135 :
1136 834 : hash_seq_init(&status, RecoveryLockXidHash);
1137 1135 : while ((entry = hash_seq_search(&status)))
1138 : {
1139 : Assert(TransactionIdIsValid(entry->xid));
1140 :
1141 : /* Skip if prepared transaction. */
1142 301 : if (StandbyTransactionIdIsPrepared(entry->xid))
1143 0 : continue;
1144 :
1145 : /* Skip if >= oldxid. */
1146 301 : if (!TransactionIdPrecedes(entry->xid, oldxid))
1147 301 : continue;
1148 :
1149 : /* Remove all locks and hash table entry. */
1150 0 : StandbyReleaseXidEntryLocks(entry);
1151 0 : hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
1152 : }
1153 834 : }
1154 :
1155 : /*
1156 : * --------------------------------------------------------------------
1157 : * Recovery handling for Rmgr RM_STANDBY_ID
1158 : *
1159 : * These record types will only be created if XLogStandbyInfoActive()
1160 : * --------------------------------------------------------------------
1161 : */
1162 :
1163 : void
1164 28241 : standby_redo(XLogReaderState *record)
1165 : {
1166 28241 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
1167 :
1168 : /* Backup blocks are not used in standby records */
1169 : Assert(!XLogRecHasAnyBlockRefs(record));
1170 :
1171 : /* Do nothing if we're not in hot standby mode */
1172 28241 : if (standbyState == STANDBY_DISABLED)
1173 154 : return;
1174 :
1175 28087 : if (info == XLOG_STANDBY_LOCK)
1176 : {
1177 26643 : xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record);
1178 : int i;
1179 :
1180 54210 : for (i = 0; i < xlrec->nlocks; i++)
1181 27567 : StandbyAcquireAccessExclusiveLock(xlrec->locks[i].xid,
1182 : xlrec->locks[i].dbOid,
1183 : xlrec->locks[i].relOid);
1184 : }
1185 1444 : else if (info == XLOG_RUNNING_XACTS)
1186 : {
1187 768 : xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record);
1188 : RunningTransactionsData running;
1189 :
1190 768 : running.xcnt = xlrec->xcnt;
1191 768 : running.subxcnt = xlrec->subxcnt;
1192 768 : running.subxid_status = xlrec->subxid_overflow ? SUBXIDS_MISSING : SUBXIDS_IN_ARRAY;
1193 768 : running.nextXid = xlrec->nextXid;
1194 768 : running.latestCompletedXid = xlrec->latestCompletedXid;
1195 768 : running.oldestRunningXid = xlrec->oldestRunningXid;
1196 768 : running.xids = xlrec->xids;
1197 :
1198 768 : ProcArrayApplyRecoveryInfo(&running);
1199 :
1200 : /*
1201 : * The startup process currently has no convenient way to schedule
1202 : * stats to be reported. XLOG_RUNNING_XACTS records issued at a
1203 : * regular cadence, making this a convenient location to report stats.
1204 : * While these records aren't generated with wal_level=minimal, stats
1205 : * also cannot be accessed during WAL replay.
1206 : */
1207 768 : pgstat_report_stat(true);
1208 : }
1209 676 : else if (info == XLOG_INVALIDATIONS)
1210 : {
1211 676 : xl_invalidations *xlrec = (xl_invalidations *) XLogRecGetData(record);
1212 :
1213 676 : ProcessCommittedInvalidationMessages(xlrec->msgs,
1214 : xlrec->nmsgs,
1215 676 : xlrec->relcacheInitFileInval,
1216 : xlrec->dbId,
1217 : xlrec->tsId);
1218 : }
1219 : else
1220 0 : elog(PANIC, "standby_redo: unknown op code %u", info);
1221 : }
1222 :
1223 : /*
1224 : * Log details of the current snapshot to WAL. This allows the snapshot state
1225 : * to be reconstructed on the standby and for logical decoding.
1226 : *
1227 : * This is used for Hot Standby as follows:
1228 : *
1229 : * We can move directly to STANDBY_SNAPSHOT_READY at startup if we
1230 : * start from a shutdown checkpoint because we know nothing was running
1231 : * at that time and our recovery snapshot is known empty. In the more
1232 : * typical case of an online checkpoint we need to jump through a few
1233 : * hoops to get a correct recovery snapshot and this requires a two or
1234 : * sometimes a three stage process.
1235 : *
1236 : * The initial snapshot must contain all running xids and all current
1237 : * AccessExclusiveLocks at a point in time on the standby. Assembling
1238 : * that information while the server is running requires many and
1239 : * various LWLocks, so we choose to derive that information piece by
1240 : * piece and then re-assemble that info on the standby. When that
1241 : * information is fully assembled we move to STANDBY_SNAPSHOT_READY.
1242 : *
1243 : * Since locking on the primary when we derive the information is not
1244 : * strict, we note that there is a time window between the derivation and
1245 : * writing to WAL of the derived information. That allows race conditions
1246 : * that we must resolve, since xids and locks may enter or leave the
1247 : * snapshot during that window. This creates the issue that an xid or
1248 : * lock may start *after* the snapshot has been derived yet *before* the
1249 : * snapshot is logged in the running xacts WAL record. We resolve this by
1250 : * starting to accumulate changes at a point just prior to when we derive
1251 : * the snapshot on the primary, then ignore duplicates when we later apply
1252 : * the snapshot from the running xacts record. This is implemented during
1253 : * CreateCheckPoint() where we use the logical checkpoint location as
1254 : * our starting point and then write the running xacts record immediately
1255 : * before writing the main checkpoint WAL record. Since we always start
1256 : * up from a checkpoint and are immediately at our starting point, we
1257 : * unconditionally move to STANDBY_INITIALIZED. After this point we
1258 : * must do 4 things:
1259 : * * move shared nextXid forwards as we see new xids
1260 : * * extend the clog and subtrans with each new xid
1261 : * * keep track of uncommitted known assigned xids
1262 : * * keep track of uncommitted AccessExclusiveLocks
1263 : *
1264 : * When we see a commit/abort we must remove known assigned xids and locks
1265 : * from the completing transaction. Attempted removals that cannot locate
1266 : * an entry are expected and must not cause an error when we are in state
1267 : * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and
1268 : * KnownAssignedXidsRemove().
1269 : *
1270 : * Later, when we apply the running xact data we must be careful to ignore
1271 : * transactions already committed, since those commits raced ahead when
1272 : * making WAL entries.
1273 : *
1274 : * For logical decoding only the running xacts information is needed;
1275 : * there's no need to look at the locking information, but it's logged anyway,
1276 : * as there's no independent knob to just enable logical decoding. For
1277 : * details of how this is used, check snapbuild.c's introductory comment.
1278 : *
1279 : *
1280 : * Returns the RecPtr of the last inserted record.
1281 : */
1282 : XLogRecPtr
1283 1461 : LogStandbySnapshot(void)
1284 : {
1285 : XLogRecPtr recptr;
1286 : RunningTransactions running;
1287 : xl_standby_lock *locks;
1288 : int nlocks;
1289 1461 : bool logical_decoding_enabled = IsLogicalDecodingEnabled();
1290 :
1291 : Assert(XLogStandbyInfoActive());
1292 :
1293 : #ifdef USE_INJECTION_POINTS
1294 1461 : if (IS_INJECTION_POINT_ATTACHED("skip-log-running-xacts"))
1295 : {
1296 : /*
1297 : * This record could move slot's xmin forward during decoding, leading
1298 : * to unpredictable results, so skip it when requested by the test.
1299 : */
1300 1 : return GetInsertRecPtr();
1301 : }
1302 : #endif
1303 :
1304 : /*
1305 : * Get details of any AccessExclusiveLocks being held at the moment.
1306 : */
1307 1460 : locks = GetRunningTransactionLocks(&nlocks);
1308 1460 : if (nlocks > 0)
1309 163 : LogAccessExclusiveLocks(nlocks, locks);
1310 1460 : pfree(locks);
1311 :
1312 : /*
1313 : * Log details of all in-progress transactions. This should be the last
1314 : * record we write, because standby will open up when it sees this.
1315 : */
1316 1460 : running = GetRunningTransactionData();
1317 :
1318 : /*
1319 : * GetRunningTransactionData() acquired ProcArrayLock, we must release it.
1320 : * For Hot Standby this can be done before inserting the WAL record
1321 : * because ProcArrayApplyRecoveryInfo() rechecks the commit status using
1322 : * the clog. For logical decoding, though, the lock can't be released
1323 : * early because the clog might be "in the future" from the POV of the
1324 : * historic snapshot. This would allow for situations where we're waiting
1325 : * for the end of a transaction listed in the xl_running_xacts record
1326 : * which, according to the WAL, has committed before the xl_running_xacts
1327 : * record. Fortunately this routine isn't executed frequently, and it's
1328 : * only a shared lock.
1329 : */
1330 1460 : if (!logical_decoding_enabled)
1331 910 : LWLockRelease(ProcArrayLock);
1332 :
1333 1460 : recptr = LogCurrentRunningXacts(running);
1334 :
1335 : /* Release lock if we kept it longer ... */
1336 1460 : if (logical_decoding_enabled)
1337 550 : LWLockRelease(ProcArrayLock);
1338 :
1339 : /* GetRunningTransactionData() acquired XidGenLock, we must release it */
1340 1460 : LWLockRelease(XidGenLock);
1341 :
1342 1460 : return recptr;
1343 : }
1344 :
1345 : /*
1346 : * Record an enhanced snapshot of running transactions into WAL.
1347 : *
1348 : * The definitions of RunningTransactionsData and xl_running_xacts are
1349 : * similar. We keep them separate because xl_running_xacts is a contiguous
1350 : * chunk of memory and never exists fully until it is assembled in WAL.
1351 : * The inserted records are marked as not being important for durability,
1352 : * to avoid triggering superfluous checkpoint / archiving activity.
1353 : */
1354 : static XLogRecPtr
1355 1460 : LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
1356 : {
1357 : xl_running_xacts xlrec;
1358 : XLogRecPtr recptr;
1359 :
1360 1460 : xlrec.xcnt = CurrRunningXacts->xcnt;
1361 1460 : xlrec.subxcnt = CurrRunningXacts->subxcnt;
1362 1460 : xlrec.subxid_overflow = (CurrRunningXacts->subxid_status != SUBXIDS_IN_ARRAY);
1363 1460 : xlrec.nextXid = CurrRunningXacts->nextXid;
1364 1460 : xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
1365 1460 : xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid;
1366 :
1367 : /* Header */
1368 1460 : XLogBeginInsert();
1369 1460 : XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
1370 1460 : XLogRegisterData(&xlrec, MinSizeOfXactRunningXacts);
1371 :
1372 : /* array of TransactionIds */
1373 1460 : if (xlrec.xcnt > 0)
1374 475 : XLogRegisterData(CurrRunningXacts->xids,
1375 475 : (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId));
1376 :
1377 1460 : recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS);
1378 :
1379 1460 : if (xlrec.subxid_overflow)
1380 1 : elog(DEBUG2,
1381 : "snapshot of %d running transactions overflowed (lsn %X/%08X oldest xid %u latest complete %u next xid %u)",
1382 : CurrRunningXacts->xcnt,
1383 : LSN_FORMAT_ARGS(recptr),
1384 : CurrRunningXacts->oldestRunningXid,
1385 : CurrRunningXacts->latestCompletedXid,
1386 : CurrRunningXacts->nextXid);
1387 : else
1388 1459 : elog(DEBUG2,
1389 : "snapshot of %d+%d running transaction ids (lsn %X/%08X oldest xid %u latest complete %u next xid %u)",
1390 : CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt,
1391 : LSN_FORMAT_ARGS(recptr),
1392 : CurrRunningXacts->oldestRunningXid,
1393 : CurrRunningXacts->latestCompletedXid,
1394 : CurrRunningXacts->nextXid);
1395 :
1396 : /*
1397 : * Ensure running_xacts information is synced to disk not too far in the
1398 : * future. We don't want to stall anything though (i.e. use XLogFlush()),
1399 : * so we let the wal writer do it during normal operation.
1400 : * XLogSetAsyncXactLSN() conveniently will mark the LSN as to-be-synced
1401 : * and nudge the WALWriter into action if sleeping. Check
1402 : * XLogBackgroundFlush() for details why a record might not be flushed
1403 : * without it.
1404 : */
1405 1460 : XLogSetAsyncXactLSN(recptr);
1406 :
1407 1460 : return recptr;
1408 : }
1409 :
1410 : /*
1411 : * Wholesale logging of AccessExclusiveLocks. Other lock types need not be
1412 : * logged, as described in backend/storage/lmgr/README.
1413 : */
1414 : static void
1415 136815 : LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks)
1416 : {
1417 : xl_standby_locks xlrec;
1418 :
1419 136815 : xlrec.nlocks = nlocks;
1420 :
1421 136815 : XLogBeginInsert();
1422 136815 : XLogRegisterData(&xlrec, offsetof(xl_standby_locks, locks));
1423 136815 : XLogRegisterData(locks, nlocks * sizeof(xl_standby_lock));
1424 136815 : XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
1425 :
1426 136815 : (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK);
1427 136815 : }
1428 :
1429 : /*
1430 : * Individual logging of AccessExclusiveLocks for use during LockAcquire()
1431 : */
1432 : void
1433 136652 : LogAccessExclusiveLock(Oid dbOid, Oid relOid)
1434 : {
1435 : xl_standby_lock xlrec;
1436 :
1437 136652 : xlrec.xid = GetCurrentTransactionId();
1438 :
1439 136652 : xlrec.dbOid = dbOid;
1440 136652 : xlrec.relOid = relOid;
1441 :
1442 136652 : LogAccessExclusiveLocks(1, &xlrec);
1443 136652 : MyXactFlags |= XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK;
1444 136652 : }
1445 :
/*
 * Prepare to log an AccessExclusiveLock, for use during LockAcquire().
 *
 * All this does is make sure the transaction has a TransactionId.
 */
void
LogAccessExclusiveLockPrepare(void)
{
    /*
     * There are two reasons to force xid assignment here, both related to
     * lock release on the standby.
     *
     * First, with an xid assigned, RecordTransactionCommit() and
     * RecordTransactionAbort() do not optimise away the transaction
     * completion record which recovery relies upon to release locks.  It's a
     * hack, but for a corner case not worth adding code for into the main
     * commit path.
     *
     * Second, the xid must exist before the lock is recorded in shared
     * memory, otherwise a concurrently executing
     * GetRunningTransactionLocks() might see a lock associated with an
     * InvalidTransactionId which we later assert cannot happen.
     */
    (void) GetCurrentTransactionId();
}
1466 :
1467 : /*
1468 : * Emit WAL for invalidations. This currently is only used for commits without
1469 : * an xid but which contain invalidations.
1470 : */
1471 : void
1472 10355 : LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs,
1473 : bool relcacheInitFileInval)
1474 : {
1475 : xl_invalidations xlrec;
1476 :
1477 : /* prepare record */
1478 10355 : memset(&xlrec, 0, sizeof(xlrec));
1479 10355 : xlrec.dbId = MyDatabaseId;
1480 10355 : xlrec.tsId = MyDatabaseTableSpace;
1481 10355 : xlrec.relcacheInitFileInval = relcacheInitFileInval;
1482 10355 : xlrec.nmsgs = nmsgs;
1483 :
1484 : /* perform insertion */
1485 10355 : XLogBeginInsert();
1486 10355 : XLogRegisterData(&xlrec, MinSizeOfInvalidations);
1487 10355 : XLogRegisterData(msgs,
1488 : nmsgs * sizeof(SharedInvalidationMessage));
1489 10355 : XLogInsert(RM_STANDBY_ID, XLOG_INVALIDATIONS);
1490 10355 : }
1491 :
1492 : /* Return the description of recovery conflict */
1493 : static const char *
1494 10 : get_recovery_conflict_desc(RecoveryConflictReason reason)
1495 : {
1496 10 : const char *reasonDesc = _("unknown reason");
1497 :
1498 10 : switch (reason)
1499 : {
1500 4 : case RECOVERY_CONFLICT_BUFFERPIN:
1501 4 : reasonDesc = _("recovery conflict on buffer pin");
1502 4 : break;
1503 2 : case RECOVERY_CONFLICT_LOCK:
1504 2 : reasonDesc = _("recovery conflict on lock");
1505 2 : break;
1506 2 : case RECOVERY_CONFLICT_TABLESPACE:
1507 2 : reasonDesc = _("recovery conflict on tablespace");
1508 2 : break;
1509 2 : case RECOVERY_CONFLICT_SNAPSHOT:
1510 2 : reasonDesc = _("recovery conflict on snapshot");
1511 2 : break;
1512 0 : case RECOVERY_CONFLICT_LOGICALSLOT:
1513 0 : reasonDesc = _("recovery conflict on replication slot");
1514 0 : break;
1515 0 : case RECOVERY_CONFLICT_STARTUP_DEADLOCK:
1516 0 : reasonDesc = _("recovery conflict on deadlock");
1517 0 : break;
1518 0 : case RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK:
1519 0 : reasonDesc = _("recovery conflict on buffer deadlock");
1520 0 : break;
1521 0 : case RECOVERY_CONFLICT_DATABASE:
1522 0 : reasonDesc = _("recovery conflict on database");
1523 0 : break;
1524 : }
1525 :
1526 10 : return reasonDesc;
1527 : }
|