LCOV - code coverage report
Current view: top level - src/backend/storage/ipc - standby.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 339 379 89.4 %
Date: 2025-04-24 12:15:10 Functions: 30 31 96.8 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * standby.c
       4             :  *    Misc functions used in Hot Standby mode.
       5             :  *
       6             :  *  All functions for handling RM_STANDBY_ID, which relate to
       7             :  *  AccessExclusiveLocks and starting snapshots for Hot Standby mode.
       8             :  *  Plus conflict recovery processing.
       9             :  *
      10             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
      11             :  * Portions Copyright (c) 1994, Regents of the University of California
      12             :  *
      13             :  * IDENTIFICATION
      14             :  *    src/backend/storage/ipc/standby.c
      15             :  *
      16             :  *-------------------------------------------------------------------------
      17             :  */
      18             : #include "postgres.h"
      19             : #include "access/transam.h"
      20             : #include "access/twophase.h"
      21             : #include "access/xact.h"
      22             : #include "access/xloginsert.h"
      23             : #include "access/xlogrecovery.h"
      24             : #include "access/xlogutils.h"
      25             : #include "miscadmin.h"
      26             : #include "pgstat.h"
      27             : #include "replication/slot.h"
      28             : #include "storage/bufmgr.h"
      29             : #include "storage/proc.h"
      30             : #include "storage/procarray.h"
      31             : #include "storage/sinvaladt.h"
      32             : #include "storage/standby.h"
      33             : #include "utils/hsearch.h"
      34             : #include "utils/injection_point.h"
      35             : #include "utils/ps_status.h"
      36             : #include "utils/timeout.h"
      37             : #include "utils/timestamp.h"
      38             : 
      39             : /* User-settable GUC parameters */
      40             : int         max_standby_archive_delay = 30 * 1000;
      41             : int         max_standby_streaming_delay = 30 * 1000;
      42             : bool        log_recovery_conflict_waits = false;
      43             : 
      44             : /*
      45             :  * Keep track of all the exclusive locks owned by original transactions.
      46             :  * For each known exclusive lock, there is a RecoveryLockEntry in the
      47             :  * RecoveryLockHash hash table.  All RecoveryLockEntrys belonging to a
      48             :  * given XID are chained together so that we can find them easily.
      49             :  * For each original transaction that is known to have any such locks,
      50             :  * there is a RecoveryLockXidEntry in the RecoveryLockXidHash hash table,
      51             :  * which stores the head of the chain of its locks.
      52             :  */
      53             : typedef struct RecoveryLockEntry
      54             : {
      55             :     xl_standby_lock key;        /* hash key: xid, dbOid, relOid */
      56             :     struct RecoveryLockEntry *next; /* chain link */
      57             : } RecoveryLockEntry;
      58             : 
      59             : typedef struct RecoveryLockXidEntry
      60             : {
      61             :     TransactionId xid;          /* hash key -- must be first */
      62             :     struct RecoveryLockEntry *head; /* chain head */
      63             : } RecoveryLockXidEntry;
      64             : 
      65             : static HTAB *RecoveryLockHash = NULL;
      66             : static HTAB *RecoveryLockXidHash = NULL;
      67             : 
      68             : /* Flags set by timeout handlers */
      69             : static volatile sig_atomic_t got_standby_deadlock_timeout = false;
      70             : static volatile sig_atomic_t got_standby_delay_timeout = false;
      71             : static volatile sig_atomic_t got_standby_lock_timeout = false;
      72             : 
      73             : static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
      74             :                                                    ProcSignalReason reason,
      75             :                                                    uint32 wait_event_info,
      76             :                                                    bool report_waiting);
      77             : static void SendRecoveryConflictWithBufferPin(ProcSignalReason reason);
      78             : static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts);
      79             : static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);
      80             : static const char *get_recovery_conflict_desc(ProcSignalReason reason);
      81             : 
      82             : /*
      83             :  * InitRecoveryTransactionEnvironment
      84             :  *      Initialize tracking of our primary's in-progress transactions.
      85             :  *
      86             :  * We need to issue shared invalidations and hold locks. Holding locks
      87             :  * means others may want to wait on us, so we need to make a lock table
      88             :  * vxact entry like a real transaction. We could create and delete
      89             :  * lock table entries for each transaction but it's simpler just to create
      90             :  * one permanent entry and leave it there all the time. Locks are then
      91             :  * acquired and released as needed. Yes, this means you can see the
      92             :  * Startup process in pg_locks once we have run this.
      93             :  */
      94             : void
      95         208 : InitRecoveryTransactionEnvironment(void)
      96             : {
      97             :     VirtualTransactionId vxid;
      98             :     HASHCTL     hash_ctl;
      99             : 
     100             :     Assert(RecoveryLockHash == NULL);   /* don't run this twice */
     101             : 
     102             :     /*
     103             :      * Initialize the hash tables for tracking the locks held by each
     104             :      * transaction.
     105             :      */
     106         208 :     hash_ctl.keysize = sizeof(xl_standby_lock);
     107         208 :     hash_ctl.entrysize = sizeof(RecoveryLockEntry);
     108         208 :     RecoveryLockHash = hash_create("RecoveryLockHash",
     109             :                                    64,
     110             :                                    &hash_ctl,
     111             :                                    HASH_ELEM | HASH_BLOBS);
     112         208 :     hash_ctl.keysize = sizeof(TransactionId);
     113         208 :     hash_ctl.entrysize = sizeof(RecoveryLockXidEntry);
     114         208 :     RecoveryLockXidHash = hash_create("RecoveryLockXidHash",
     115             :                                       64,
     116             :                                       &hash_ctl,
     117             :                                       HASH_ELEM | HASH_BLOBS);
     118             : 
     119             :     /*
     120             :      * Initialize shared invalidation management for Startup process, being
     121             :      * careful to register ourselves as a sendOnly process so we don't need to
     122             :      * read messages, nor will we get signaled when the queue starts filling
     123             :      * up.
     124             :      */
     125         208 :     SharedInvalBackendInit(true);
     126             : 
     127             :     /*
     128             :      * Lock a virtual transaction id for Startup process.
     129             :      *
     130             :      * We need to do GetNextLocalTransactionId() because
     131             :      * SharedInvalBackendInit() leaves localTransactionId invalid and the lock
     132             :      * manager doesn't like that at all.
     133             :      *
     134             :      * Note that we don't need to run XactLockTableInsert() because nobody
     135             :      * needs to wait on xids. That sounds a little strange, but table locks
     136             :      * are held by vxids and row level locks are held by xids. All queries
     137             :      * hold AccessShareLocks so never block while we write or lock new rows.
     138             :      */
     139         208 :     MyProc->vxid.procNumber = MyProcNumber;
     140         208 :     vxid.procNumber = MyProcNumber;
     141         208 :     vxid.localTransactionId = GetNextLocalTransactionId();
     142         208 :     VirtualXactLockTableInsert(vxid);
     143             : 
     144         208 :     standbyState = STANDBY_INITIALIZED;
     145         208 : }
     146             : 
     147             : /*
     148             :  * ShutdownRecoveryTransactionEnvironment
     149             :  *      Shut down transaction tracking
     150             :  *
     151             :  * Prepare to switch from hot standby mode to normal operation. Shut down
     152             :  * recovery-time transaction tracking.
     153             :  *
     154             :  * This must be called even in shutdown of startup process if transaction
     155             :  * tracking has been initialized. Otherwise some locks the tracked
     156             :  * transactions were holding will not be released and may interfere with
     157             :  * the processes still running (but will exit soon) at the exit of
     158             :  * startup process.
     159             :  */
     160             : void
     161         306 : ShutdownRecoveryTransactionEnvironment(void)
     162             : {
     163             :     /*
     164             :      * Do nothing if RecoveryLockHash is NULL because that means that
     165             :      * transaction tracking has not yet been initialized or has already been
     166             :      * shut down.  This makes it safe to have possibly-redundant calls of this
     167             :      * function during process exit.
     168             :      */
     169         306 :     if (RecoveryLockHash == NULL)
     170          98 :         return;
     171             : 
     172             :     /* Mark all tracked in-progress transactions as finished. */
     173         208 :     ExpireAllKnownAssignedTransactionIds();
     174             : 
     175             :     /* Release all locks the tracked transactions were holding */
     176         208 :     StandbyReleaseAllLocks();
     177             : 
     178             :     /* Destroy the lock hash tables. */
     179         208 :     hash_destroy(RecoveryLockHash);
     180         208 :     hash_destroy(RecoveryLockXidHash);
     181         208 :     RecoveryLockHash = NULL;
     182         208 :     RecoveryLockXidHash = NULL;
     183             : 
     184             :     /* Cleanup our VirtualTransaction */
     185         208 :     VirtualXactLockTableCleanup();
     186             : }
     187             : 
     188             : 
     189             : /*
     190             :  * -----------------------------------------------------
     191             :  *      Standby wait timers and backend cancel logic
     192             :  * -----------------------------------------------------
     193             :  */
     194             : 
     195             : /*
     196             :  * Determine the cutoff time at which we want to start canceling conflicting
     197             :  * transactions.  Returns zero (a time safely in the past) if we are willing
     198             :  * to wait forever.
     199             :  */
     200             : static TimestampTz
     201          58 : GetStandbyLimitTime(void)
     202             : {
     203             :     TimestampTz rtime;
     204             :     bool        fromStream;
     205             : 
     206             :     /*
     207             :      * The cutoff time is the last WAL data receipt time plus the appropriate
     208             :      * delay variable.  Delay of -1 means wait forever.
     209             :      */
     210          58 :     GetXLogReceiptTime(&rtime, &fromStream);
     211          58 :     if (fromStream)
     212             :     {
     213          58 :         if (max_standby_streaming_delay < 0)
     214           0 :             return 0;           /* wait forever */
     215          58 :         return TimestampTzPlusMilliseconds(rtime, max_standby_streaming_delay);
     216             :     }
     217             :     else
     218             :     {
     219           0 :         if (max_standby_archive_delay < 0)
     220           0 :             return 0;           /* wait forever */
     221           0 :         return TimestampTzPlusMilliseconds(rtime, max_standby_archive_delay);
     222             :     }
     223             : }
     224             : 
     225             : #define STANDBY_INITIAL_WAIT_US  1000
     226             : static int  standbyWait_us = STANDBY_INITIAL_WAIT_US;
     227             : 
     228             : /*
     229             :  * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs.
     230             :  * We wait here for a while then return. If we decide we can't wait any
     231             :  * more then we return true; if we can wait some more we return false.
     232             :  */
     233             : static bool
     234          30 : WaitExceedsMaxStandbyDelay(uint32 wait_event_info)
     235             : {
     236             :     TimestampTz ltime;
     237             : 
     238          30 :     CHECK_FOR_INTERRUPTS();
     239             : 
     240             :     /* Are we past the limit time? */
     241          30 :     ltime = GetStandbyLimitTime();
     242          30 :     if (ltime && GetCurrentTimestamp() >= ltime)
     243           6 :         return true;
     244             : 
     245             :     /*
     246             :      * Sleep a bit (this is essential to avoid busy-waiting).
     247             :      */
     248          24 :     pgstat_report_wait_start(wait_event_info);
     249          24 :     pg_usleep(standbyWait_us);
     250          24 :     pgstat_report_wait_end();
     251             : 
     252             :     /*
     253             :      * Progressively increase the sleep times, but not to more than 1s, since
     254             :      * pg_usleep isn't interruptible on some platforms.
     255             :      */
     256          24 :     standbyWait_us *= 2;
     257          24 :     if (standbyWait_us > 1000000)
     258           0 :         standbyWait_us = 1000000;
     259             : 
     260          24 :     return false;
     261             : }
     262             : 
     263             : /*
     264             :  * Log the recovery conflict.
     265             :  *
     266             :  * wait_start is the timestamp when the caller started to wait.
     267             :  * now is the timestamp when this function has been called.
     268             :  * wait_list is the list of virtual transaction ids assigned to
     269             :  * conflicting processes. still_waiting indicates whether
     270             :  * the startup process is still waiting for the recovery conflict
     271             :  * to be resolved or not.
     272             :  */
     273             : void
     274          20 : LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start,
     275             :                     TimestampTz now, VirtualTransactionId *wait_list,
     276             :                     bool still_waiting)
     277             : {
     278             :     long        secs;
     279             :     int         usecs;
     280             :     long        msecs;
     281             :     StringInfoData buf;
     282          20 :     int         nprocs = 0;
     283             : 
     284             :     /*
     285             :      * There must be no conflicting processes when the recovery conflict has
     286             :      * already been resolved.
     287             :      */
     288             :     Assert(still_waiting || wait_list == NULL);
     289             : 
     290          20 :     TimestampDifference(wait_start, now, &secs, &usecs);
     291          20 :     msecs = secs * 1000 + usecs / 1000;
     292          20 :     usecs = usecs % 1000;
     293             : 
     294          20 :     if (wait_list)
     295             :     {
     296             :         VirtualTransactionId *vxids;
     297             : 
     298             :         /* Construct a comma-separated string of the conflicting processes' PIDs */
     299           6 :         vxids = wait_list;
     300          12 :         while (VirtualTransactionIdIsValid(*vxids))
     301             :         {
     302           6 :             PGPROC     *proc = ProcNumberGetProc(vxids->procNumber);
     303             : 
     304             :             /* proc can be NULL if the target backend is not active */
     305           6 :             if (proc)
     306             :             {
     307           6 :                 if (nprocs == 0)
     308             :                 {
     309           6 :                     initStringInfo(&buf);
     310           6 :                     appendStringInfo(&buf, "%d", proc->pid);
     311             :                 }
     312             :                 else
     313           0 :                     appendStringInfo(&buf, ", %d", proc->pid);
     314             : 
     315           6 :                 nprocs++;
     316             :             }
     317             : 
     318           6 :             vxids++;
     319             :         }
     320             :     }
     321             : 
     322             :     /*
     323             :      * If wait_list is specified, report the list of PIDs of active
     324             :      * conflicting backends in a detail message. Note that if none of the
     325             :      * backends in the list is active, no detail message is logged.
     326             :      */
     327          20 :     if (still_waiting)
     328             :     {
     329          10 :         ereport(LOG,
     330             :                 errmsg("recovery still waiting after %ld.%03d ms: %s",
     331             :                        msecs, usecs, get_recovery_conflict_desc(reason)),
     332             :                 nprocs > 0 ? errdetail_log_plural("Conflicting process: %s.",
     333             :                                                   "Conflicting processes: %s.",
     334             :                                                   nprocs, buf.data) : 0);
     335             :     }
     336             :     else
     337             :     {
     338          10 :         ereport(LOG,
     339             :                 errmsg("recovery finished waiting after %ld.%03d ms: %s",
     340             :                        msecs, usecs, get_recovery_conflict_desc(reason)));
     341             :     }
     342             : 
     343          20 :     if (nprocs > 0)
     344           6 :         pfree(buf.data);
     345          20 : }
     346             : 
     347             : /*
     348             :  * This is the main executioner for any query backend that conflicts with
     349             :  * recovery processing. Judgement has already been passed on it within
     350             :  * a specific rmgr. Here we just issue the orders to the procs. The procs
     351             :  * then throw the required error as instructed.
     352             :  *
     353             :  * If report_waiting is true, "waiting" is reported in PS display and the
     354             :  * wait for recovery conflict is reported in the log, if necessary. If
     355             :  * the caller is responsible for reporting them, report_waiting should be
     356             :  * false. Otherwise, both the caller and this function report the same
     357             :  * thing unexpectedly.
     358             :  */
     359             : static void
     360       21568 : ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
     361             :                                        ProcSignalReason reason, uint32 wait_event_info,
     362             :                                        bool report_waiting)
     363             : {
     364       21568 :     TimestampTz waitStart = 0;
     365       21568 :     bool        waiting = false;
     366       21568 :     bool        logged_recovery_conflict = false;
     367             : 
     368             :     /* Fast exit, to avoid a kernel call if there's no work to be done. */
     369       21568 :     if (!VirtualTransactionIdIsValid(*waitlist))
     370       21562 :         return;
     371             : 
     372             :     /* Set the wait start timestamp for reporting */
     373           6 :     if (report_waiting && (log_recovery_conflict_waits || update_process_title))
     374           4 :         waitStart = GetCurrentTimestamp();
     375             : 
     376          12 :     while (VirtualTransactionIdIsValid(*waitlist))
     377             :     {
     378             :         /* reset standbyWait_us for each xact we wait for */
     379           6 :         standbyWait_us = STANDBY_INITIAL_WAIT_US;
     380             : 
     381             :         /* wait until the virtual xid is gone */
     382          36 :         while (!VirtualXactLock(*waitlist, false))
     383             :         {
     384             :             /* Is it time to kill it? */
     385          30 :             if (WaitExceedsMaxStandbyDelay(wait_event_info))
     386             :             {
     387             :                 pid_t       pid;
     388             : 
     389             :                 /*
     390             :                  * Now find out who to throw out of the balloon.
     391             :                  */
     392             :                 Assert(VirtualTransactionIdIsValid(*waitlist));
     393           6 :                 pid = CancelVirtualTransaction(*waitlist, reason);
     394             : 
     395             :                 /*
     396             :                  * Wait a little bit for it to die so that we avoid flooding
     397             :                  * an unresponsive backend when system is heavily loaded.
     398             :                  */
     399           6 :                 if (pid != 0)
     400           6 :                     pg_usleep(5000L);
     401             :             }
     402             : 
     403          30 :             if (waitStart != 0 && (!logged_recovery_conflict || !waiting))
     404             :             {
     405          28 :                 TimestampTz now = 0;
     406             :                 bool        maybe_log_conflict;
     407             :                 bool        maybe_update_title;
     408             : 
     409          28 :                 maybe_log_conflict = (log_recovery_conflict_waits && !logged_recovery_conflict);
     410          28 :                 maybe_update_title = (update_process_title && !waiting);
     411             : 
     412             :                 /* Get the current timestamp if we have not reported yet */
     413          28 :                 if (maybe_log_conflict || maybe_update_title)
     414          28 :                     now = GetCurrentTimestamp();
     415             : 
     416             :                 /*
     417             :                  * Report via ps if we have been waiting for more than 500
     418             :                  * msec (should that be configurable?)
     419             :                  */
     420          56 :                 if (maybe_update_title &&
     421          28 :                     TimestampDifferenceExceeds(waitStart, now, 500))
     422             :                 {
     423           0 :                     set_ps_display_suffix("waiting");
     424           0 :                     waiting = true;
     425             :                 }
     426             : 
     427             :                 /*
     428             :                  * Emit the log message if the startup process is waiting
     429             :                  * longer than deadlock_timeout for recovery conflict.
     430             :                  */
     431          44 :                 if (maybe_log_conflict &&
     432          16 :                     TimestampDifferenceExceeds(waitStart, now, DeadlockTimeout))
     433             :                 {
     434           4 :                     LogRecoveryConflict(reason, waitStart, now, waitlist, true);
     435           4 :                     logged_recovery_conflict = true;
     436             :                 }
     437             :             }
     438             :         }
     439             : 
     440             :         /* The virtual transaction is gone now, wait for the next one */
     441           6 :         waitlist++;
     442             :     }
     443             : 
     444             :     /*
     445             :      * Emit the log message if recovery conflict was resolved but the startup
     446             :      * process waited longer than deadlock_timeout for it.
     447             :      */
     448           6 :     if (logged_recovery_conflict)
     449           4 :         LogRecoveryConflict(reason, waitStart, GetCurrentTimestamp(),
     450             :                             NULL, false);
     451             : 
     452             :     /* reset ps display to remove the suffix if we added one */
     453           6 :     if (waiting)
     454           0 :         set_ps_display_remove_suffix();
     455             : 
     456             : }
     457             : 
     458             : /*
     459             :  * Generate whatever recovery conflicts are needed to eliminate snapshots that
     460             :  * might see XIDs <= snapshotConflictHorizon as still running.
     461             :  *
     462             :  * snapshotConflictHorizon cutoffs are our standard approach to generating
     463             :  * granular recovery conflicts.  Note that InvalidTransactionId values are
     464             :  * interpreted as "definitely don't need any conflicts" here, which is a
     465             :  * general convention that WAL records can (and often do) depend on.
     466             :  */
     467             : void
     468       28568 : ResolveRecoveryConflictWithSnapshot(TransactionId snapshotConflictHorizon,
     469             :                                     bool isCatalogRel,
     470             :                                     RelFileLocator locator)
     471             : {
     472             :     VirtualTransactionId *backends;
     473             : 
     474             :     /*
     475             :      * If we get passed InvalidTransactionId then we do nothing (no conflict).
     476             :      *
     477             :      * This can happen when replaying already-applied WAL records after a
     478             :      * standby crash or restart, or when replaying an XLOG_HEAP2_VISIBLE
     479             :      * record that marks as frozen a page which was already all-visible.  It's
     480             :      * also quite common with records generated during index deletion
     481             :      * (original execution of the deletion can reason that a recovery conflict
     482             :      * which is sufficient for the deletion operation must take place before
     483             :      * replay of the deletion record itself).
     484             :      */
     485       28568 :     if (!TransactionIdIsValid(snapshotConflictHorizon))
     486        7004 :         return;
     487             : 
     488             :     Assert(TransactionIdIsNormal(snapshotConflictHorizon));
     489       21564 :     backends = GetConflictingVirtualXIDs(snapshotConflictHorizon,
     490             :                                          locator.dbOid);
     491       21564 :     ResolveRecoveryConflictWithVirtualXIDs(backends,
     492             :                                            PROCSIG_RECOVERY_CONFLICT_SNAPSHOT,
     493             :                                            WAIT_EVENT_RECOVERY_CONFLICT_SNAPSHOT,
     494             :                                            true);
     495             : 
     496             :     /*
     497             :      * Note that WaitExceedsMaxStandbyDelay() is not taken into account here
     498             :      * (as opposed to ResolveRecoveryConflictWithVirtualXIDs() above). That
     499             :      * seems OK, given that this kind of conflict should not normally be
     500             :      * reached, e.g. due to using a physical replication slot.
     501             :      */
     502       21564 :     if (wal_level >= WAL_LEVEL_LOGICAL && isCatalogRel)
     503          36 :         InvalidateObsoleteReplicationSlots(RS_INVAL_HORIZON, 0, locator.dbOid,
     504             :                                            snapshotConflictHorizon);
     505             : }
     506             : 
     507             : /*
     508             :  * Variant of ResolveRecoveryConflictWithSnapshot that works with
     509             :  * FullTransactionId values
     510             :  */
     511             : void
     512           0 : ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId snapshotConflictHorizon,
     513             :                                            bool isCatalogRel,
     514             :                                            RelFileLocator locator)
     515             : {
     516             :     /*
     517             :      * ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds,
     518             :      * so truncate the logged FullTransactionId.  If the logged value is very
     519             :      * old, so that XID wrap-around already happened on it, there can't be any
     520             :      * snapshots that still see it.
     521             :      */
     522           0 :     FullTransactionId nextXid = ReadNextFullTransactionId();
     523             :     uint64      diff;
     524             : 
     525           0 :     diff = U64FromFullTransactionId(nextXid) -
     526           0 :         U64FromFullTransactionId(snapshotConflictHorizon);
     527           0 :     if (diff < MaxTransactionId / 2)
     528             :     {
     529             :         TransactionId truncated;
     530             : 
     531           0 :         truncated = XidFromFullTransactionId(snapshotConflictHorizon);
     532           0 :         ResolveRecoveryConflictWithSnapshot(truncated,
     533             :                                             isCatalogRel,
     534             :                                             locator);
     535             :     }
     536           0 : }
     537             : 
     538             : void
     539           2 : ResolveRecoveryConflictWithTablespace(Oid tsid)
     540             : {
     541             :     VirtualTransactionId *temp_file_users;
     542             : 
     543             :     /*
     544             :      * Standby users may be currently using this tablespace for their
     545             :      * temporary files. We only care about current users because
     546             :      * temp_tablespaces parameter will just ignore tablespaces that no longer
     547             :      * exist.
     548             :      *
     549             :      * Ask everybody to cancel their queries immediately so we can ensure no
     550             :      * temp files remain and we can remove the tablespace. Nuke the entire
     551             :      * site from orbit, it's the only way to be sure.
     552             :      *
     553             :      * XXX: We could work out the pids of active backends using this
     554             :      * tablespace by examining the temp filenames in the directory. We would
     555             :      * then convert the pids into VirtualXIDs before attempting to cancel
     556             :      * them.
     557             :      *
     558             :      * We don't wait for commit because drop tablespace is non-transactional.
     559             :      */
     560           2 :     temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId,
     561             :                                                 InvalidOid);
     562           2 :     ResolveRecoveryConflictWithVirtualXIDs(temp_file_users,
     563             :                                            PROCSIG_RECOVERY_CONFLICT_TABLESPACE,
     564             :                                            WAIT_EVENT_RECOVERY_CONFLICT_TABLESPACE,
     565             :                                            true);
     566           2 : }
     567             : 
     568             : void
     569          26 : ResolveRecoveryConflictWithDatabase(Oid dbid)
     570             : {
     571             :     /*
     572             :      * We don't do ResolveRecoveryConflictWithVirtualXIDs() here since that
     573             :      * only waits for transactions and completely idle sessions would block
     574             :      * us. This is rare enough that we do this as simply as possible: no wait,
     575             :      * just force them off immediately.
     576             :      *
     577             :      * No locking is required here because we already acquired
     578             :      * AccessExclusiveLock. Anybody trying to connect while we do this will
     579             :      * block during InitPostgres() and then disconnect when they see the
     580             :      * database has been removed.
     581             :      */
     582          30 :     while (CountDBBackends(dbid) > 0)
     583             :     {
     584           4 :         CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE, true);
     585             : 
     586             :         /*
     587             :          * Wait awhile for them to die so that we avoid flooding an
     588             :          * unresponsive backend when system is heavily loaded.
     589             :          */
     590           4 :         pg_usleep(10000);
     591             :     }
     592          26 : }
     593             : 
     594             : /*
     595             :  * ResolveRecoveryConflictWithLock is called from ProcSleep()
     596             :  * to resolve conflicts with other backends holding relation locks.
     597             :  *
     598             :  * The WaitLatch sleep normally done in ProcSleep()
     599             :  * (when not InHotStandby) is performed here, for code clarity.
     600             :  *
     601             :  * We either resolve conflicts immediately or set a timeout to wake us at
     602             :  * the limit of our patience.
     603             :  *
     604             :  * Resolve conflicts by canceling all backends holding a conflicting
     605             :  * lock.  As we are already queued to be granted the lock, no new lock
     606             :  * requests conflicting with ours will be granted in the meantime.
     607             :  *
     608             :  * We also must check for deadlocks involving the Startup process and
     609             :  * hot-standby backend processes. If deadlock_timeout is reached in
     610             :  * this function, all the backends holding the conflicting locks are
     611             :  * requested to check themselves for deadlocks.
     612             :  *
     613             :  * logging_conflict should be true if the recovery conflict has not been
     614             :  * logged yet even though logging is enabled. After deadlock_timeout is
     615             :  * reached and the request for deadlock check is sent, we wait again to
     616             :  * be signaled by the release of the lock if logging_conflict is false.
     617             :  * Otherwise we return without waiting again so that the caller can report
     618             :  * the recovery conflict. In this case, then, this function is called again
     619             :  * with logging_conflict=false (because the recovery conflict has already
     620             :  * been logged) and we will wait again for the lock to be released.
     621             :  */
     622             : void
     623           8 : ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict)
     624             : {
     625             :     TimestampTz ltime;
     626             :     TimestampTz now;
     627             : 
     628             :     Assert(InHotStandby);
     629             : 
     630           8 :     ltime = GetStandbyLimitTime();
     631           8 :     now = GetCurrentTimestamp();
     632             : 
     633             :     /*
     634             :      * Update waitStart if first time through after the startup process
     635             :      * started waiting for the lock. It should not be updated every time
     636             :      * ResolveRecoveryConflictWithLock() is called during the wait.
     637             :      *
     638             :      * Use the current time obtained for comparison with ltime as waitStart
     639             :      * (i.e., the time when this process started waiting for the lock). Since
     640             :      * getting the current time newly can cause overhead, we reuse the
     641             :      * already-obtained time to avoid that overhead.
     642             :      *
     643             :      * Note that waitStart is updated without holding the lock table's
     644             :      * partition lock, to avoid the overhead by additional lock acquisition.
     645             :      * This can cause "waitstart" in pg_locks to become NULL for a very short
     646             :      * period of time after the wait started even though "granted" is false.
     647             :      * This is OK in practice because we can assume that users are likely to
     648             :      * look at "waitstart" when waiting for the lock for a long time.
     649             :      */
     650           8 :     if (pg_atomic_read_u64(&MyProc->waitStart) == 0)
     651           2 :         pg_atomic_write_u64(&MyProc->waitStart, now);
     652             : 
     653           8 :     if (now >= ltime && ltime != 0)
     654           2 :     {
     655             :         /*
     656             :          * We're already behind, so clear a path as quickly as possible.
     657             :          */
     658             :         VirtualTransactionId *backends;
     659             : 
     660           2 :         backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
     661             : 
     662             :         /*
     663             :          * Prevent ResolveRecoveryConflictWithVirtualXIDs() from reporting
     664             :          * "waiting" in PS display by disabling its argument report_waiting
     665             :          * because the caller, WaitOnLock(), has already reported that.
     666             :          */
     667           2 :         ResolveRecoveryConflictWithVirtualXIDs(backends,
     668             :                                                PROCSIG_RECOVERY_CONFLICT_LOCK,
     669           2 :                                                PG_WAIT_LOCK | locktag.locktag_type,
     670             :                                                false);
     671             :     }
     672             :     else
     673             :     {
     674             :         /*
     675             :          * Wait (or wait again) until ltime, and check for deadlocks as well
     676             :          * if we will be waiting longer than deadlock_timeout
     677             :          */
     678             :         EnableTimeoutParams timeouts[2];
     679           6 :         int         cnt = 0;
     680             : 
     681           6 :         if (ltime != 0)
     682             :         {
     683           6 :             got_standby_lock_timeout = false;
     684           6 :             timeouts[cnt].id = STANDBY_LOCK_TIMEOUT;
     685           6 :             timeouts[cnt].type = TMPARAM_AT;
     686           6 :             timeouts[cnt].fin_time = ltime;
     687           6 :             cnt++;
     688             :         }
     689             : 
     690           6 :         got_standby_deadlock_timeout = false;
     691           6 :         timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
     692           6 :         timeouts[cnt].type = TMPARAM_AFTER;
     693           6 :         timeouts[cnt].delay_ms = DeadlockTimeout;
     694           6 :         cnt++;
     695             : 
     696           6 :         enable_timeouts(timeouts, cnt);
     697             :     }
     698             : 
     699             :     /* Wait to be signaled by the release of the Relation Lock */
     700           8 :     ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
     701             : 
     702             :     /*
     703             :      * Exit if ltime is reached. Then all the backends holding conflicting
     704             :      * locks will be canceled in the next ResolveRecoveryConflictWithLock()
     705             :      * call.
     706             :      */
     707           8 :     if (got_standby_lock_timeout)
     708           0 :         goto cleanup;
     709             : 
     710           8 :     if (got_standby_deadlock_timeout)
     711             :     {
     712             :         VirtualTransactionId *backends;
     713             : 
     714           4 :         backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
     715             : 
     716             :         /* Quick exit if there's no work to be done */
     717           4 :         if (!VirtualTransactionIdIsValid(*backends))
     718           0 :             goto cleanup;
     719             : 
     720             :         /*
     721             :          * Send signals to all the backends holding the conflicting locks, to
     722             :          * ask them to check themselves for deadlocks.
     723             :          */
     724           8 :         while (VirtualTransactionIdIsValid(*backends))
     725             :         {
     726           4 :             SignalVirtualTransaction(*backends,
     727             :                                      PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK,
     728             :                                      false);
     729           4 :             backends++;
     730             :         }
     731             : 
     732             :         /*
     733             :          * Exit if the recovery conflict has not been logged yet even though
     734             :          * logging is enabled, so that the caller can log that. Then
     735             :          * ResolveRecoveryConflictWithLock() is called again and we will wait
     736             :          * again for the lock to be released.
     737             :          */
     738           4 :         if (logging_conflict)
     739           2 :             goto cleanup;
     740             : 
     741             :         /*
     742             :          * Wait again here to be signaled by the release of the Relation Lock,
     743             :          * to prevent the subsequent ResolveRecoveryConflictWithLock() from
     744             :          * reaching deadlock_timeout and sending a deadlock-check request again.
     745             :          * Otherwise the request continues to be sent every deadlock_timeout
     746             :          * until the relation locks are released or ltime is reached.
     747             :          */
     748           2 :         got_standby_deadlock_timeout = false;
     749           2 :         ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
     750             :     }
     751             : 
     752           4 : cleanup:
     753             : 
     754             :     /*
     755             :      * Clear any timeout requests established above.  We assume here that the
     756             :      * Startup process doesn't have any other outstanding timeouts than those
     757             :      * used by this function. If that stops being true, we could cancel the
     758             :      * timeouts individually, but that'd be slower.
     759             :      */
     760           8 :     disable_all_timeouts(false);
     761           8 :     got_standby_lock_timeout = false;
     762           8 :     got_standby_deadlock_timeout = false;
     763           8 : }
     764             : 
     765             : /*
     766             :  * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup()
     767             :  * to resolve conflicts with other backends holding buffer pins.
     768             :  *
     769             :  * The ProcWaitForSignal() sleep normally done in LockBufferForCleanup()
     770             :  * (when not InHotStandby) is performed here, for code clarity.
     771             :  *
     772             :  * We either resolve conflicts immediately or set a timeout to wake us at
     773             :  * the limit of our patience.
     774             :  *
     775             :  * Resolve conflicts by sending a PROCSIG signal to all backends to check if
     776             :  * they hold one of the buffer pins that is blocking Startup process. If so,
     777             :  * those backends will take an appropriate error action, ERROR or FATAL.
     778             :  *
     779             :  * We also must check for deadlocks.  Deadlocks occur because if queries
     780             :  * wait on a lock, that must be behind an AccessExclusiveLock, which can only
     781             :  * be cleared if the Startup process replays a transaction completion record.
     782             :  * If Startup process is also waiting then that is a deadlock. The deadlock
     783             :  * can occur if the query is waiting and then the Startup sleeps, or if
     784             :  * Startup is sleeping and the query waits on a lock. We protect against
     785             :  * only the former sequence here, the latter sequence is checked prior to
     786             :  * the query sleeping, in CheckRecoveryConflictDeadlock().
     787             :  *
     788             :  * Deadlocks are extremely rare, and relatively expensive to check for,
     789             :  * so we don't do a deadlock check right away ... only if we have had to wait
     790             :  * at least deadlock_timeout.
     791             :  */
     792             : void
     793          20 : ResolveRecoveryConflictWithBufferPin(void)
     794             : {
     795             :     TimestampTz ltime;
     796             : 
     797             :     Assert(InHotStandby);
     798             : 
     799          20 :     ltime = GetStandbyLimitTime();
     800             : 
     801          20 :     if (GetCurrentTimestamp() >= ltime && ltime != 0)
     802             :     {
     803             :         /*
     804             :          * We're already behind, so clear a path as quickly as possible.
     805             :          */
     806           2 :         SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
     807             :     }
     808             :     else
     809             :     {
     810             :         /*
     811             :          * Wake up at ltime, and check for deadlocks as well if we will be
     812             :          * waiting longer than deadlock_timeout
     813             :          */
     814             :         EnableTimeoutParams timeouts[2];
     815          18 :         int         cnt = 0;
     816             : 
     817          18 :         if (ltime != 0)
     818             :         {
     819          18 :             timeouts[cnt].id = STANDBY_TIMEOUT;
     820          18 :             timeouts[cnt].type = TMPARAM_AT;
     821          18 :             timeouts[cnt].fin_time = ltime;
     822          18 :             cnt++;
     823             :         }
     824             : 
     825          18 :         got_standby_deadlock_timeout = false;
     826          18 :         timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
     827          18 :         timeouts[cnt].type = TMPARAM_AFTER;
     828          18 :         timeouts[cnt].delay_ms = DeadlockTimeout;
     829          18 :         cnt++;
     830             : 
     831          18 :         enable_timeouts(timeouts, cnt);
     832             :     }
     833             : 
     834             :     /*
     835             :      * Wait to be signaled by UnpinBuffer() or for the wait to be interrupted
     836             :      * by one of the timeouts established above.
     837             :      *
     838             :      * We assume that only UnpinBuffer() and the timeout requests established
     839             :      * above can wake us up here. WakeupRecovery() called by walreceiver or
     840             :      * SIGHUP signal handler, etc. cannot do that because it uses a different
     841             :      * latch from the one that ProcWaitForSignal() waits on.
     842             :      */
     843          20 :     ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
     844             : 
     845          20 :     if (got_standby_delay_timeout)
     846           2 :         SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
     847          18 :     else if (got_standby_deadlock_timeout)
     848             :     {
     849             :         /*
     850             :          * Send out a request for hot-standby backends to check themselves for
     851             :          * deadlocks.
     852             :          *
     853             :          * XXX The subsequent ResolveRecoveryConflictWithBufferPin() will wait
     854             :          * to be signaled by UnpinBuffer() again and send a request for
     855             :          * deadlocks check if deadlock_timeout happens. This causes the
     856             :          * request to continue to be sent every deadlock_timeout until the
     857             :          * buffer is unpinned or ltime is reached. This would increase the
     858             :          * workload in the startup process and backends. In practice it may
     859             :          * not be so harmful because the period that the buffer is kept pinned
     860             :          * is basically not so long. But should we fix this?
     861             :          */
     862          12 :         SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
     863             :     }
     864             : 
     865             :     /*
     866             :      * Clear any timeout requests established above.  We assume here that the
     867             :      * Startup process doesn't have any other timeouts than what this function
     868             :      * uses.  If that stops being true, we could cancel the timeouts
     869             :      * individually, but that'd be slower.
     870             :      */
     871          20 :     disable_all_timeouts(false);
     872          20 :     got_standby_delay_timeout = false;
     873          20 :     got_standby_deadlock_timeout = false;
     874          20 : }
     875             : 
     876             : static void
     877          16 : SendRecoveryConflictWithBufferPin(ProcSignalReason reason)
     878             : {
     879             :     Assert(reason == PROCSIG_RECOVERY_CONFLICT_BUFFERPIN ||
     880             :            reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
     881             : 
     882             :     /*
     883             :      * We send signal to all backends to ask them if they are holding the
     884             :      * buffer pin which is delaying the Startup process. We must not set the
     885             :      * conflict flag yet, since most backends will be innocent. Let the
     886             :      * SIGUSR1 handling in each backend decide their own fate.
     887             :      */
     888          16 :     CancelDBBackends(InvalidOid, reason, false);
     889          16 : }
     890             : 
     891             : /*
     892             :  * In Hot Standby perform early deadlock detection.  We abort the lock
     893             :  * wait if we are about to sleep while holding the buffer pin that Startup
     894             :  * process is waiting for.
     895             :  *
     896             :  * Note: this code is pessimistic, because there is no way for it to
     897             :  * determine whether an actual deadlock condition is present: the lock we
     898             :  * need to wait for might be unrelated to any held by the Startup process.
     899             :  * Sooner or later, this mechanism should get ripped out in favor of somehow
     900             :  * accounting for buffer locks in DeadLockCheck().  However, errors here
     901             :  * seem to be very low-probability in practice, so for now it's not worth
     902             :  * the trouble.
     903             :  */
     904             : void
     905           2 : CheckRecoveryConflictDeadlock(void)
     906             : {
     907             :     Assert(!InRecovery);        /* do not call in Startup process */
     908             : 
     909           2 :     if (!HoldingBufferPinThatDelaysRecovery())
     910           2 :         return;
     911             : 
     912             :     /*
     913             :      * Error message should match ProcessInterrupts() but we avoid calling
     914             :      * that because we aren't handling an interrupt at this point. Note that
     915             :      * we only cancel the current transaction here, so if we are in a
     916             :      * subtransaction and the pin is held by a parent, then the Startup
     917             :      * process will continue to wait even though we have avoided deadlock.
     918             :      */
     919           0 :     ereport(ERROR,
     920             :             (errcode(ERRCODE_T_R_DEADLOCK_DETECTED),
     921             :              errmsg("canceling statement due to conflict with recovery"),
     922             :              errdetail("User transaction caused buffer deadlock with recovery.")));
     923             : }
     924             : 
     925             : 
     926             : /* --------------------------------
     927             :  *      timeout handler routines
     928             :  * --------------------------------
     929             :  */
     930             : 
     931             : /*
     932             :  * StandbyDeadLockHandler() will be called if STANDBY_DEADLOCK_TIMEOUT is
     933             :  * exceeded.
     934             :  */
     935             : void
     936          16 : StandbyDeadLockHandler(void)
     937             : {
     938          16 :     got_standby_deadlock_timeout = true;
     939          16 : }
     940             : 
     941             : /*
     942             :  * StandbyTimeoutHandler() will be called if STANDBY_TIMEOUT is exceeded.
     943             :  */
     944             : void
     945           2 : StandbyTimeoutHandler(void)
     946             : {
     947           2 :     got_standby_delay_timeout = true;
     948           2 : }
     949             : 
     950             : /*
     951             :  * StandbyLockTimeoutHandler() will be called if STANDBY_LOCK_TIMEOUT is exceeded.
     952             :  */
     953             : void
     954           2 : StandbyLockTimeoutHandler(void)
     955             : {
     956           2 :     got_standby_lock_timeout = true;
     957           2 : }
     958             : 
     959             : /*
     960             :  * -----------------------------------------------------
     961             :  * Locking in Recovery Mode
     962             :  * -----------------------------------------------------
     963             :  *
     964             :  * All locks are held by the Startup process using a single virtual
     965             :  * transaction. This implementation is both simpler and in some senses,
     966             :  * more correct. The locks held mean "some original transaction held
     967             :  * this lock, so query access is not allowed at this time". So the Startup
     968             :  * process is the proxy by which the original locks are implemented.
     969             :  *
     970             :  * We only keep track of AccessExclusiveLocks, which are only ever held by
     971             :  * one transaction on one relation.
     972             :  *
     973             :  * We keep a table of known locks in the RecoveryLockHash hash table.
     974             :  * The point of that table is to let us efficiently de-duplicate locks,
     975             :  * which is important because checkpoints will re-report the same locks
     976             :  * already held.  There is also a RecoveryLockXidHash table with one entry
     977             :  * per xid, which allows us to efficiently find all the locks held by a
     978             :  * given original transaction.
     979             :  *
     980             :  * We use session locks rather than normal locks so we don't need
     981             :  * ResourceOwners.
     982             :  */
     983             : 
     984             : 
     985             : void
     986       50448 : StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
     987             : {
     988             :     RecoveryLockXidEntry *xidentry;
     989             :     RecoveryLockEntry *lockentry;
     990             :     xl_standby_lock key;
     991             :     LOCKTAG     locktag;
     992             :     bool        found;
     993             : 
     994             :     /* Already processed? */
     995      100896 :     if (!TransactionIdIsValid(xid) ||
     996      100890 :         TransactionIdDidCommit(xid) ||
     997       50442 :         TransactionIdDidAbort(xid))
     998           6 :         return;
     999             : 
    1000       50442 :     elog(DEBUG4, "adding recovery lock: db %u rel %u", dbOid, relOid);
    1001             : 
    1002             :     /* dbOid is InvalidOid when we are locking a shared relation. */
    1003             :     Assert(OidIsValid(relOid));
    1004             : 
    1005             :     /* Create a hash entry for this xid, if we don't have one already. */
    1006       50442 :     xidentry = hash_search(RecoveryLockXidHash, &xid, HASH_ENTER, &found);
    1007       50442 :     if (!found)
    1008             :     {
    1009             :         Assert(xidentry->xid == xid);    /* dynahash should have set this */
    1010       20622 :         xidentry->head = NULL;
    1011             :     }
    1012             : 
    1013             :     /* Create a hash entry for this lock, unless we have one already. */
    1014       50442 :     key.xid = xid;
    1015       50442 :     key.dbOid = dbOid;
    1016       50442 :     key.relOid = relOid;
    1017       50442 :     lockentry = hash_search(RecoveryLockHash, &key, HASH_ENTER, &found);
    1018       50442 :     if (!found)
    1019             :     {
    1020             :         /* It's new, so link it into the XID's list ... */
    1021       48106 :         lockentry->next = xidentry->head;
    1022       48106 :         xidentry->head = lockentry;
    1023             : 
    1024             :         /* ... and acquire the lock locally. */
    1025       48106 :         SET_LOCKTAG_RELATION(locktag, dbOid, relOid);
    1026             : 
    1027       48106 :         (void) LockAcquire(&locktag, AccessExclusiveLock, true, false);
    1028             :     }
    1029             : }
    1030             : 
    1031             : /*
    1032             :  * Release all the locks associated with this RecoveryLockXidEntry.
    1033             :  */
    1034             : static void
    1035       20622 : StandbyReleaseXidEntryLocks(RecoveryLockXidEntry *xidentry)
    1036             : {
    1037             :     RecoveryLockEntry *entry;
    1038             :     RecoveryLockEntry *next;
    1039             : 
    1040       68728 :     for (entry = xidentry->head; entry != NULL; entry = next)
    1041             :     {
    1042             :         LOCKTAG     locktag;
    1043             : 
    1044       48106 :         elog(DEBUG4,
    1045             :              "releasing recovery lock: xid %u db %u rel %u",
    1046             :              entry->key.xid, entry->key.dbOid, entry->key.relOid);
    1047             :         /* Release the lock ... */
    1048       48106 :         SET_LOCKTAG_RELATION(locktag, entry->key.dbOid, entry->key.relOid);
    1049       48106 :         if (!LockRelease(&locktag, AccessExclusiveLock, true))
    1050             :         {
    1051           0 :             elog(LOG,
    1052             :                  "RecoveryLockHash contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
    1053             :                  entry->key.xid, entry->key.dbOid, entry->key.relOid);
    1054             :             Assert(false);
    1055             :         }
    1056             :         /* ... and remove the per-lock hash entry */
    1057       48106 :         next = entry->next;
    1058       48106 :         hash_search(RecoveryLockHash, entry, HASH_REMOVE, NULL);
    1059             :     }
    1060             : 
    1061       20622 :     xidentry->head = NULL;       /* just for paranoia */
    1062       20622 : }
    1063             : 
    1064             : /*
    1065             :  * Release locks for a specific XID, or all locks if it's InvalidTransactionId.
    1066             :  */
    1067             : static void
    1068       21978 : StandbyReleaseLocks(TransactionId xid)
    1069             : {
    1070             :     RecoveryLockXidEntry *entry;
    1071             : 
    1072       21978 :     if (TransactionIdIsValid(xid))
    1073             :     {
    1074       21978 :         if ((entry = hash_search(RecoveryLockXidHash, &xid, HASH_FIND, NULL)))
    1075             :         {
    1076       20622 :             StandbyReleaseXidEntryLocks(entry);
    1077       20622 :             hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
    1078             :         }
    1079             :     }
    1080             :     else
    1081           0 :         StandbyReleaseAllLocks();
    1082       21978 : }
    1083             : 
    1084             : /*
    1085             :  * Release locks for a transaction tree, starting at xid down, from
    1086             :  * RecoveryLockXidHash.
    1087             :  *
    1088             :  * Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode,
    1089             :  * to remove any AccessExclusiveLocks requested by a transaction.
    1090             :  */
    1091             : void
    1092       20976 : StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids)
    1093             : {
    1094             :     int         i;
    1095             : 
    1096       20976 :     StandbyReleaseLocks(xid);
    1097             : 
    1098       21978 :     for (i = 0; i < nsubxids; i++)
    1099        1002 :         StandbyReleaseLocks(subxids[i]);
    1100       20976 : }
    1101             : 
    1102             : /*
    1103             :  * Called at end of recovery and when we see a shutdown checkpoint.
    1104             :  */
    1105             : void
    1106         208 : StandbyReleaseAllLocks(void)
    1107             : {
    1108             :     HASH_SEQ_STATUS status;
    1109             :     RecoveryLockXidEntry *entry;
    1110             : 
    1111         208 :     elog(DEBUG2, "release all standby locks");
    1112             : 
    1113         208 :     hash_seq_init(&status, RecoveryLockXidHash);
    1114         208 :     while ((entry = hash_seq_search(&status)))
    1115             :     {
    1116           0 :         StandbyReleaseXidEntryLocks(entry);
    1117           0 :         hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
    1118             :     }
    1119         208 : }
    1120             : 
    1121             : /*
    1122             :  * StandbyReleaseOldLocks
    1123             :  *      Release standby locks held by top-level XIDs that aren't running,
    1124             :  *      as long as they're not prepared transactions.
    1125             :  *
    1126             :  * This is needed to prune the locks of crashed transactions, which didn't
    1127             :  * write an ABORT/COMMIT record.
    1128             :  */
    1129             : void
    1130        1540 : StandbyReleaseOldLocks(TransactionId oldxid)
    1131             : {
    1132             :     HASH_SEQ_STATUS status;
    1133             :     RecoveryLockXidEntry *entry;
    1134             : 
    1135        1540 :     hash_seq_init(&status, RecoveryLockXidHash);
    1136        2122 :     while ((entry = hash_seq_search(&status)))
    1137             :     {
    1138             :         Assert(TransactionIdIsValid(entry->xid));
    1139             : 
    1140             :         /* Skip if prepared transaction. */
    1141         582 :         if (StandbyTransactionIdIsPrepared(entry->xid))
    1142           0 :             continue;
    1143             : 
    1144             :         /* Skip if >= oldxid. */
    1145         582 :         if (!TransactionIdPrecedes(entry->xid, oldxid))
    1146         582 :             continue;
    1147             : 
    1148             :         /* Remove all locks and hash table entry. */
    1149           0 :         StandbyReleaseXidEntryLocks(entry);
    1150           0 :         hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
    1151             :     }
    1152        1540 : }
    1153             : 
    1154             : /*
    1155             :  * --------------------------------------------------------------------
    1156             :  *      Recovery handling for Rmgr RM_STANDBY_ID
    1157             :  *
    1158             :  * These record types will only be created if XLogStandbyInfoActive()
    1159             :  * --------------------------------------------------------------------
    1160             :  */
    1161             : 
    1162             : void
    1163       51302 : standby_redo(XLogReaderState *record)
    1164             : {
    1165       51302 :     uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    1166             : 
    1167             :     /* Backup blocks are not used in standby records */
    1168             :     Assert(!XLogRecHasAnyBlockRefs(record));
    1169             : 
    1170             :     /* Do nothing if we're not in hot standby mode */
    1171       51302 :     if (standbyState == STANDBY_DISABLED)
    1172         302 :         return;
    1173             : 
    1174       51000 :     if (info == XLOG_STANDBY_LOCK)
    1175             :     {
    1176       48420 :         xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record);
    1177             :         int         i;
    1178             : 
    1179       98868 :         for (i = 0; i < xlrec->nlocks; i++)
    1180       50448 :             StandbyAcquireAccessExclusiveLock(xlrec->locks[i].xid,
    1181             :                                               xlrec->locks[i].dbOid,
    1182             :                                               xlrec->locks[i].relOid);
    1183             :     }
    1184        2580 :     else if (info == XLOG_RUNNING_XACTS)
    1185             :     {
    1186        1426 :         xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record);
    1187             :         RunningTransactionsData running;
    1188             : 
    1189        1426 :         running.xcnt = xlrec->xcnt;
    1190        1426 :         running.subxcnt = xlrec->subxcnt;
    1191        1426 :         running.subxid_status = xlrec->subxid_overflow ? SUBXIDS_MISSING : SUBXIDS_IN_ARRAY;
    1192        1426 :         running.nextXid = xlrec->nextXid;
    1193        1426 :         running.latestCompletedXid = xlrec->latestCompletedXid;
    1194        1426 :         running.oldestRunningXid = xlrec->oldestRunningXid;
    1195        1426 :         running.xids = xlrec->xids;
    1196             : 
    1197        1426 :         ProcArrayApplyRecoveryInfo(&running);
    1198             : 
    1199             :         /*
    1200             :          * The startup process currently has no convenient way to schedule
    1201             :          * stats to be reported. XLOG_RUNNING_XACTS records issued at a
    1202             :          * regular cadence, making this a convenient location to report stats.
    1203             :          * While these records aren't generated with wal_level=minimal, stats
    1204             :          * also cannot be accessed during WAL replay.
    1205             :          */
    1206        1426 :         pgstat_report_stat(true);
    1207             :     }
    1208        1154 :     else if (info == XLOG_INVALIDATIONS)
    1209             :     {
    1210        1154 :         xl_invalidations *xlrec = (xl_invalidations *) XLogRecGetData(record);
    1211             : 
    1212        1154 :         ProcessCommittedInvalidationMessages(xlrec->msgs,
    1213             :                                              xlrec->nmsgs,
    1214        1154 :                                              xlrec->relcacheInitFileInval,
    1215             :                                              xlrec->dbId,
    1216             :                                              xlrec->tsId);
    1217             :     }
    1218             :     else
    1219           0 :         elog(PANIC, "standby_redo: unknown op code %u", info);
    1220             : }
    1221             : 
    1222             : /*
    1223             :  * Log details of the current snapshot to WAL. This allows the snapshot state
    1224             :  * to be reconstructed on the standby and for logical decoding.
    1225             :  *
    1226             :  * This is used for Hot Standby as follows:
    1227             :  *
    1228             :  * We can move directly to STANDBY_SNAPSHOT_READY at startup if we
    1229             :  * start from a shutdown checkpoint because we know nothing was running
    1230             :  * at that time and our recovery snapshot is known empty. In the more
    1231             :  * typical case of an online checkpoint we need to jump through a few
    1232             :  * hoops to get a correct recovery snapshot and this requires a two or
    1233             :  * sometimes a three stage process.
    1234             :  *
    1235             :  * The initial snapshot must contain all running xids and all current
    1236             :  * AccessExclusiveLocks at a point in time on the standby. Assembling
    1237             :  * that information while the server is running requires many and
    1238             :  * various LWLocks, so we choose to derive that information piece by
    1239             :  * piece and then re-assemble that info on the standby. When that
    1240             :  * information is fully assembled we move to STANDBY_SNAPSHOT_READY.
    1241             :  *
    1242             :  * Since locking on the primary when we derive the information is not
    1243             :  * strict, we note that there is a time window between the derivation and
    1244             :  * writing to WAL of the derived information. That allows race conditions
    1245             :  * that we must resolve, since xids and locks may enter or leave the
    1246             :  * snapshot during that window. This creates the issue that an xid or
    1247             :  * lock may start *after* the snapshot has been derived yet *before* the
    1248             :  * snapshot is logged in the running xacts WAL record. We resolve this by
    1249             :  * starting to accumulate changes at a point just prior to when we derive
    1250             :  * the snapshot on the primary, then ignore duplicates when we later apply
    1251             :  * the snapshot from the running xacts record. This is implemented during
    1252             :  * CreateCheckPoint() where we use the logical checkpoint location as
    1253             :  * our starting point and then write the running xacts record immediately
    1254             :  * before writing the main checkpoint WAL record. Since we always start
    1255             :  * up from a checkpoint and are immediately at our starting point, we
    1256             :  * unconditionally move to STANDBY_INITIALIZED. After this point we
    1257             :  * must do 4 things:
    1258             :  *  * move shared nextXid forwards as we see new xids
    1259             :  *  * extend the clog and subtrans with each new xid
    1260             :  *  * keep track of uncommitted known assigned xids
    1261             :  *  * keep track of uncommitted AccessExclusiveLocks
    1262             :  *
    1263             :  * When we see a commit/abort we must remove known assigned xids and locks
    1264             :  * from the completing transaction. Attempted removals that cannot locate
    1265             :  * an entry are expected and must not cause an error when we are in state
    1266             :  * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and
    1267             :  * KnownAssignedXidsRemove().
    1268             :  *
    1269             :  * Later, when we apply the running xact data we must be careful to ignore
    1270             :  * transactions already committed, since those commits raced ahead when
    1271             :  * making WAL entries.
    1272             :  *
    1273             :  * For logical decoding only the running xacts information is needed;
    1274             :  * there's no need to look at the locking information, but it's logged anyway,
    1275             :  * as there's no independent knob to just enable logical decoding. For
    1276             :  * details of how this is used, check snapbuild.c's introductory comment.
    1277             :  *
    1278             :  *
    1279             :  * Returns the RecPtr of the last inserted record.
    1280             :  */
    1281             : XLogRecPtr
    1282        2650 : LogStandbySnapshot(void)
    1283             : {
    1284             :     XLogRecPtr  recptr;
    1285             :     RunningTransactions running;
    1286             :     xl_standby_lock *locks;
    1287             :     int         nlocks;
    1288             : 
    1289             :     Assert(XLogStandbyInfoActive());
    1290             : 
    1291             : #ifdef USE_INJECTION_POINTS
    1292        2650 :     if (IS_INJECTION_POINT_ATTACHED("skip-log-running-xacts"))
    1293             :     {
    1294             :         /*
    1295             :          * This record could move slot's xmin forward during decoding, leading
    1296             :          * to unpredictable results, so skip it when requested by the test.
    1297             :          */
    1298           0 :         return GetInsertRecPtr();
    1299             :     }
    1300             : #endif
    1301             : 
    1302             :     /*
    1303             :      * Get details of any AccessExclusiveLocks being held at the moment.
    1304             :      */
    1305        2650 :     locks = GetRunningTransactionLocks(&nlocks);
    1306        2650 :     if (nlocks > 0)
    1307         282 :         LogAccessExclusiveLocks(nlocks, locks);
    1308        2650 :     pfree(locks);
    1309             : 
    1310             :     /*
    1311             :      * Log details of all in-progress transactions. This should be the last
    1312             :      * record we write, because standby will open up when it sees this.
    1313             :      */
    1314        2650 :     running = GetRunningTransactionData();
    1315             : 
    1316             :     /*
    1317             :      * GetRunningTransactionData() acquired ProcArrayLock, we must release it.
    1318             :      * For Hot Standby this can be done before inserting the WAL record
    1319             :      * because ProcArrayApplyRecoveryInfo() rechecks the commit status using
    1320             :      * the clog. For logical decoding, though, the lock can't be released
    1321             :      * early because the clog might be "in the future" from the POV of the
    1322             :      * historic snapshot. This would allow for situations where we're waiting
    1323             :      * for the end of a transaction listed in the xl_running_xacts record
    1324             :      * which, according to the WAL, has committed before the xl_running_xacts
    1325             :      * record. Fortunately this routine isn't executed frequently, and it's
    1326             :      * only a shared lock.
    1327             :      */
    1328        2650 :     if (wal_level < WAL_LEVEL_LOGICAL)
    1329        1676 :         LWLockRelease(ProcArrayLock);
    1330             : 
    1331        2650 :     recptr = LogCurrentRunningXacts(running);
    1332             : 
    1333             :     /* Release lock if we kept it longer ... */
    1334        2650 :     if (wal_level >= WAL_LEVEL_LOGICAL)
    1335         974 :         LWLockRelease(ProcArrayLock);
    1336             : 
    1337             :     /* GetRunningTransactionData() acquired XidGenLock, we must release it */
    1338        2650 :     LWLockRelease(XidGenLock);
    1339             : 
    1340        2650 :     return recptr;
    1341             : }
    1342             : 
    1343             : /*
    1344             :  * Record an enhanced snapshot of running transactions into WAL.
    1345             :  *
    1346             :  * The definitions of RunningTransactionsData and xl_running_xacts are
    1347             :  * similar. We keep them separate because xl_running_xacts is a contiguous
    1348             :  * chunk of memory and never exists fully until it is assembled in WAL.
    1349             :  * The inserted records are marked as not being important for durability,
    1350             :  * to avoid triggering superfluous checkpoint / archiving activity.
    1351             :  */
    1352             : static XLogRecPtr
    1353        2650 : LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
    1354             : {
    1355             :     xl_running_xacts xlrec;
    1356             :     XLogRecPtr  recptr;
    1357             : 
    1358        2650 :     xlrec.xcnt = CurrRunningXacts->xcnt;
    1359        2650 :     xlrec.subxcnt = CurrRunningXacts->subxcnt;
    1360        2650 :     xlrec.subxid_overflow = (CurrRunningXacts->subxid_status != SUBXIDS_IN_ARRAY);
    1361        2650 :     xlrec.nextXid = CurrRunningXacts->nextXid;
    1362        2650 :     xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
    1363        2650 :     xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid;
    1364             : 
    1365             :     /* Header */
    1366        2650 :     XLogBeginInsert();
    1367        2650 :     XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
    1368        2650 :     XLogRegisterData(&xlrec, MinSizeOfXactRunningXacts);
    1369             : 
    1370             :     /* array of TransactionIds */
    1371        2650 :     if (xlrec.xcnt > 0)
    1372         840 :         XLogRegisterData(CurrRunningXacts->xids,
    1373         840 :                          (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId));
    1374             : 
    1375        2650 :     recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS);
    1376             : 
    1377        2650 :     if (xlrec.subxid_overflow)
    1378           2 :         elog(DEBUG2,
    1379             :              "snapshot of %d running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
    1380             :              CurrRunningXacts->xcnt,
    1381             :              LSN_FORMAT_ARGS(recptr),
    1382             :              CurrRunningXacts->oldestRunningXid,
    1383             :              CurrRunningXacts->latestCompletedXid,
    1384             :              CurrRunningXacts->nextXid);
    1385             :     else
    1386        2648 :         elog(DEBUG2,
    1387             :              "snapshot of %d+%d running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
    1388             :              CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt,
    1389             :              LSN_FORMAT_ARGS(recptr),
    1390             :              CurrRunningXacts->oldestRunningXid,
    1391             :              CurrRunningXacts->latestCompletedXid,
    1392             :              CurrRunningXacts->nextXid);
    1393             : 
    1394             :     /*
    1395             :      * Ensure running_xacts information is synced to disk not too far in the
    1396             :      * future. We don't want to stall anything though (i.e. use XLogFlush()),
    1397             :      * so we let the wal writer do it during normal operation.
    1398             :      * XLogSetAsyncXactLSN() conveniently will mark the LSN as to-be-synced
    1399             :      * and nudge the WALWriter into action if sleeping. Check
    1400             :      * XLogBackgroundFlush() for details why a record might not be flushed
    1401             :      * without it.
    1402             :      */
    1403        2650 :     XLogSetAsyncXactLSN(recptr);
    1404             : 
    1405        2650 :     return recptr;
    1406             : }
    1407             : 
    1408             : /*
    1409             :  * Wholesale logging of AccessExclusiveLocks. Other lock types need not be
    1410             :  * logged, as described in backend/storage/lmgr/README.
    1411             :  */
    1412             : static void
    1413      200378 : LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks)
    1414             : {
    1415             :     xl_standby_locks xlrec;
    1416             : 
    1417      200378 :     xlrec.nlocks = nlocks;
    1418             : 
    1419      200378 :     XLogBeginInsert();
    1420      200378 :     XLogRegisterData(&xlrec, offsetof(xl_standby_locks, locks));
    1421      200378 :     XLogRegisterData(locks, nlocks * sizeof(xl_standby_lock));
    1422      200378 :     XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
    1423             : 
    1424      200378 :     (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK);
    1425      200378 : }
    1426             : 
    1427             : /*
    1428             :  * Individual logging of AccessExclusiveLocks for use during LockAcquire()
    1429             :  */
    1430             : void
    1431      200096 : LogAccessExclusiveLock(Oid dbOid, Oid relOid)
    1432             : {
    1433             :     xl_standby_lock xlrec;
    1434             : 
    1435      200096 :     xlrec.xid = GetCurrentTransactionId();
    1436             : 
    1437      200096 :     xlrec.dbOid = dbOid;
    1438      200096 :     xlrec.relOid = relOid;
    1439             : 
    1440      200096 :     LogAccessExclusiveLocks(1, &xlrec);
    1441      200096 :     MyXactFlags |= XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK;
    1442      200096 : }
    1443             : 
    1444             : /*
    1445             :  * Prepare to log an AccessExclusiveLock, for use during LockAcquire()
    1446             :  */
    1447             : void
    1448      200522 : LogAccessExclusiveLockPrepare(void)
    1449             : {
    1450             :     /*
    1451             :      * Ensure that a TransactionId has been assigned to this transaction, for
    1452             :      * two reasons, both related to lock release on the standby. First, we
    1453             :      * must assign an xid so that RecordTransactionCommit() and
    1454             :      * RecordTransactionAbort() do not optimise away the transaction
    1455             :      * completion record which recovery relies upon to release locks. It's a
    1456             :      * hack, but for a corner case not worth adding code for into the main
    1457             :      * commit path. Second, we must assign an xid before the lock is recorded
    1458             :      * in shared memory, otherwise a concurrently executing
    1459             :      * GetRunningTransactionLocks() might see a lock associated with an
    1460             :      * InvalidTransactionId which we later assert cannot happen.
    1461             :      */
    1462      200522 :     (void) GetCurrentTransactionId();
    1463      200522 : }
    1464             : 
    1465             : /*
    1466             :  * Emit WAL for invalidations. This currently is only used for commits without
    1467             :  * an xid but which contain invalidations.
    1468             :  */
    1469             : void
    1470       16928 : LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs,
    1471             :                         bool relcacheInitFileInval)
    1472             : {
    1473             :     xl_invalidations xlrec;
    1474             : 
    1475             :     /* prepare record */
    1476       16928 :     memset(&xlrec, 0, sizeof(xlrec));
    1477       16928 :     xlrec.dbId = MyDatabaseId;
    1478       16928 :     xlrec.tsId = MyDatabaseTableSpace;
    1479       16928 :     xlrec.relcacheInitFileInval = relcacheInitFileInval;
    1480       16928 :     xlrec.nmsgs = nmsgs;
    1481             : 
    1482             :     /* perform insertion */
    1483       16928 :     XLogBeginInsert();
    1484       16928 :     XLogRegisterData(&xlrec, MinSizeOfInvalidations);
    1485       16928 :     XLogRegisterData(msgs,
    1486             :                      nmsgs * sizeof(SharedInvalidationMessage));
    1487       16928 :     XLogInsert(RM_STANDBY_ID, XLOG_INVALIDATIONS);
    1488       16928 : }
    1489             : 
    1490             : /* Return the description of recovery conflict */
    1491             : static const char *
    1492          20 : get_recovery_conflict_desc(ProcSignalReason reason)
    1493             : {
    1494          20 :     const char *reasonDesc = _("unknown reason");
    1495             : 
    1496          20 :     switch (reason)
    1497             :     {
    1498           8 :         case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
    1499           8 :             reasonDesc = _("recovery conflict on buffer pin");
    1500           8 :             break;
    1501           4 :         case PROCSIG_RECOVERY_CONFLICT_LOCK:
    1502           4 :             reasonDesc = _("recovery conflict on lock");
    1503           4 :             break;
    1504           4 :         case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
    1505           4 :             reasonDesc = _("recovery conflict on tablespace");
    1506           4 :             break;
    1507           4 :         case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
    1508           4 :             reasonDesc = _("recovery conflict on snapshot");
    1509           4 :             break;
    1510           0 :         case PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT:
    1511           0 :             reasonDesc = _("recovery conflict on replication slot");
    1512           0 :             break;
    1513           0 :         case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
    1514           0 :             reasonDesc = _("recovery conflict on buffer deadlock");
    1515           0 :             break;
    1516           0 :         case PROCSIG_RECOVERY_CONFLICT_DATABASE:
    1517           0 :             reasonDesc = _("recovery conflict on database");
    1518           0 :             break;
    1519           0 :         default:
    1520           0 :             break;
    1521             :     }
    1522             : 
    1523          20 :     return reasonDesc;
    1524             : }

Generated by: LCOV version 1.14