LCOV - PostgreSQL 19devel - src/backend/storage/ipc/standby.c

LCOV - code coverage report

Current view:	top level - src/backend/storage/ipc - standby.c (source / functions)		Coverage	Total	Hit
Test:	PostgreSQL 19devel	Lines:	91.3 %	381	348
Test Date:	2026-02-17 17:20:33	Functions:	100.0 %	31	31
Legend:	Lines: hit not hit

            Line data    Source code

       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * standby.c
       4              :  *    Misc functions used in Hot Standby mode.
       5              :  *
       6              :  *  All functions for handling RM_STANDBY_ID, which relate to
       7              :  *  AccessExclusiveLocks and starting snapshots for Hot Standby mode.
       8              :  *  Plus conflict recovery processing.
       9              :  *
      10              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      11              :  * Portions Copyright (c) 1994, Regents of the University of California
      12              :  *
      13              :  * IDENTIFICATION
      14              :  *    src/backend/storage/ipc/standby.c
      15              :  *
      16              :  *-------------------------------------------------------------------------
      17              :  */
      18              : #include "postgres.h"
      19              : #include "access/transam.h"
      20              : #include "access/twophase.h"
      21              : #include "access/xact.h"
      22              : #include "access/xloginsert.h"
      23              : #include "access/xlogrecovery.h"
      24              : #include "access/xlogutils.h"
      25              : #include "miscadmin.h"
      26              : #include "pgstat.h"
      27              : #include "replication/slot.h"
      28              : #include "storage/bufmgr.h"
      29              : #include "storage/proc.h"
      30              : #include "storage/procarray.h"
      31              : #include "storage/sinvaladt.h"
      32              : #include "storage/standby.h"
      33              : #include "utils/hsearch.h"
      34              : #include "utils/injection_point.h"
      35              : #include "utils/ps_status.h"
      36              : #include "utils/timeout.h"
      37              : #include "utils/timestamp.h"
      38              : 
      39              : /* User-settable GUC parameters */
      40              : int         max_standby_archive_delay = 30 * 1000;
      41              : int         max_standby_streaming_delay = 30 * 1000;
      42              : bool        log_recovery_conflict_waits = false;
      43              : 
      44              : /*
      45              :  * Keep track of all the exclusive locks owned by original transactions.
      46              :  * For each known exclusive lock, there is a RecoveryLockEntry in the
      47              :  * RecoveryLockHash hash table.  All RecoveryLockEntrys belonging to a
      48              :  * given XID are chained together so that we can find them easily.
      49              :  * For each original transaction that is known to have any such locks,
      50              :  * there is a RecoveryLockXidEntry in the RecoveryLockXidHash hash table,
      51              :  * which stores the head of the chain of its locks.
      52              :  */
      53              : typedef struct RecoveryLockEntry
      54              : {
      55              :     xl_standby_lock key;        /* hash key: xid, dbOid, relOid */
      56              :     struct RecoveryLockEntry *next; /* chain link */
      57              : } RecoveryLockEntry;
      58              : 
      59              : typedef struct RecoveryLockXidEntry
      60              : {
      61              :     TransactionId xid;          /* hash key -- must be first */
      62              :     struct RecoveryLockEntry *head; /* chain head */
      63              : } RecoveryLockXidEntry;
      64              : 
      65              : static HTAB *RecoveryLockHash = NULL;
      66              : static HTAB *RecoveryLockXidHash = NULL;
      67              : 
      68              : /* Flags set by timeout handlers */
      69              : static volatile sig_atomic_t got_standby_deadlock_timeout = false;
      70              : static volatile sig_atomic_t got_standby_delay_timeout = false;
      71              : static volatile sig_atomic_t got_standby_lock_timeout = false;
      72              : 
      73              : static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
      74              :                                                    RecoveryConflictReason reason,
      75              :                                                    uint32 wait_event_info,
      76              :                                                    bool report_waiting);
      77              : static void SendRecoveryConflictWithBufferPin(RecoveryConflictReason reason);
      78              : static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts);
      79              : static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);
      80              : static const char *get_recovery_conflict_desc(RecoveryConflictReason reason);
      81              : 
      82              : /*
      83              :  * InitRecoveryTransactionEnvironment
      84              :  *      Initialize tracking of our primary's in-progress transactions.
      85              :  *
      86              :  * We need to issue shared invalidations and hold locks. Holding locks
      87              :  * means others may want to wait on us, so we need to make a lock table
      88              :  * vxact entry like a real transaction. We could create and delete
      89              :  * lock table entries for each transaction but it's simpler just to create
      90              :  * one permanent entry and leave it there all the time. Locks are then
      91              :  * acquired and released as needed. Yes, this means you can see the
      92              :  * Startup process in pg_locks once we have run this.
      93              :  */
      94              : void
      95          114 : InitRecoveryTransactionEnvironment(void)
      96              : {
      97              :     VirtualTransactionId vxid;
      98              :     HASHCTL     hash_ctl;
      99              : 
     100              :     Assert(RecoveryLockHash == NULL);   /* don't run this twice */
     101              : 
     102              :     /*
     103              :      * Initialize the hash tables for tracking the locks held by each
     104              :      * transaction.
     105              :      */
     106          114 :     hash_ctl.keysize = sizeof(xl_standby_lock);
     107          114 :     hash_ctl.entrysize = sizeof(RecoveryLockEntry);
     108          114 :     RecoveryLockHash = hash_create("RecoveryLockHash",
     109              :                                    64,
     110              :                                    &hash_ctl,
     111              :                                    HASH_ELEM | HASH_BLOBS);
     112          114 :     hash_ctl.keysize = sizeof(TransactionId);
     113          114 :     hash_ctl.entrysize = sizeof(RecoveryLockXidEntry);
     114          114 :     RecoveryLockXidHash = hash_create("RecoveryLockXidHash",
     115              :                                       64,
     116              :                                       &hash_ctl,
     117              :                                       HASH_ELEM | HASH_BLOBS);
     118              : 
     119              :     /*
     120              :      * Initialize shared invalidation management for Startup process, being
     121              :      * careful to register ourselves as a sendOnly process so we don't need to
     122              :      * read messages, nor will we get signaled when the queue starts filling
     123              :      * up.
     124              :      */
     125          114 :     SharedInvalBackendInit(true);
     126              : 
     127              :     /*
     128              :      * Lock a virtual transaction id for Startup process.
     129              :      *
     130              :      * We need to do GetNextLocalTransactionId() because
     131              :      * SharedInvalBackendInit() leaves localTransactionId invalid and the lock
     132              :      * manager doesn't like that at all.
     133              :      *
     134              :      * Note that we don't need to run XactLockTableInsert() because nobody
     135              :      * needs to wait on xids. That sounds a little strange, but table locks
     136              :      * are held by vxids and row level locks are held by xids. All queries
     137              :      * hold AccessShareLocks so never block while we write or lock new rows.
     138              :      */
     139          114 :     MyProc->vxid.procNumber = MyProcNumber;
     140          114 :     vxid.procNumber = MyProcNumber;
     141          114 :     vxid.localTransactionId = GetNextLocalTransactionId();
     142          114 :     VirtualXactLockTableInsert(vxid);
     143              : 
     144          114 :     standbyState = STANDBY_INITIALIZED;
     145          114 : }
     146              : 
     147              : /*
     148              :  * ShutdownRecoveryTransactionEnvironment
     149              :  *      Shut down transaction tracking
     150              :  *
     151              :  * Prepare to switch from hot standby mode to normal operation. Shut down
     152              :  * recovery-time transaction tracking.
     153              :  *
     154              :  * This must be called even in shutdown of startup process if transaction
     155              :  * tracking has been initialized. Otherwise some locks the tracked
     156              :  * transactions were holding will not be released and may interfere with
     157              :  * the processes still running (but will exit soon) at the exit of
     158              :  * startup process.
     159              :  */
     160              : void
     161          169 : ShutdownRecoveryTransactionEnvironment(void)
     162              : {
     163              :     /*
     164              :      * Do nothing if RecoveryLockHash is NULL because that means that
     165              :      * transaction tracking has not yet been initialized or has already been
     166              :      * shut down.  This makes it safe to have possibly-redundant calls of this
     167              :      * function during process exit.
     168              :      */
     169          169 :     if (RecoveryLockHash == NULL)
     170           55 :         return;
     171              : 
     172              :     /* Mark all tracked in-progress transactions as finished. */
     173          114 :     ExpireAllKnownAssignedTransactionIds();
     174              : 
     175              :     /* Release all locks the tracked transactions were holding */
     176          114 :     StandbyReleaseAllLocks();
     177              : 
     178              :     /* Destroy the lock hash tables. */
     179          114 :     hash_destroy(RecoveryLockHash);
     180          114 :     hash_destroy(RecoveryLockXidHash);
     181          114 :     RecoveryLockHash = NULL;
     182          114 :     RecoveryLockXidHash = NULL;
     183              : 
     184              :     /* Cleanup our VirtualTransaction */
     185          114 :     VirtualXactLockTableCleanup();
     186              : }
     187              : 
     188              : 
     189              : /*
     190              :  * -----------------------------------------------------
     191              :  *      Standby wait timers and backend cancel logic
     192              :  * -----------------------------------------------------
     193              :  */
     194              : 
     195              : /*
     196              :  * Determine the cutoff time at which we want to start canceling conflicting
     197              :  * transactions.  Returns zero (a time safely in the past) if we are willing
     198              :  * to wait forever.
     199              :  */
     200              : static TimestampTz
     201           28 : GetStandbyLimitTime(void)
     202              : {
     203              :     TimestampTz rtime;
     204              :     bool        fromStream;
     205              : 
     206              :     /*
     207              :      * The cutoff time is the last WAL data receipt time plus the appropriate
     208              :      * delay variable.  Delay of -1 means wait forever.
     209              :      */
     210           28 :     GetXLogReceiptTime(&rtime, &fromStream);
     211           28 :     if (fromStream)
     212              :     {
     213           28 :         if (max_standby_streaming_delay < 0)
     214            0 :             return 0;           /* wait forever */
     215           28 :         return TimestampTzPlusMilliseconds(rtime, max_standby_streaming_delay);
     216              :     }
     217              :     else
     218              :     {
     219            0 :         if (max_standby_archive_delay < 0)
     220            0 :             return 0;           /* wait forever */
     221            0 :         return TimestampTzPlusMilliseconds(rtime, max_standby_archive_delay);
     222              :     }
     223              : }
     224              : 
     225              : #define STANDBY_INITIAL_WAIT_US  1000
     226              : static int  standbyWait_us = STANDBY_INITIAL_WAIT_US;
     227              : 
     228              : /*
     229              :  * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs.
     230              :  * We wait here for a while then return. If we decide we can't wait any
     231              :  * more then we return true, if we can wait some more return false.
     232              :  */
     233              : static bool
     234           15 : WaitExceedsMaxStandbyDelay(uint32 wait_event_info)
     235              : {
     236              :     TimestampTz ltime;
     237              : 
     238           15 :     CHECK_FOR_INTERRUPTS();
     239              : 
     240              :     /* Are we past the limit time? */
     241           15 :     ltime = GetStandbyLimitTime();
     242           15 :     if (ltime && GetCurrentTimestamp() >= ltime)
     243            3 :         return true;
     244              : 
     245              :     /*
     246              :      * Sleep a bit (this is essential to avoid busy-waiting).
     247              :      */
     248           12 :     pgstat_report_wait_start(wait_event_info);
     249           12 :     pg_usleep(standbyWait_us);
     250           12 :     pgstat_report_wait_end();
     251              : 
     252              :     /*
     253              :      * Progressively increase the sleep times, but not to more than 1s, since
     254              :      * pg_usleep isn't interruptible on some platforms.
     255              :      */
     256           12 :     standbyWait_us *= 2;
     257           12 :     if (standbyWait_us > 1000000)
     258            0 :         standbyWait_us = 1000000;
     259              : 
     260           12 :     return false;
     261              : }
     262              : 
     263              : /*
     264              :  * Log the recovery conflict.
     265              :  *
     266              :  * wait_start is the timestamp when the caller started to wait.
     267              :  * now is the timestamp when this function has been called.
     268              :  * wait_list is the list of virtual transaction ids assigned to
     269              :  * conflicting processes. still_waiting indicates whether
     270              :  * the startup process is still waiting for the recovery conflict
     271              :  * to be resolved or not.
     272              :  */
     273              : void
     274           10 : LogRecoveryConflict(RecoveryConflictReason reason, TimestampTz wait_start,
     275              :                     TimestampTz now, VirtualTransactionId *wait_list,
     276              :                     bool still_waiting)
     277              : {
     278              :     long        secs;
     279              :     int         usecs;
     280              :     long        msecs;
     281              :     StringInfoData buf;
     282           10 :     int         nprocs = 0;
     283              : 
     284              :     /*
     285              :      * There must be no conflicting processes when the recovery conflict has
     286              :      * already been resolved.
     287              :      */
     288              :     Assert(still_waiting || wait_list == NULL);
     289              : 
     290           10 :     TimestampDifference(wait_start, now, &secs, &usecs);
     291           10 :     msecs = secs * 1000 + usecs / 1000;
     292           10 :     usecs = usecs % 1000;
     293              : 
     294           10 :     if (wait_list)
     295              :     {
     296              :         VirtualTransactionId *vxids;
     297              : 
     298              :         /* Construct a string listing the conflicting processes */
     299            3 :         vxids = wait_list;
     300            6 :         while (VirtualTransactionIdIsValid(*vxids))
     301              :         {
     302            3 :             PGPROC     *proc = ProcNumberGetProc(vxids->procNumber);
     303              : 
     304              :             /* proc can be NULL if the target backend is not active */
     305            3 :             if (proc)
     306              :             {
     307            3 :                 if (nprocs == 0)
     308              :                 {
     309            3 :                     initStringInfo(&buf);
     310            3 :                     appendStringInfo(&buf, "%d", proc->pid);
     311              :                 }
     312              :                 else
     313            0 :                     appendStringInfo(&buf, ", %d", proc->pid);
     314              : 
     315            3 :                 nprocs++;
     316              :             }
     317              : 
     318            3 :             vxids++;
     319              :         }
     320              :     }
     321              : 
     322              :     /*
     323              :      * If wait_list is specified, report the list of PIDs of active
     324              :      * conflicting backends in a detail message. Note that if all the backends
     325              :      * in the list are not active, no detail message is logged.
     326              :      */
     327           10 :     if (still_waiting)
     328              :     {
     329            5 :         ereport(LOG,
     330              :                 errmsg("recovery still waiting after %ld.%03d ms: %s",
     331              :                        msecs, usecs, get_recovery_conflict_desc(reason)),
     332              :                 nprocs > 0 ? errdetail_log_plural("Conflicting process: %s.",
     333              :                                                   "Conflicting processes: %s.",
     334              :                                                   nprocs, buf.data) : 0);
     335              :     }
     336              :     else
     337              :     {
     338            5 :         ereport(LOG,
     339              :                 errmsg("recovery finished waiting after %ld.%03d ms: %s",
     340              :                        msecs, usecs, get_recovery_conflict_desc(reason)));
     341              :     }
     342              : 
     343           10 :     if (nprocs > 0)
     344            3 :         pfree(buf.data);
     345           10 : }
     346              : 
     347              : /*
     348              :  * This is the main executioner for any query backend that conflicts with
     349              :  * recovery processing. Judgement has already been passed on it within
     350              :  * a specific rmgr. Here we just issue the orders to the procs. The procs
     351              :  * then throw the required error as instructed.
     352              :  *
     353              :  * If report_waiting is true, "waiting" is reported in PS display and the
     354              :  * wait for recovery conflict is reported in the log, if necessary. If
     355              :  * the caller is responsible for reporting them, report_waiting should be
     356              :  * false. Otherwise, both the caller and this function report the same
     357              :  * thing unexpectedly.
     358              :  */
     359              : static void
     360        14044 : ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
     361              :                                        RecoveryConflictReason reason,
     362              :                                        uint32 wait_event_info,
     363              :                                        bool report_waiting)
     364              : {
     365        14044 :     TimestampTz waitStart = 0;
     366        14044 :     bool        waiting = false;
     367        14044 :     bool        logged_recovery_conflict = false;
     368              : 
     369              :     /* Fast exit, to avoid a kernel call if there's no work to be done. */
     370        14044 :     if (!VirtualTransactionIdIsValid(*waitlist))
     371        14041 :         return;
     372              : 
     373              :     /* Set the wait start timestamp for reporting */
     374            3 :     if (report_waiting && (log_recovery_conflict_waits || update_process_title))
     375            2 :         waitStart = GetCurrentTimestamp();
     376              : 
     377            6 :     while (VirtualTransactionIdIsValid(*waitlist))
     378              :     {
     379              :         /* reset standbyWait_us for each xact we wait for */
     380            3 :         standbyWait_us = STANDBY_INITIAL_WAIT_US;
     381              : 
     382              :         /* wait until the virtual xid is gone */
     383           18 :         while (!VirtualXactLock(*waitlist, false))
     384              :         {
     385              :             /* Is it time to kill it? */
     386           15 :             if (WaitExceedsMaxStandbyDelay(wait_event_info))
     387              :             {
     388              :                 bool        signaled;
     389              : 
     390              :                 /*
     391              :                  * Now find out who to throw out of the balloon.
     392              :                  */
     393              :                 Assert(VirtualTransactionIdIsValid(*waitlist));
     394            3 :                 signaled = SignalRecoveryConflictWithVirtualXID(*waitlist, reason);
     395              : 
     396              :                 /*
     397              :                  * Wait a little bit for it to die so that we avoid flooding
     398              :                  * an unresponsive backend when system is heavily loaded.
     399              :                  */
     400            3 :                 if (signaled)
     401            3 :                     pg_usleep(5000L);
     402              :             }
     403              : 
     404           15 :             if (waitStart != 0 && (!logged_recovery_conflict || !waiting))
     405              :             {
     406           14 :                 TimestampTz now = 0;
     407              :                 bool        maybe_log_conflict;
     408              :                 bool        maybe_update_title;
     409              : 
     410           14 :                 maybe_log_conflict = (log_recovery_conflict_waits && !logged_recovery_conflict);
     411           14 :                 maybe_update_title = (update_process_title && !waiting);
     412              : 
     413              :                 /* Get the current timestamp if not reported yet */
     414           14 :                 if (maybe_log_conflict || maybe_update_title)
     415           14 :                     now = GetCurrentTimestamp();
     416              : 
     417              :                 /*
     418              :                  * Report via ps if we have been waiting for more than 500
     419              :                  * msec (should that be configurable?)
     420              :                  */
     421           28 :                 if (maybe_update_title &&
     422           14 :                     TimestampDifferenceExceeds(waitStart, now, 500))
     423              :                 {
     424            0 :                     set_ps_display_suffix("waiting");
     425            0 :                     waiting = true;
     426              :                 }
     427              : 
     428              :                 /*
     429              :                  * Emit the log message if the startup process is waiting
     430              :                  * longer than deadlock_timeout for recovery conflict.
     431              :                  */
     432           22 :                 if (maybe_log_conflict &&
     433            8 :                     TimestampDifferenceExceeds(waitStart, now, DeadlockTimeout))
     434              :                 {
     435            2 :                     LogRecoveryConflict(reason, waitStart, now, waitlist, true);
     436            2 :                     logged_recovery_conflict = true;
     437              :                 }
     438              :             }
     439              :         }
     440              : 
     441              :         /* The virtual transaction is gone now, wait for the next one */
     442            3 :         waitlist++;
     443              :     }
     444              : 
     445              :     /*
     446              :      * Emit the log message if recovery conflict was resolved but the startup
     447              :      * process waited longer than deadlock_timeout for it.
     448              :      */
     449            3 :     if (logged_recovery_conflict)
     450            2 :         LogRecoveryConflict(reason, waitStart, GetCurrentTimestamp(),
     451              :                             NULL, false);
     452              : 
     453              :     /* reset ps display to remove the suffix if we added one */
     454            3 :     if (waiting)
     455            0 :         set_ps_display_remove_suffix();
     456              : 
     457              : }
     458              : 
     459              : /*
     460              :  * Generate whatever recovery conflicts are needed to eliminate snapshots that
     461              :  * might see XIDs <= snapshotConflictHorizon as still running.
     462              :  *
     463              :  * snapshotConflictHorizon cutoffs are our standard approach to generating
     464              :  * granular recovery conflicts.  Note that InvalidTransactionId values are
     465              :  * interpreted as "definitely don't need any conflicts" here, which is a
     466              :  * general convention that WAL records can (and often do) depend on.
     467              :  */
     468              : void
     469        16874 : ResolveRecoveryConflictWithSnapshot(TransactionId snapshotConflictHorizon,
     470              :                                     bool isCatalogRel,
     471              :                                     RelFileLocator locator)
     472              : {
     473              :     VirtualTransactionId *backends;
     474              : 
     475              :     /*
     476              :      * If we get passed InvalidTransactionId then we do nothing (no conflict).
     477              :      *
     478              :      * This can happen when replaying already-applied WAL records after a
     479              :      * standby crash or restart, or when replaying an XLOG_HEAP2_VISIBLE
     480              :      * record that marks as frozen a page which was already all-visible.  It's
     481              :      * also quite common with records generated during index deletion
     482              :      * (original execution of the deletion can reason that a recovery conflict
     483              :      * which is sufficient for the deletion operation must take place before
     484              :      * replay of the deletion record itself).
     485              :      */
     486        16874 :     if (!TransactionIdIsValid(snapshotConflictHorizon))
     487         2832 :         return;
     488              : 
     489              :     Assert(TransactionIdIsNormal(snapshotConflictHorizon));
     490        14042 :     backends = GetConflictingVirtualXIDs(snapshotConflictHorizon,
     491              :                                          locator.dbOid);
     492        14042 :     ResolveRecoveryConflictWithVirtualXIDs(backends,
     493              :                                            RECOVERY_CONFLICT_SNAPSHOT,
     494              :                                            WAIT_EVENT_RECOVERY_CONFLICT_SNAPSHOT,
     495              :                                            true);
     496              : 
     497              :     /*
     498              :      * Note that WaitExceedsMaxStandbyDelay() is not taken into account here
     499              :      * (as opposed to ResolveRecoveryConflictWithVirtualXIDs() above). That
     500              :      * seems OK, given that this kind of conflict should not normally be
     501              :      * reached, e.g. due to using a physical replication slot.
     502              :      */
     503        14042 :     if (IsLogicalDecodingEnabled() && isCatalogRel)
     504           16 :         InvalidateObsoleteReplicationSlots(RS_INVAL_HORIZON, 0, locator.dbOid,
     505              :                                            snapshotConflictHorizon);
     506              : }
     507              : 
     508              : /*
     509              :  * Variant of ResolveRecoveryConflictWithSnapshot() that takes the conflict
     510              :  * horizon as a FullTransactionId; it may decide that nothing needs doing.
     511              :  */
     512              : void
     513           69 : ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId snapshotConflictHorizon,
     514              :                                            bool isCatalogRel,
     515              :                                            RelFileLocator locator)
     516              : {
     517              :     /*
     518              :      * ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds,
     519              :      * so truncate the logged FullTransactionId.  If the logged value is so
     520              :      * old that XID wrap-around has already happened on it, there can't be
     521              :      * any snapshots that still see it, so in that case we do nothing at all.
     522              :      */
     523           69 :     FullTransactionId nextXid = ReadNextFullTransactionId();
     524              :     uint64      diff;
     525              : 
     526           69 :     diff = U64FromFullTransactionId(nextXid) -
     527           69 :         U64FromFullTransactionId(snapshotConflictHorizon);
     528           69 :     if (diff < MaxTransactionId / 2)
     529              :     {
     530              :         TransactionId truncated;
     531              : 
     532           69 :         truncated = XidFromFullTransactionId(snapshotConflictHorizon);
     533           69 :         ResolveRecoveryConflictWithSnapshot(truncated,
     534              :                                             isCatalogRel,
     535              :                                             locator);
     536              :     }
     537           69 : }
     538              : 
     539              : void
     540            1 : ResolveRecoveryConflictWithTablespace(Oid tsid)
     541              : {
     542              :     VirtualTransactionId *temp_file_users;
     543              : 
     544              :     /*
     545              :      * Standby users may be currently using this tablespace for their
     546              :      * temporary files. We only care about current users because the
     547              :      * temp_tablespaces parameter will just ignore tablespaces that no
     548              :      * longer exist.
     549              :      *
     550              :      * Ask everybody to cancel their queries immediately so we can ensure no
     551              :      * temp files remain and we can remove the tablespace. Nuke the entire
     552              :      * site from orbit, it's the only way to be sure.
     553              :      *
     554              :      * XXX: tsid is currently unused.  We could work out the pids of active
     555              :      * backends using this tablespace by examining the temp filenames in the
     556              :      * directory, then convert the pids into VirtualXIDs before attempting
     557              :      * to cancel them.
     558              :      *
     559              :      * We don't wait for commit because drop tablespace is non-transactional.
     560              :      */
     561            1 :     temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId,
     562              :                                                 InvalidOid);
     563            1 :     ResolveRecoveryConflictWithVirtualXIDs(temp_file_users,
     564              :                                            RECOVERY_CONFLICT_TABLESPACE,
     565              :                                            WAIT_EVENT_RECOVERY_CONFLICT_TABLESPACE,
     566              :                                            true);
     567            1 : }
     568              : 
     569              : void
     570           14 : ResolveRecoveryConflictWithDatabase(Oid dbid)
     571              : {
     572              :     /*
     573              :      * We don't do ResolveRecoveryConflictWithVirtualXIDs() here since that
     574              :      * only waits for transactions and completely idle sessions would block
     575              :      * us. This is rare enough that we do this as simply as possible: no wait,
     576              :      * just force them off immediately, repeating until none remain.
     577              :      *
     578              :      * No locking is required here because we already acquired
     579              :      * AccessExclusiveLock. Anybody trying to connect while we do this will
     580              :      * block during InitPostgres() and then disconnect when they see the
     581              :      * database has been removed.
     582              :      */
     583           16 :     while (CountDBBackends(dbid) > 0)
     584              :     {
     585            2 :         SignalRecoveryConflictWithDatabase(dbid, RECOVERY_CONFLICT_DATABASE);
     586              : 
     587              :         /*
     588              :          * Wait a while for them to die before re-checking, so that we avoid
     589              :          * flooding an unresponsive backend when the system is heavily loaded.
     590              :          */
     591            2 :         pg_usleep(10000);
     592              :     }
     593           14 : }
     594              : 
     595              : /*
     596              :  * ResolveRecoveryConflictWithLock is called from ProcSleep()
     597              :  * to resolve conflicts with other backends holding relation locks.
     598              :  *
     599              :  * The WaitLatch sleep normally done in ProcSleep()
     600              :  * (when not InHotStandby) is performed here, for code clarity.
     601              :  *
     602              :  * We either resolve conflicts immediately or set a timeout to wake us at
     603              :  * the limit of our patience.
     604              :  *
     605              :  * Resolve conflicts by canceling all backends holding a conflicting
     606              :  * lock.  As we are already queued to be granted the lock, no new lock
     607              :  * requests conflicting with ours will be granted in the meantime.
     608              :  *
     609              :  * We also must check for deadlocks involving the Startup process and
     610              :  * hot-standby backend processes. If deadlock_timeout is reached in
     611              :  * this function, all the backends holding the conflicting locks are
     612              :  * requested to check themselves for deadlocks.
     613              :  *
     614              :  * logging_conflict should be true if the recovery conflict has not been
     615              :  * logged yet even though logging is enabled. After deadlock_timeout is
     616              :  * reached and the request for deadlock check is sent, we wait again to
     617              :  * be signaled by the release of the lock if logging_conflict is false.
     618              :  * Otherwise we return without waiting again so that the caller can report
     619              :  * the recovery conflict. In that case, this function is then called again
     620              :  * with logging_conflict=false (because the recovery conflict has already
     621              :  * been logged) and we will wait again for the lock to be released.
     622              :  */
     623              : void
     624            4 : ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict)
     625              : {
     626              :     TimestampTz ltime;
     627              :     TimestampTz now;
     628              : 
     629              :     Assert(InHotStandby);
     630              : 
     631            4 :     ltime = GetStandbyLimitTime();
     632            4 :     now = GetCurrentTimestamp();
     633              : 
     634              :     /*
     635              :      * Update waitStart if first time through after the startup process
     636              :      * started waiting for the lock. It should not be updated every time
     637              :      * ResolveRecoveryConflictWithLock() is called during the wait.
     638              :      *
     639              :      * Use the current time obtained for comparison with ltime as waitStart
     640              :      * (i.e., the time when this process started waiting for the lock). Since
     641              :      * getting the current time newly can cause overhead, we reuse the
     642              :      * already-obtained time to avoid that overhead.
     643              :      *
     644              :      * Note that waitStart is updated without holding the lock table's
     645              :      * partition lock, to avoid the overhead by additional lock acquisition.
     646              :      * This can cause "waitstart" in pg_locks to become NULL for a very short
     647              :      * period of time after the wait started even though "granted" is false.
     648              :      * This is OK in practice because we can assume that users are likely to
     649              :      * look at "waitstart" when waiting for the lock for a long time.
     650              :      */
     651            4 :     if (pg_atomic_read_u64(&MyProc->waitStart) == 0)
     652            1 :         pg_atomic_write_u64(&MyProc->waitStart, now);
     653              : 
     654            4 :     if (now >= ltime && ltime != 0)
     655            1 :     {
     656              :         /*
     657              :          * We're already behind, so clear a path as quickly as possible.
     658              :          */
     659              :         VirtualTransactionId *backends;
     660              : 
     661            1 :         backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
     662              : 
     663              :         /*
     664              :          * Prevent ResolveRecoveryConflictWithVirtualXIDs() from reporting
     665              :          * "waiting" in PS display by disabling its argument report_waiting
     666              :          * because the caller, WaitOnLock(), has already reported that.
     667              :          */
     668            1 :         ResolveRecoveryConflictWithVirtualXIDs(backends,
     669              :                                                RECOVERY_CONFLICT_LOCK,
     670            1 :                                                PG_WAIT_LOCK | locktag.locktag_type,
     671              :                                                false);
     672              :     }
     673              :     else
     674              :     {
     675              :         /*
     676              :          * Wait (or wait again) until ltime, and check for deadlocks as well
     677              :          * if we will be waiting longer than deadlock_timeout
     678              :          */
     679              :         EnableTimeoutParams timeouts[2];
     680            3 :         int         cnt = 0;
     681              : 
     682            3 :         if (ltime != 0)
     683              :         {
     684            3 :             got_standby_lock_timeout = false;
     685            3 :             timeouts[cnt].id = STANDBY_LOCK_TIMEOUT;
     686            3 :             timeouts[cnt].type = TMPARAM_AT;
     687            3 :             timeouts[cnt].fin_time = ltime;
     688            3 :             cnt++;
     689              :         }
     690              : 
     691            3 :         got_standby_deadlock_timeout = false;
     692            3 :         timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
     693            3 :         timeouts[cnt].type = TMPARAM_AFTER;
     694            3 :         timeouts[cnt].delay_ms = DeadlockTimeout;
     695            3 :         cnt++;
     696              : 
     697            3 :         enable_timeouts(timeouts, cnt);
     698              :     }
     699              : 
     700              :     /* Wait to be signaled by the release of the Relation Lock */
     701            4 :     ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
     702              : 
     703              :     /*
     704              :      * Exit if ltime is reached. Then all the backends holding conflicting
     705              :      * locks will be canceled in the next ResolveRecoveryConflictWithLock()
     706              :      * call.
     707              :      */
     708            4 :     if (got_standby_lock_timeout)
     709            0 :         goto cleanup;
     710              : 
     711            4 :     if (got_standby_deadlock_timeout)
     712              :     {
     713              :         VirtualTransactionId *backends;
     714              : 
     715            2 :         backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
     716              : 
     717              :         /* Quick exit if there's no work to be done */
     718            2 :         if (!VirtualTransactionIdIsValid(*backends))
     719            0 :             goto cleanup;
     720              : 
     721              :         /*
     722              :          * Send signals to all the backends holding the conflicting locks, to
     723              :          * ask them to check themselves for deadlocks.
     724              :          */
     725            4 :         while (VirtualTransactionIdIsValid(*backends))
     726              :         {
     727            2 :             (void) SignalRecoveryConflictWithVirtualXID(*backends,
     728              :                                                         RECOVERY_CONFLICT_STARTUP_DEADLOCK);
     729            2 :             backends++;
     730              :         }
     731              : 
     732              :         /*
     733              :          * Exit if the recovery conflict has not been logged yet even though
     734              :          * logging is enabled, so that the caller can log that. Then
     735              :          * ResolveRecoveryConflictWithLock() is called again and we will wait
     736              :          * again for the lock to be released.
     737              :          */
     738            2 :         if (logging_conflict)
     739            1 :             goto cleanup;
     740              : 
     741              :         /*
     742              :          * Wait again here to be signaled by the release of the Relation Lock,
     743              :          * to prevent a subsequent ResolveRecoveryConflictWithLock() call from
     744              :          * hitting deadlock_timeout and requesting a deadlock check again.
     745              :          * Otherwise the request continues to be sent every deadlock_timeout
     746              :          * until the relation locks are released or ltime is reached.
     747              :          */
     748            1 :         got_standby_deadlock_timeout = false;
     749            1 :         ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
     750              :     }
     751              : 
     752            2 : cleanup:
     753              : 
     754              :     /*
     755              :      * Clear any timeout requests established above.  We assume here that the
     756              :      * Startup process doesn't have any other outstanding timeouts than those
     757              :      * used by this function. If that stops being true, we could cancel the
     758              :      * timeouts individually, but that'd be slower.
     759              :      */
     760            4 :     disable_all_timeouts(false);
     761            4 :     got_standby_lock_timeout = false;
     762            4 :     got_standby_deadlock_timeout = false;
     763            4 : }
     764              : 
     765              : /*
     766              :  * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup()
     767              :  * to resolve conflicts with other backends holding buffer pins.
     768              :  *
     769              :  * The ProcWaitForSignal() sleep normally done in LockBufferForCleanup()
     770              :  * (when not InHotStandby) is performed here, for code clarity.
     771              :  *
     772              :  * We either resolve conflicts immediately or set a timeout to wake us at
     773              :  * the limit of our patience.
     774              :  *
     775              :  * Resolve conflicts by sending a PROCSIG signal to all backends to check if
     776              :  * they hold one of the buffer pins that is blocking Startup process. If so,
     777              :  * those backends will take an appropriate error action, ERROR or FATAL.
     778              :  *
     779              :  * We also must check for deadlocks.  Deadlocks occur because if queries
     780              :  * wait on a lock, that must be behind an AccessExclusiveLock, which can only
     781              :  * be cleared if the Startup process replays a transaction completion record.
     782              :  * If Startup process is also waiting then that is a deadlock. The deadlock
     783              :  * can occur if the query is waiting and then the Startup sleeps, or if
     784              :  * Startup is sleeping and the query waits on a lock. We protect against
     785              :  * only the former sequence here, the latter sequence is checked prior to
     786              :  * the query sleeping, in CheckRecoveryConflictDeadlock().
     787              :  *
     788              :  * Deadlocks are extremely rare, and relatively expensive to check for,
     789              :  * so we don't do a deadlock check right away ... only if we have had to wait
     790              :  * at least deadlock_timeout.
     791              :  */
     792              : void
     793            9 : ResolveRecoveryConflictWithBufferPin(void)
     794              : {
     795              :     TimestampTz ltime;
     796              : 
     797              :     Assert(InHotStandby);
     798              : 
     799            9 :     ltime = GetStandbyLimitTime();
     800              : 
     801            9 :     if (GetCurrentTimestamp() >= ltime && ltime != 0)
     802              :     {
     803              :         /*
     804              :          * We're already behind, so clear a path as quickly as possible.
     805              :          */
     806            1 :         SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN);
     807              :     }
     808              :     else
     809              :     {
     810              :         /*
     811              :          * Wake up at ltime, and check for deadlocks as well if we will be
     812              :          * waiting longer than deadlock_timeout
     813              :          */
     814              :         EnableTimeoutParams timeouts[2];
     815            8 :         int         cnt = 0;
     816              : 
     817            8 :         if (ltime != 0)
     818              :         {
     819            8 :             timeouts[cnt].id = STANDBY_TIMEOUT;
     820            8 :             timeouts[cnt].type = TMPARAM_AT;
     821            8 :             timeouts[cnt].fin_time = ltime;
     822            8 :             cnt++;
     823              :         }
     824              : 
     825            8 :         got_standby_deadlock_timeout = false;
     826            8 :         timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
     827            8 :         timeouts[cnt].type = TMPARAM_AFTER;
     828            8 :         timeouts[cnt].delay_ms = DeadlockTimeout;
     829            8 :         cnt++;
     830              : 
     831            8 :         enable_timeouts(timeouts, cnt);
     832              :     }
     833              : 
     834              :     /*
     835              :      * Wait to be signaled by UnpinBuffer() or for the wait to be interrupted
     836              :      * by one of the timeouts established above.
     837              :      *
     838              :      * We assume that only UnpinBuffer() and the timeout requests established
     839              :      * above can wake us up here. WakeupRecovery() called by walreceiver or
     840              :      * SIGHUP signal handler, etc. cannot do that because it uses a different
     841              :      * latch from the one that ProcWaitForSignal() waits on.
     842              :      */
     843            9 :     ProcWaitForSignal(WAIT_EVENT_BUFFER_CLEANUP);
     844              : 
     845            9 :     if (got_standby_delay_timeout)
     846            1 :         SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN);
     847            8 :     else if (got_standby_deadlock_timeout)
     848              :     {
     849              :         /*
     850              :          * Send out a request for hot-standby backends to check themselves for
     851              :          * deadlocks.
     852              :          *
     853              :          * XXX The subsequent ResolveRecoveryConflictWithBufferPin() will wait
     854              :          * to be signaled by UnpinBuffer() again and send a request for a
     855              :          * deadlock check if deadlock_timeout happens. This causes the
     856              :          * request to continue to be sent every deadlock_timeout until the
     857              :          * buffer is unpinned or ltime is reached. This would increase the
     858              :          * workload in the startup process and backends. In practice it may
     859              :          * not be so harmful because the period that the buffer is kept pinned
     860              :          * is basically not so long. But we should fix this?
     861              :          */
     862            5 :         SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK);
     863              :     }
     864              : 
     865              :     /*
     866              :      * Clear any timeout requests established above.  We assume here that the
     867              :      * Startup process doesn't have any other timeouts than what this function
     868              :      * uses.  If that stops being true, we could cancel the timeouts
     869              :      * individually, but that'd be slower.
     870              :      */
     871            9 :     disable_all_timeouts(false);
     872            9 :     got_standby_delay_timeout = false;
     873            9 :     got_standby_deadlock_timeout = false;
     874            9 : }
     875              : 
     876              : static void
     877            7 : SendRecoveryConflictWithBufferPin(RecoveryConflictReason reason)
     878              : {
     879              :     Assert(reason == RECOVERY_CONFLICT_BUFFERPIN ||
     880              :            reason == RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK);
     881              : 
     882              :     /*
     883              :      * We send a signal to all backends to ask them whether they are holding
     884              :      * the buffer pin that is delaying the Startup process. Most of them will
     885              :      * be innocent, but we let the SIGUSR1 handling in each backend decide
     886              :      * its own fate.
     887              :      */
     888            7 :     SignalRecoveryConflictWithDatabase(InvalidOid, reason);
     889            7 : }
     890              : 
     891              : /*
     892              :  * In Hot Standby, perform early deadlock detection.  We abort the lock
     893              :  * wait if we are about to sleep while holding the buffer pin that the
     894              :  * Startup process is waiting for.
     895              :  *
     896              :  * Note: this code is pessimistic, because there is no way for it to
     897              :  * determine whether an actual deadlock condition is present: the lock we
     898              :  * need to wait for might be unrelated to any held by the Startup process.
     899              :  * Sooner or later, this mechanism should get ripped out in favor of somehow
     900              :  * accounting for buffer locks in DeadLockCheck().  However, errors here
     901              :  * seem to be very low-probability in practice, so for now it's not worth
     902              :  * the trouble.
     903              :  */
     904              : void
     905            1 : CheckRecoveryConflictDeadlock(void)
     906              : {
     907              :     Assert(!InRecovery);        /* do not call in Startup process */
     908              : 
     909            1 :     if (!HoldingBufferPinThatDelaysRecovery())
     910            1 :         return;
     911              : 
     912              :     /*
     913              :      * The error message should match ProcessInterrupts(), but we avoid
     914              :      * calling that because we aren't handling an interrupt at this point.
     915              :      * Note that we only cancel the current transaction here, so if we are in
     916              :      * a subtransaction and the pin is held by a parent, then the Startup
     917              :      * process will continue to wait even though we have avoided deadlock.
     918              :      */
     919            0 :     ereport(ERROR,
     920              :             (errcode(ERRCODE_T_R_DEADLOCK_DETECTED),
     921              :              errmsg("canceling statement due to conflict with recovery"),
     922              :              errdetail("User transaction caused buffer deadlock with recovery.")));
     923              : }
     924              : 
     925              : 
     926              : /* --------------------------------
     927              :  *      timeout handler routines
     928              :  * --------------------------------
     929              :  */
     930              : 
     931              : /*
     932              :  * Timeout handler for STANDBY_DEADLOCK_TIMEOUT: just sets
     933              :  * got_standby_deadlock_timeout, which the waiting code tests afterwards.
     934              :  */
     935              : void
     936            7 : StandbyDeadLockHandler(void)
     937              : {
     938            7 :     got_standby_deadlock_timeout = true;
     939            7 : }
     940              : 
     941              : /*
     942              :  * Timeout handler for STANDBY_TIMEOUT: just sets got_standby_delay_timeout.
     943              :  */
     944              : void
     945            1 : StandbyTimeoutHandler(void)
     946              : {
     947            1 :     got_standby_delay_timeout = true;
     948            1 : }
     949              : 
     950              : /*
     951              :  * Timeout handler for STANDBY_LOCK_TIMEOUT: just sets got_standby_lock_timeout.
     952              :  */
     953              : void
     954            1 : StandbyLockTimeoutHandler(void)
     955              : {
     956            1 :     got_standby_lock_timeout = true;
     957            1 : }
     958              : 
     959              : /*
     960              :  * -----------------------------------------------------
     961              :  * Locking in Recovery Mode
     962              :  * -----------------------------------------------------
     963              :  *
     964              :  * All locks are held by the Startup process using a single virtual
     965              :  * transaction. This implementation is both simpler and in some senses,
     966              :  * more correct. The locks held mean "some original transaction held
     967              :  * this lock, so query access is not allowed at this time". So the Startup
     968              :  * process is the proxy by which the original locks are implemented.
     969              :  *
     970              :  * We only keep track of AccessExclusiveLocks, which are only ever held by
     971              :  * one transaction on one relation.
     972              :  *
     973              :  * We keep a table of known locks in the RecoveryLockHash hash table.
     974              :  * The point of that table is to let us efficiently de-duplicate locks,
     975              :  * which is important because checkpoints will re-report the same locks
     976              :  * already held.  There is also a RecoveryLockXidHash table with one entry
     977              :  * per xid, which allows us to efficiently find all the locks held by a
     978              :  * given original transaction.
     979              :  *
     980              :  * We use session locks rather than normal locks so we don't need
     981              :  * ResourceOwners.
     982              :  */
     983              : 
     984              : 
     985              : void
     986        27456 : StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
     987              : {
     988              :     RecoveryLockXidEntry *xidentry;
     989              :     RecoveryLockEntry *lockentry;
     990              :     xl_standby_lock key;
     991              :     LOCKTAG     locktag;
     992              :     bool        found;
     993              : 
     994              :     /* Nothing to do if xid is invalid or already known committed/aborted */
     995        54912 :     if (!TransactionIdIsValid(xid) ||
     996        54862 :         TransactionIdDidCommit(xid) ||
     997        27406 :         TransactionIdDidAbort(xid))
     998           50 :         return;
     999              : 
    1000        27406 :     elog(DEBUG4, "adding recovery lock: db %u rel %u", dbOid, relOid);
    1001              : 
    1002              :     /* dbOid is InvalidOid when we are locking a shared relation. */
    1003              :     Assert(OidIsValid(relOid));
    1004              : 
    1005              :     /* Create a hash entry for this xid, if we don't have one already. */
    1006        27406 :     xidentry = hash_search(RecoveryLockXidHash, &xid, HASH_ENTER, &found);
    1007        27406 :     if (!found)
    1008              :     {
    1009              :         Assert(xidentry->xid == xid);    /* dynahash should have set this */
    1010        11065 :         xidentry->head = NULL;
    1011              :     }
    1012              : 
    1013              :     /* Create a hash entry for this lock, unless we have one already. */
    1014        27406 :     key.xid = xid;
    1015        27406 :     key.dbOid = dbOid;
    1016        27406 :     key.relOid = relOid;
    1017        27406 :     lockentry = hash_search(RecoveryLockHash, &key, HASH_ENTER, &found);
    1018        27406 :     if (!found)
    1019              :     {
    1020              :         /* It's new, so push it onto the front of the XID's list ... */
    1021        25941 :         lockentry->next = xidentry->head;
    1022        25941 :         xidentry->head = lockentry;
    1023              : 
    1024              :         /* ... and acquire the lock locally. */
    1025        25941 :         SET_LOCKTAG_RELATION(locktag, dbOid, relOid);
    1026              : 
    1027        25941 :         (void) LockAcquire(&locktag, AccessExclusiveLock, true, false);
    1028              :     }
    1029              : }
    1030              : 
    1031              : /*
    1032              :  * Release all the locks in this RecoveryLockXidEntry's list, and forget them.
    1033              :  */
    1034              : static void
    1035        11065 : StandbyReleaseXidEntryLocks(RecoveryLockXidEntry *xidentry)
    1036              : {
    1037              :     RecoveryLockEntry *entry;
    1038              :     RecoveryLockEntry *next;
    1039              : 
    1040        37006 :     for (entry = xidentry->head; entry != NULL; entry = next)
    1041              :     {
    1042              :         LOCKTAG     locktag;
    1043              : 
    1044        25941 :         elog(DEBUG4,
    1045              :              "releasing recovery lock: xid %u db %u rel %u",
    1046              :              entry->key.xid, entry->key.dbOid, entry->key.relOid);
    1047              :         /* Release the lock ... */
    1048        25941 :         SET_LOCKTAG_RELATION(locktag, entry->key.dbOid, entry->key.relOid);
    1049        25941 :         if (!LockRelease(&locktag, AccessExclusiveLock, true))
    1050              :         {
    1051            0 :             elog(LOG,
    1052              :                  "RecoveryLockHash contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
    1053              :                  entry->key.xid, entry->key.dbOid, entry->key.relOid);
    1054              :             Assert(false);
    1055              :         }
    1056              :         /* ... and remove the per-lock hash entry (fetch next link first) */
    1057        25941 :         next = entry->next;
    1058        25941 :         hash_search(RecoveryLockHash, entry, HASH_REMOVE, NULL);
    1059              :     }
    1060              : 
    1061        11065 :     xidentry->head = NULL;       /* just for paranoia */
    1062        11065 : }
    1063              : 
    1064              : /*
    1065              :  * Release locks for a specific XID, or all locks if it's InvalidTransactionId.
    1066              :  */
    1067              : static void
    1068        11749 : StandbyReleaseLocks(TransactionId xid)
    1069              : {
    1070              :     RecoveryLockXidEntry *entry;
    1071              : 
    1072        11749 :     if (TransactionIdIsValid(xid))
    1073              :     {
    1074        11749 :         if ((entry = hash_search(RecoveryLockXidHash, &xid, HASH_FIND, NULL)))
    1075              :         {
    1076        11065 :             StandbyReleaseXidEntryLocks(entry);
    1077        11065 :             hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
    1078              :         }
    1079              :     }
    1080              :     else
    1081            0 :         StandbyReleaseAllLocks();
    1082        11749 : }
    1083              : 
    1084              : /*
    1085              :  * Release locks for a transaction tree (xid plus each of its subxids),
    1086              :  * removing the corresponding entries from RecoveryLockXidHash.
    1087              :  *
    1088              :  * Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode,
    1089              :  * to remove any AccessExclusiveLocks requested by a transaction.
    1090              :  */
    1091              : void
    1092        11246 : StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids)
    1093              : {
    1094              :     int         i;
    1095              : 
    1096        11246 :     StandbyReleaseLocks(xid);
    1097              : 
    1098        11749 :     for (i = 0; i < nsubxids; i++)
    1099          503 :         StandbyReleaseLocks(subxids[i]);
    1100        11246 : }
    1101              : 
    1102              : /*
    1103              :  * Called at end of recovery and when we see a shutdown checkpoint.
    1104              :  */
    1105              : void
    1106          114 : StandbyReleaseAllLocks(void)
    1107              : {
    1108              :     HASH_SEQ_STATUS status;
    1109              :     RecoveryLockXidEntry *entry;
    1110              : 
    1111          114 :     elog(DEBUG2, "release all standby locks");
    1112              : 
    1113          114 :     hash_seq_init(&status, RecoveryLockXidHash);
    1114          114 :     while ((entry = hash_seq_search(&status)))
    1115              :     {
    1116            0 :         StandbyReleaseXidEntryLocks(entry);
    1117            0 :         hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
    1118              :     }
    1119          114 : }
    1120              : 
    1121              : /*
    1122              :  * StandbyReleaseOldLocks
    1123              :  *      Release standby locks held by top-level XIDs that aren't running,
    1124              :  *      as long as they're not prepared transactions.
    1125              :  *
    1126              :  * This is needed to prune the locks of crashed transactions, which didn't
    1127              :  * write an ABORT/COMMIT record.
    1128              :  */
    1129              : void
    1130          822 : StandbyReleaseOldLocks(TransactionId oldxid)
    1131              : {
    1132              :     HASH_SEQ_STATUS status;
    1133              :     RecoveryLockXidEntry *entry;
    1134              : 
    1135          822 :     hash_seq_init(&status, RecoveryLockXidHash);
    1136         1142 :     while ((entry = hash_seq_search(&status)))
    1137              :     {
    1138              :         Assert(TransactionIdIsValid(entry->xid));
    1139              : 
    1140              :         /* Skip if prepared transaction. */
    1141          320 :         if (StandbyTransactionIdIsPrepared(entry->xid))
    1142            0 :             continue;
    1143              : 
    1144              :         /* Skip if >= oldxid. */
    1145          320 :         if (!TransactionIdPrecedes(entry->xid, oldxid))
    1146          320 :             continue;
    1147              : 
    1148              :         /* Remove all locks and hash table entry. */
    1149            0 :         StandbyReleaseXidEntryLocks(entry);
    1150            0 :         hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
    1151              :     }
    1152          822 : }
    1153              : 
    1154              : /*
    1155              :  * --------------------------------------------------------------------
    1156              :  *      Recovery handling for Rmgr RM_STANDBY_ID
    1157              :  *
    1158              :  * These record types will only be created if XLogStandbyInfoActive()
    1159              :  * --------------------------------------------------------------------
    1160              :  */
    1161              : 
    1162              : void
    1163        27653 : standby_redo(XLogReaderState *record)
    1164              : {
    1165        27653 :     uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    1166              : 
    1167              :     /* Backup blocks are not used in standby records */
    1168              :     Assert(!XLogRecHasAnyBlockRefs(record));
    1169              : 
    1170              :     /* Do nothing if we're not in hot standby mode */
    1171        27653 :     if (standbyState == STANDBY_DISABLED)
    1172          154 :         return;
    1173              : 
    1174        27499 :     if (info == XLOG_STANDBY_LOCK)
    1175              :     {
    1176        26114 :         xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record);
    1177              :         int         i;
    1178              : 
    1179        53570 :         for (i = 0; i < xlrec->nlocks; i++)
    1180        27456 :             StandbyAcquireAccessExclusiveLock(xlrec->locks[i].xid,
    1181              :                                               xlrec->locks[i].dbOid,
    1182              :                                               xlrec->locks[i].relOid);
    1183              :     }
    1184         1385 :     else if (info == XLOG_RUNNING_XACTS)
    1185              :     {
    1186          760 :         xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record);
    1187              :         RunningTransactionsData running;
    1188              : 
    1189          760 :         running.xcnt = xlrec->xcnt;
    1190          760 :         running.subxcnt = xlrec->subxcnt;
    1191          760 :         running.subxid_status = xlrec->subxid_overflow ? SUBXIDS_MISSING : SUBXIDS_IN_ARRAY;
    1192          760 :         running.nextXid = xlrec->nextXid;
    1193          760 :         running.latestCompletedXid = xlrec->latestCompletedXid;
    1194          760 :         running.oldestRunningXid = xlrec->oldestRunningXid;
    1195          760 :         running.xids = xlrec->xids;
    1196              : 
    1197          760 :         ProcArrayApplyRecoveryInfo(&running);
    1198              : 
    1199              :         /*
    1200              :          * The startup process currently has no convenient way to schedule
    1201              :          * stats to be reported. XLOG_RUNNING_XACTS records are issued at a
    1202              :          * regular cadence, making this a convenient location to report stats.
    1203              :          * While these records aren't generated with wal_level=minimal, stats
    1204              :          * also cannot be accessed during WAL replay.
    1205              :          */
    1206          760 :         pgstat_report_stat(true);
    1207              :     }
    1208          625 :     else if (info == XLOG_INVALIDATIONS)
    1209              :     {
    1210          625 :         xl_invalidations *xlrec = (xl_invalidations *) XLogRecGetData(record);
    1211              : 
    1212          625 :         ProcessCommittedInvalidationMessages(xlrec->msgs,
    1213              :                                              xlrec->nmsgs,
    1214          625 :                                              xlrec->relcacheInitFileInval,
    1215              :                                              xlrec->dbId,
    1216              :                                              xlrec->tsId);
    1217              :     }
    1218              :     else
    1219            0 :         elog(PANIC, "standby_redo: unknown op code %u", info);
    1220              : }
    1221              : 
    1222              : /*
    1223              :  * Log details of the current snapshot to WAL. This allows the snapshot state
    1224              :  * to be reconstructed on the standby and for logical decoding.
    1225              :  *
    1226              :  * This is used for Hot Standby as follows:
    1227              :  *
    1228              :  * We can move directly to STANDBY_SNAPSHOT_READY at startup if we
    1229              :  * start from a shutdown checkpoint because we know nothing was running
    1230              :  * at that time and our recovery snapshot is known empty. In the more
    1231              :  * typical case of an online checkpoint we need to jump through a few
    1232              :  * hoops to get a correct recovery snapshot and this requires a two- or
    1233              :  * sometimes a three-stage process.
    1234              :  *
    1235              :  * The initial snapshot must contain all running xids and all current
    1236              :  * AccessExclusiveLocks at a point in time on the standby. Assembling
    1237              :  * that information while the server is running requires many and
    1238              :  * various LWLocks, so we choose to derive that information piece by
    1239              :  * piece and then re-assemble that info on the standby. When that
    1240              :  * information is fully assembled we move to STANDBY_SNAPSHOT_READY.
    1241              :  *
    1242              :  * Since locking on the primary when we derive the information is not
    1243              :  * strict, we note that there is a time window between the derivation and
    1244              :  * writing to WAL of the derived information. That allows race conditions
    1245              :  * that we must resolve, since xids and locks may enter or leave the
    1246              :  * snapshot during that window. This creates the issue that an xid or
    1247              :  * lock may start *after* the snapshot has been derived yet *before* the
    1248              :  * snapshot is logged in the running xacts WAL record. We resolve this by
    1249              :  * starting to accumulate changes at a point just prior to when we derive
    1250              :  * the snapshot on the primary, then ignore duplicates when we later apply
    1251              :  * the snapshot from the running xacts record. This is implemented during
    1252              :  * CreateCheckPoint() where we use the logical checkpoint location as
    1253              :  * our starting point and then write the running xacts record immediately
    1254              :  * before writing the main checkpoint WAL record. Since we always start
    1255              :  * up from a checkpoint and are immediately at our starting point, we
    1256              :  * unconditionally move to STANDBY_INITIALIZED. After this point we
    1257              :  * must do 4 things:
    1258              :  *  * move shared nextXid forwards as we see new xids
    1259              :  *  * extend the clog and subtrans with each new xid
    1260              :  *  * keep track of uncommitted known assigned xids
    1261              :  *  * keep track of uncommitted AccessExclusiveLocks
    1262              :  *
    1263              :  * When we see a commit/abort we must remove known assigned xids and locks
    1264              :  * from the completing transaction. Attempted removals that cannot locate
    1265              :  * an entry are expected and must not cause an error when we are in state
    1266              :  * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and
    1267              :  * KnownAssignedXidsRemove().
    1268              :  *
    1269              :  * Later, when we apply the running xact data we must be careful to ignore
    1270              :  * transactions already committed, since those commits raced ahead when
    1271              :  * making WAL entries.
    1272              :  *
    1273              :  * For logical decoding only the running xacts information is needed;
    1274              :  * there's no need to look at the locking information, but it's logged anyway,
    1275              :  * as there's no independent knob to just enable logical decoding. For
    1276              :  * details of how this is used, check snapbuild.c's introductory comment.
    1277              :  *
    1278              :  *
    1279              :  * Returns the RecPtr of the last inserted record.
    1280              :  */
    1281              : XLogRecPtr
    1282         1441 : LogStandbySnapshot(void)
    1283              : {
    1284              :     XLogRecPtr  recptr;
    1285              :     RunningTransactions running;
    1286              :     xl_standby_lock *locks;
    1287              :     int         nlocks;
    1288         1441 :     bool        logical_decoding_enabled = IsLogicalDecodingEnabled();
    1289              : 
    1290              :     Assert(XLogStandbyInfoActive());
    1291              : 
    1292              : #ifdef USE_INJECTION_POINTS
    1293         1441 :     if (IS_INJECTION_POINT_ATTACHED("skip-log-running-xacts"))
    1294              :     {
    1295              :         /*
    1296              :          * This record could move slot's xmin forward during decoding, leading
    1297              :          * to unpredictable results, so skip it when requested by the test.
    1298              :          */
    1299            0 :         return GetInsertRecPtr();
    1300              :     }
    1301              : #endif
    1302              : 
    1303              :     /*
    1304              :      * Get details of any AccessExclusiveLocks being held at the moment.
    1305              :      */
    1306         1441 :     locks = GetRunningTransactionLocks(&nlocks);
    1307         1441 :     if (nlocks > 0)
    1308          170 :         LogAccessExclusiveLocks(nlocks, locks);
    1309         1441 :     pfree(locks);
    1310              : 
    1311              :     /*
    1312              :      * Log details of all in-progress transactions. This should be the last
    1313              :      * record we write, because standby will open up when it sees this.
    1314              :      */
    1315         1441 :     running = GetRunningTransactionData();
    1316              : 
    1317              :     /*
    1318              :      * GetRunningTransactionData() acquired ProcArrayLock, we must release it.
    1319              :      * For Hot Standby this can be done before inserting the WAL record
    1320              :      * because ProcArrayApplyRecoveryInfo() rechecks the commit status using
    1321              :      * the clog. For logical decoding, though, the lock can't be released
    1322              :      * early because the clog might be "in the future" from the POV of the
    1323              :      * historic snapshot. This would allow for situations where we're waiting
    1324              :      * for the end of a transaction listed in the xl_running_xacts record
    1325              :      * which, according to the WAL, has committed before the xl_running_xacts
    1326              :      * record. Fortunately this routine isn't executed frequently, and it's
    1327              :      * only a shared lock.
    1328              :      */
    1329         1441 :     if (!logical_decoding_enabled)
    1330          909 :         LWLockRelease(ProcArrayLock);
    1331              : 
    1332         1441 :     recptr = LogCurrentRunningXacts(running);
    1333              : 
    1334              :     /* Release lock if we kept it longer ... */
    1335         1441 :     if (logical_decoding_enabled)
    1336          532 :         LWLockRelease(ProcArrayLock);
    1337              : 
    1338              :     /* GetRunningTransactionData() acquired XidGenLock, we must release it */
    1339         1441 :     LWLockRelease(XidGenLock);
    1340              : 
    1341         1441 :     return recptr;
    1342              : }
    1343              : 
    1344              : /*
    1345              :  * Record an enhanced snapshot of running transactions into WAL.
    1346              :  *
    1347              :  * The definitions of RunningTransactionsData and xl_running_xacts are
    1348              :  * similar. We keep them separate because xl_running_xacts is a contiguous
    1349              :  * chunk of memory and never exists fully until it is assembled in WAL.
    1350              :  * The inserted records are marked as not being important for durability,
    1351              :  * to avoid triggering superfluous checkpoint / archiving activity.
    1352              :  */
    1353              : static XLogRecPtr
    1354         1441 : LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
    1355              : {
    1356              :     xl_running_xacts xlrec;
    1357              :     XLogRecPtr  recptr;
    1358              : 
    1359         1441 :     xlrec.xcnt = CurrRunningXacts->xcnt;
    1360         1441 :     xlrec.subxcnt = CurrRunningXacts->subxcnt;
    1361         1441 :     xlrec.subxid_overflow = (CurrRunningXacts->subxid_status != SUBXIDS_IN_ARRAY);
    1362         1441 :     xlrec.nextXid = CurrRunningXacts->nextXid;
    1363         1441 :     xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
    1364         1441 :     xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid;
    1365              : 
    1366              :     /* Header */
    1367         1441 :     XLogBeginInsert();
    1368         1441 :     XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
    1369         1441 :     XLogRegisterData(&xlrec, MinSizeOfXactRunningXacts);
    1370              : 
    1371              :     /* array of TransactionIds */
    1372         1441 :     if (xlrec.xcnt > 0)
    1373          479 :         XLogRegisterData(CurrRunningXacts->xids,
    1374          479 :                          (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId));
    1375              : 
    1376         1441 :     recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS);
    1377              : 
    1378         1441 :     if (xlrec.subxid_overflow)
    1379            2 :         elog(DEBUG2,
    1380              :              "snapshot of %d running transactions overflowed (lsn %X/%08X oldest xid %u latest complete %u next xid %u)",
    1381              :              CurrRunningXacts->xcnt,
    1382              :              LSN_FORMAT_ARGS(recptr),
    1383              :              CurrRunningXacts->oldestRunningXid,
    1384              :              CurrRunningXacts->latestCompletedXid,
    1385              :              CurrRunningXacts->nextXid);
    1386              :     else
    1387         1439 :         elog(DEBUG2,
    1388              :              "snapshot of %d+%d running transaction ids (lsn %X/%08X oldest xid %u latest complete %u next xid %u)",
    1389              :              CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt,
    1390              :              LSN_FORMAT_ARGS(recptr),
    1391              :              CurrRunningXacts->oldestRunningXid,
    1392              :              CurrRunningXacts->latestCompletedXid,
    1393              :              CurrRunningXacts->nextXid);
    1394              : 
    1395              :     /*
    1396              :      * Ensure running_xacts information is synced to disk not too far in the
    1397              :      * future. We don't want to stall anything though (i.e. use XLogFlush()),
    1398              :      * so we let the wal writer do it during normal operation.
    1399              :      * XLogSetAsyncXactLSN() conveniently will mark the LSN as to-be-synced
    1400              :      * and nudge the WALWriter into action if sleeping. Check
    1401              :      * XLogBackgroundFlush() for details why a record might not be flushed
    1402              :      * without it.
    1403              :      */
    1404         1441 :     XLogSetAsyncXactLSN(recptr);
    1405              : 
    1406         1441 :     return recptr;
    1407              : }
    1408              : 
    1409              : /*
    1410              :  * Wholesale logging of AccessExclusiveLocks. Other lock types need not be
    1411              :  * logged, as described in backend/storage/lmgr/README.
    1412              :  */
    1413              : static void
    1414       132911 : LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks)
    1415              : {
    1416              :     xl_standby_locks xlrec;
    1417              : 
    1418       132911 :     xlrec.nlocks = nlocks;
    1419              : 
    1420       132911 :     XLogBeginInsert();
    1421       132911 :     XLogRegisterData(&xlrec, offsetof(xl_standby_locks, locks));
    1422       132911 :     XLogRegisterData(locks, nlocks * sizeof(xl_standby_lock));
    1423       132911 :     XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
    1424              : 
    1425       132911 :     (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK);
    1426       132911 : }
    1427              : 
    1428              : /*
    1429              :  * Individual logging of AccessExclusiveLocks for use during LockAcquire()
    1430              :  */
    1431              : void
    1432       132741 : LogAccessExclusiveLock(Oid dbOid, Oid relOid)
    1433              : {
    1434              :     xl_standby_lock xlrec;
    1435              : 
    1436       132741 :     xlrec.xid = GetCurrentTransactionId();
    1437              : 
    1438       132741 :     xlrec.dbOid = dbOid;
    1439       132741 :     xlrec.relOid = relOid;
    1440              : 
    1441       132741 :     LogAccessExclusiveLocks(1, &xlrec);
    1442       132741 :     MyXactFlags |= XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK;
    1443       132741 : }
    1444              : 
    1445              : /*
    1446              :  * Prepare to log an AccessExclusiveLock, for use during LockAcquire()
    1447              :  */
    1448              : void
    1449       132956 : LogAccessExclusiveLockPrepare(void)
    1450              : {
    1451              :     /*
    1452              :      * Ensure that a TransactionId has been assigned to this transaction, for
    1453              :      * two reasons, both related to lock release on the standby. First, we
    1454              :      * must assign an xid so that RecordTransactionCommit() and
    1455              :      * RecordTransactionAbort() do not optimise away the transaction
    1456              :      * completion record which recovery relies upon to release locks. It's a
    1457              :      * hack, but for a corner case not worth adding code for into the main
    1458              :      * commit path. Second, we must assign an xid before the lock is recorded
    1459              :      * in shared memory, otherwise a concurrently executing
    1460              :      * GetRunningTransactionLocks() might see a lock associated with an
    1461              :      * InvalidTransactionId which we later assert cannot happen.
    1462              :      */
    1463       132956 :     (void) GetCurrentTransactionId();
    1464       132956 : }
    1465              : 
    1466              : /*
    1467              :  * Emit WAL for invalidations. This currently is only used for commits without
    1468              :  * an xid but which contain invalidations.
    1469              :  */
    1470              : void
    1471         9587 : LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs,
    1472              :                         bool relcacheInitFileInval)
    1473              : {
    1474              :     xl_invalidations xlrec;
    1475              : 
    1476              :     /* prepare record */
    1477         9587 :     memset(&xlrec, 0, sizeof(xlrec));
    1478         9587 :     xlrec.dbId = MyDatabaseId;
    1479         9587 :     xlrec.tsId = MyDatabaseTableSpace;
    1480         9587 :     xlrec.relcacheInitFileInval = relcacheInitFileInval;
    1481         9587 :     xlrec.nmsgs = nmsgs;
    1482              : 
    1483              :     /* perform insertion */
    1484         9587 :     XLogBeginInsert();
    1485         9587 :     XLogRegisterData(&xlrec, MinSizeOfInvalidations);
    1486         9587 :     XLogRegisterData(msgs,
    1487              :                      nmsgs * sizeof(SharedInvalidationMessage));
    1488         9587 :     XLogInsert(RM_STANDBY_ID, XLOG_INVALIDATIONS);
    1489         9587 : }
    1490              : 
    1491              : /* Return a description of the given recovery conflict reason */
    1492              : static const char *
    1493           10 : get_recovery_conflict_desc(RecoveryConflictReason reason)
    1494              : {
    1495           10 :     const char *reasonDesc = _("unknown reason");
    1496              : 
    1497           10 :     switch (reason)
    1498              :     {
    1499            4 :         case RECOVERY_CONFLICT_BUFFERPIN:
    1500            4 :             reasonDesc = _("recovery conflict on buffer pin");
    1501            4 :             break;
    1502            2 :         case RECOVERY_CONFLICT_LOCK:
    1503            2 :             reasonDesc = _("recovery conflict on lock");
    1504            2 :             break;
    1505            2 :         case RECOVERY_CONFLICT_TABLESPACE:
    1506            2 :             reasonDesc = _("recovery conflict on tablespace");
    1507            2 :             break;
    1508            2 :         case RECOVERY_CONFLICT_SNAPSHOT:
    1509            2 :             reasonDesc = _("recovery conflict on snapshot");
    1510            2 :             break;
    1511            0 :         case RECOVERY_CONFLICT_LOGICALSLOT:
    1512            0 :             reasonDesc = _("recovery conflict on replication slot");
    1513            0 :             break;
    1514            0 :         case RECOVERY_CONFLICT_STARTUP_DEADLOCK:
    1515            0 :             reasonDesc = _("recovery conflict on deadlock");
    1516            0 :             break;
    1517            0 :         case RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK:
    1518            0 :             reasonDesc = _("recovery conflict on buffer deadlock");
    1519            0 :             break;
    1520            0 :         case RECOVERY_CONFLICT_DATABASE:
    1521            0 :             reasonDesc = _("recovery conflict on database");
    1522            0 :             break;
    1523              :     }
    1524              : 
    1525           10 :     return reasonDesc;
    1526              : }

Generated by: LCOV version 2.0-1