LCOV - code coverage report
Current view: top level - src/backend/storage/ipc - standby.c (source / functions) Hit Total Coverage
Test: PostgreSQL 17devel Lines: 338 377 89.7 %
Date: 2024-04-19 01:11:28 Functions: 30 31 96.8 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * standby.c
       4             :  *    Misc functions used in Hot Standby mode.
       5             :  *
       6             :  *  All functions for handling RM_STANDBY_ID, which relate to
       7             :  *  AccessExclusiveLocks and starting snapshots for Hot Standby mode.
       8             :  *  Plus conflict recovery processing.
       9             :  *
      10             :  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
      11             :  * Portions Copyright (c) 1994, Regents of the University of California
      12             :  *
      13             :  * IDENTIFICATION
      14             :  *    src/backend/storage/ipc/standby.c
      15             :  *
      16             :  *-------------------------------------------------------------------------
      17             :  */
      18             : #include "postgres.h"
      19             : #include "access/transam.h"
      20             : #include "access/twophase.h"
      21             : #include "access/xact.h"
      22             : #include "access/xloginsert.h"
      23             : #include "access/xlogrecovery.h"
      24             : #include "access/xlogutils.h"
      25             : #include "miscadmin.h"
      26             : #include "pgstat.h"
      27             : #include "replication/slot.h"
      28             : #include "storage/bufmgr.h"
      29             : #include "storage/proc.h"
      30             : #include "storage/procarray.h"
      31             : #include "storage/sinvaladt.h"
      32             : #include "storage/standby.h"
      33             : #include "utils/hsearch.h"
      34             : #include "utils/ps_status.h"
      35             : #include "utils/timeout.h"
      36             : #include "utils/timestamp.h"
      37             : 
      38             : /* User-settable GUC parameters; both delays are in milliseconds */
      39             : int         max_standby_archive_delay = 30 * 1000;  /* < 0: wait forever */
      40             : int         max_standby_streaming_delay = 30 * 1000;    /* < 0: wait forever */
      41             : bool        log_recovery_conflict_waits = false;    /* log long conflict waits */
      42             : 
      43             : /*
      44             :  * Keep track of all the exclusive locks owned by original transactions.
      45             :  * For each known exclusive lock, there is a RecoveryLockEntry in the
      46             :  * RecoveryLockHash hash table.  All RecoveryLockEntrys belonging to a given
      47             :  * XID are chained together via "next" so that we can find them easily.
      48             :  * For each original transaction that is known to have any such locks,
      49             :  * there is a RecoveryLockXidEntry in the RecoveryLockXidHash hash table,
      50             :  * which stores the head of the chain of its locks.
      51             :  */
      52             : typedef struct RecoveryLockEntry
      53             : {
      54             :     xl_standby_lock key;        /* hash key: xid, dbOid, relOid */
      55             :     struct RecoveryLockEntry *next; /* next entry in the same XID's chain */
      56             : } RecoveryLockEntry;
      57             : 
      58             : typedef struct RecoveryLockXidEntry /* one per XID known to hold locks */
      59             : {
      60             :     TransactionId xid;          /* hash key -- must be first */
      61             :     struct RecoveryLockEntry *head; /* first entry in this XID's chain */
      62             : } RecoveryLockXidEntry;
      63             : 
      64             : static HTAB *RecoveryLockHash = NULL;   /* NULL unless tracking is initialized */
      65             : static HTAB *RecoveryLockXidHash = NULL;    /* created/destroyed with RecoveryLockHash */
      66             : 
      67             : /* Flags set by timeout handlers */
      68             : static volatile sig_atomic_t got_standby_deadlock_timeout = false;
      69             : static volatile sig_atomic_t got_standby_delay_timeout = false;
      70             : static volatile sig_atomic_t got_standby_lock_timeout = false;
      71             : 
      72             : static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
      73             :                                                    ProcSignalReason reason,
      74             :                                                    uint32 wait_event_info,
      75             :                                                    bool report_waiting);
      76             : static void SendRecoveryConflictWithBufferPin(ProcSignalReason reason);
      77             : static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts);
      78             : static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);
      79             : static const char *get_recovery_conflict_desc(ProcSignalReason reason);
      80             : 
      81             : /*
      82             :  * InitRecoveryTransactionEnvironment
      83             :  *      Initialize tracking of our primary's in-progress transactions.
      84             :  *
      85             :  * We need to issue shared invalidations and hold locks. Holding locks
      86             :  * means others may want to wait on us, so we need to make a lock table
      87             :  * vxact entry like a real transaction. We could create and delete
      88             :  * lock table entries for each transaction but it's simpler just to create
      89             :  * one permanent entry and leave it there all the time. Locks are then
      90             :  * acquired and released as needed. Yes, this means you can see the
      91             :  * Startup process in pg_locks once we have run this.
      92             :  */
      93             : void
      94         186 : InitRecoveryTransactionEnvironment(void)
      95             : {
      96             :     VirtualTransactionId vxid;
      97             :     HASHCTL     hash_ctl;
      98             : 
      99             :     Assert(RecoveryLockHash == NULL);   /* don't run this twice */
     100             : 
     101             :     /*
     102             :      * Create both lock-tracking hash tables: RecoveryLockHash is keyed by
     103             :      * xl_standby_lock and RecoveryLockXidHash by TransactionId.
     104             :      */
     105         186 :     hash_ctl.keysize = sizeof(xl_standby_lock);
     106         186 :     hash_ctl.entrysize = sizeof(RecoveryLockEntry);
     107         186 :     RecoveryLockHash = hash_create("RecoveryLockHash",
     108             :                                    64,
     109             :                                    &hash_ctl,
     110             :                                    HASH_ELEM | HASH_BLOBS);
     111         186 :     hash_ctl.keysize = sizeof(TransactionId);
     112         186 :     hash_ctl.entrysize = sizeof(RecoveryLockXidEntry);
     113         186 :     RecoveryLockXidHash = hash_create("RecoveryLockXidHash",
     114             :                                       64,
     115             :                                       &hash_ctl,
     116             :                                       HASH_ELEM | HASH_BLOBS);
     117             : 
     118             :     /*
     119             :      * Initialize shared invalidation management for Startup process, being
     120             :      * careful to register ourselves as a sendOnly process so we don't need to
     121             :      * read messages, nor will we get signaled when the queue starts filling
     122             :      * up.
     123             :      */
     124         186 :     SharedInvalBackendInit(true);
     125             : 
     126             :     /*
     127             :      * Lock a virtual transaction id for Startup process.
     128             :      *
     129             :      * We need to do GetNextLocalTransactionId() because
     130             :      * SharedInvalBackendInit() leaves localTransactionId invalid and the lock
     131             :      * manager doesn't like that at all.
     132             :      *
     133             :      * Note that we don't need to run XactLockTableInsert() because nobody
     134             :      * needs to wait on xids. That sounds a little strange, but table locks
     135             :      * are held by vxids and row level locks are held by xids. All queries
     136             :      * hold AccessShareLocks so never block while we write or lock new rows.
     137             :      */
     138         186 :     MyProc->vxid.procNumber = MyProcNumber;
     139         186 :     vxid.procNumber = MyProcNumber;
     140         186 :     vxid.localTransactionId = GetNextLocalTransactionId();
     141         186 :     VirtualXactLockTableInsert(vxid);
     142             : 
     143         186 :     standbyState = STANDBY_INITIALIZED;
     144         186 : }
     145             : 
     146             : /*
     147             :  * ShutdownRecoveryTransactionEnvironment
     148             :  *      Shut down transaction tracking
     149             :  *
     150             :  * Prepare to switch from hot standby mode to normal operation. Shut down
     151             :  * recovery-time transaction tracking.
     152             :  *
     153             :  * This must be called even in shutdown of startup process if transaction
     154             :  * tracking has been initialized. Otherwise some locks the tracked
     155             :  * transactions were holding will not be released and may interfere with
     156             :  * processes that are still running (though they will exit soon) when the
     157             :  * startup process exits.
     158             :  */
     159             : void
     160         276 : ShutdownRecoveryTransactionEnvironment(void)
     161             : {
     162             :     /*
     163             :      * Do nothing if RecoveryLockHash is NULL because that means that
     164             :      * transaction tracking has not yet been initialized or has already been
     165             :      * shut down.  This makes it safe to have possibly-redundant calls of this
     166             :      * function during process exit.
     167             :      */
     168         276 :     if (RecoveryLockHash == NULL)
     169          90 :         return;
     170             : 
     171             :     /* Mark all tracked in-progress transactions as finished. */
     172         186 :     ExpireAllKnownAssignedTransactionIds();
     173             : 
     174             :     /* Release all locks the tracked transactions were holding */
     175         186 :     StandbyReleaseAllLocks();
     176             : 
     177             :     /* Destroy the lock hash tables; NULL pointers mark tracking as shut down */
     178         186 :     hash_destroy(RecoveryLockHash);
     179         186 :     hash_destroy(RecoveryLockXidHash);
     180         186 :     RecoveryLockHash = NULL;
     181         186 :     RecoveryLockXidHash = NULL;
     182             : 
     183             :     /* Clean up our VirtualTransaction */
     184         186 :     VirtualXactLockTableCleanup();
     185             : }
     186             : 
     187             : 
     188             : /*
     189             :  * -----------------------------------------------------
     190             :  *      Standby wait timers and backend cancel logic
     191             :  * -----------------------------------------------------
     192             :  */
     193             : 
     194             : /*
     195             :  * Determine the cutoff time at which we want to start canceling conflicting
     196             :  * transactions.  Returns zero (a time safely in the past) if we are willing
     197             :  * to wait forever.
     198             :  */
     199             : static TimestampTz
     200          58 : GetStandbyLimitTime(void)
     201             : {
     202             :     TimestampTz rtime;
     203             :     bool        fromStream;
     204             : 
     205             :     /*
     206             :      * The cutoff time is the last WAL data receipt time plus the appropriate
     207             :      * delay variable (in milliseconds).  A negative delay means wait forever.
     208             :      */
     209          58 :     GetXLogReceiptTime(&rtime, &fromStream);
     210          58 :     if (fromStream)             /* streaming delay applies, else archive delay */
     211             :     {
     212          58 :         if (max_standby_streaming_delay < 0)
     213           0 :             return 0;           /* wait forever */
     214          58 :         return TimestampTzPlusMilliseconds(rtime, max_standby_streaming_delay);
     215             :     }
     216             :     else
     217             :     {
     218           0 :         if (max_standby_archive_delay < 0)
     219           0 :             return 0;           /* wait forever */
     220           0 :         return TimestampTzPlusMilliseconds(rtime, max_standby_archive_delay);
     221             :     }
     222             : }
     223             : 
     224             : #define STANDBY_INITIAL_WAIT_US  1000   /* first sleep, in microseconds */
     225             : static int  standbyWait_us = STANDBY_INITIAL_WAIT_US;   /* current sleep length */
     226             : 
     227             : /*
     228             :  * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs.
     229             :  * Returns true at once if the limit from GetStandbyLimitTime() has passed;
     230             :  * otherwise sleeps standbyWait_us, doubles it (max 1s) and returns false.
     231             :  */
     232             : static bool
     233          30 : WaitExceedsMaxStandbyDelay(uint32 wait_event_info)
     234             : {
     235             :     TimestampTz ltime;
     236             : 
     237          30 :     CHECK_FOR_INTERRUPTS();
     238             : 
     239             :     /* Are we past the limit time?  (ltime == 0 means there is no limit.) */
     240          30 :     ltime = GetStandbyLimitTime();
     241          30 :     if (ltime && GetCurrentTimestamp() >= ltime)
     242           6 :         return true;
     243             : 
     244             :     /*
     245             :      * Sleep a bit (this is essential to avoid busy-waiting).
     246             :      */
     247          24 :     pgstat_report_wait_start(wait_event_info);
     248          24 :     pg_usleep(standbyWait_us);
     249          24 :     pgstat_report_wait_end();
     250             : 
     251             :     /*
     252             :      * Progressively increase the sleep times, but not to more than 1s, since
     253             :      * pg_usleep isn't interruptible on some platforms.
     254             :      */
     255          24 :     standbyWait_us *= 2;
     256          24 :     if (standbyWait_us > 1000000)
     257           0 :         standbyWait_us = 1000000;
     258             : 
     259          24 :     return false;
     260             : }
     261             : 
     262             : /*
     263             :  * Log the recovery conflict at LOG level.
     264             :  *
     265             :  * wait_start is the timestamp when the caller started to wait.
     266             :  * now is the timestamp when this function has been called.
     267             :  * wait_list is the array of virtual transaction ids assigned to
     268             :  * conflicting processes, ended by an invalid entry; it may be NULL.
     269             :  * still_waiting indicates whether the startup process is still waiting
     270             :  * for the recovery conflict to be resolved or not.
     271             :  */
     272             : void
     273          20 : LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start,
     274             :                     TimestampTz now, VirtualTransactionId *wait_list,
     275             :                     bool still_waiting)
     276             : {
     277             :     long        secs;
     278             :     int         usecs;
     279             :     long        msecs;
     280             :     StringInfoData buf;
     281          20 :     int         nprocs = 0;
     282             : 
     283             :     /*
     284             :      * There must be no conflicting processes when the recovery conflict has
     285             :      * already been resolved.
     286             :      */
     287             :     Assert(still_waiting || wait_list == NULL);
     288             : 
     289          20 :     TimestampDifference(wait_start, now, &secs, &usecs);
     290          20 :     msecs = secs * 1000 + usecs / 1000;
     291          20 :     usecs = usecs % 1000;   /* sub-millisecond part, printed as 3 decimals */
     292             : 
     293          20 :     if (wait_list)
     294             :     {
     295             :         VirtualTransactionId *vxids;
     296             : 
     297             :         /* Build a comma-separated list of the conflicting processes' PIDs */
     298           6 :         vxids = wait_list;
     299          12 :         while (VirtualTransactionIdIsValid(*vxids))
     300             :         {
     301           6 :             PGPROC     *proc = ProcNumberGetProc(vxids->procNumber);
     302             : 
     303             :             /* proc can be NULL if the target backend is not active */
     304           6 :             if (proc)
     305             :             {
     306           6 :                 if (nprocs == 0)
     307             :                 {
     308           6 :                     initStringInfo(&buf);
     309           6 :                     appendStringInfo(&buf, "%d", proc->pid);
     310             :                 }
     311             :                 else
     312           0 :                     appendStringInfo(&buf, ", %d", proc->pid);
     313             : 
     314           6 :                 nprocs++;
     315             :             }
     316             : 
     317           6 :             vxids++;
     318             :         }
     319             :     }
     320             : 
     321             :     /*
     322             :      * If wait_list is specified, report the list of PIDs of active
     323             :      * conflicting backends in a detail message. Note that if none of the
     324             :      * backends in the list is active, no detail message is logged.
     325             :      */
     326          20 :     if (still_waiting)
     327             :     {
     328          10 :         ereport(LOG,
     329             :                 errmsg("recovery still waiting after %ld.%03d ms: %s",
     330             :                        msecs, usecs, get_recovery_conflict_desc(reason)),
     331             :                 nprocs > 0 ? errdetail_log_plural("Conflicting process: %s.",
     332             :                                                   "Conflicting processes: %s.",
     333             :                                                   nprocs, buf.data) : 0);
     334             :     }
     335             :     else
     336             :     {
     337          10 :         ereport(LOG,
     338             :                 errmsg("recovery finished waiting after %ld.%03d ms: %s",
     339             :                        msecs, usecs, get_recovery_conflict_desc(reason)));
     340             :     }
     341             : 
     342          20 :     if (nprocs > 0)         /* buf was initialized only if a PID was added */
     343           6 :         pfree(buf.data);
     344          20 : }
     345             : 
     346             : /*
     347             :  * This is the main executioner for any query backend that conflicts with
     348             :  * recovery processing. Judgement has already been passed on it within
     349             :  * a specific rmgr. Here we just issue the orders to the procs. The procs
     350             :  * then throw the required error as instructed.
     351             :  *
     352             :  * waitlist is an array ended by an invalid vxid.  If report_waiting is
     353             :  * true, "waiting" is reported in PS display and the wait for recovery
     354             :  * conflict is reported in the log, if necessary. If the caller is
     355             :  * responsible for reporting them, report_waiting should be false;
     356             :  * otherwise both the caller and this function would report the same thing.
     357             :  */
     358             : static void
     359       17994 : ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
     360             :                                        ProcSignalReason reason, uint32 wait_event_info,
     361             :                                        bool report_waiting)
     362             : {
     363       17994 :     TimestampTz waitStart = 0;
     364       17994 :     bool        waiting = false;
     365       17994 :     bool        logged_recovery_conflict = false;
     366             : 
     367             :     /* Fast exit, to avoid a kernel call if there's no work to be done. */
     368       17994 :     if (!VirtualTransactionIdIsValid(*waitlist))
     369       17988 :         return;
     370             : 
     371             :     /* Set the wait start timestamp for reporting; 0 means never report */
     372           6 :     if (report_waiting && (log_recovery_conflict_waits || update_process_title))
     373           4 :         waitStart = GetCurrentTimestamp();
     374             : 
     375          12 :     while (VirtualTransactionIdIsValid(*waitlist))
     376             :     {
     377             :         /* reset standbyWait_us for each xact we wait for */
     378           6 :         standbyWait_us = STANDBY_INITIAL_WAIT_US;
     379             : 
     380             :         /* wait until the virtual xid is gone */
     381          36 :         while (!VirtualXactLock(*waitlist, false))
     382             :         {
     383             :             /* Is it time to kill it? */
     384          30 :             if (WaitExceedsMaxStandbyDelay(wait_event_info))
     385             :             {
     386             :                 pid_t       pid;
     387             : 
     388             :                 /*
     389             :                  * Now find out who to throw out of the balloon.
     390             :                  */
     391             :                 Assert(VirtualTransactionIdIsValid(*waitlist));
     392           6 :                 pid = CancelVirtualTransaction(*waitlist, reason);
     393             : 
     394             :                 /*
     395             :                  * Wait a little bit for it to die so that we avoid flooding
     396             :                  * an unresponsive backend when system is heavily loaded.
     397             :                  */
     398           6 :                 if (pid != 0)
     399           6 :                     pg_usleep(5000L);
     400             :             }
     401             : 
     402          30 :             if (waitStart != 0 && (!logged_recovery_conflict || !waiting))
     403             :             {
     404          28 :                 TimestampTz now = 0;
     405             :                 bool        maybe_log_conflict;
     406             :                 bool        maybe_update_title;
     407             : 
     408          28 :                 maybe_log_conflict = (log_recovery_conflict_waits && !logged_recovery_conflict);
     409          28 :                 maybe_update_title = (update_process_title && !waiting);
     410             : 
     411             :                 /* Get the current timestamp if something is not reported yet */
     412          28 :                 if (maybe_log_conflict || maybe_update_title)
     413          28 :                     now = GetCurrentTimestamp();
     414             : 
     415             :                 /*
     416             :                  * Report via ps if we have been waiting for more than 500
     417             :                  * msec (should that be configurable?)
     418             :                  */
     419          56 :                 if (maybe_update_title &&
     420          28 :                     TimestampDifferenceExceeds(waitStart, now, 500))
     421             :                 {
     422           0 :                     set_ps_display_suffix("waiting");
     423           0 :                     waiting = true;
     424             :                 }
     425             : 
     426             :                 /*
     427             :                  * Emit the log message if the startup process is waiting
     428             :                  * longer than deadlock_timeout for recovery conflict.
     429             :                  */
     430          44 :                 if (maybe_log_conflict &&
     431          16 :                     TimestampDifferenceExceeds(waitStart, now, DeadlockTimeout))
     432             :                 {
     433           4 :                     LogRecoveryConflict(reason, waitStart, now, waitlist, true);
     434           4 :                     logged_recovery_conflict = true;
     435             :                 }
     436             :             }
     437             :         }
     438             : 
     439             :         /* The virtual transaction is gone now, wait for the next one */
     440           6 :         waitlist++;
     441             :     }
     442             : 
     443             :     /*
     444             :      * Emit the log message if recovery conflict was resolved but the startup
     445             :      * process waited longer than deadlock_timeout for it.
     446             :      */
     447           6 :     if (logged_recovery_conflict)
     448           4 :         LogRecoveryConflict(reason, waitStart, GetCurrentTimestamp(),
     449             :                             NULL, false);
     450             : 
     451             :     /* reset ps display to remove the suffix if we added one */
     452           6 :     if (waiting)
     453           0 :         set_ps_display_remove_suffix();
     454             : 
     455             : }
     456             : 
     457             : /*
     458             :  * Generate whatever recovery conflicts are needed to eliminate snapshots that
     459             :  * might see XIDs <= snapshotConflictHorizon as still running.
     460             :  *
     461             :  * snapshotConflictHorizon cutoffs are our standard approach to generating
     462             :  * granular recovery conflicts.  Note that InvalidTransactionId values are
     463             :  * interpreted as "definitely don't need any conflicts" here, which is a
     464             :  * general convention that WAL records can (and often do) depend on.
     465             :  */
     466             : void
     467       21662 : ResolveRecoveryConflictWithSnapshot(TransactionId snapshotConflictHorizon,
     468             :                                     bool isCatalogRel,
     469             :                                     RelFileLocator locator)
     470             : {
     471             :     VirtualTransactionId *backends;
     472             : 
     473             :     /*
     474             :      * If we get passed InvalidTransactionId then we do nothing (no conflict).
     475             :      *
     476             :      * This can happen when replaying already-applied WAL records after a
     477             :      * standby crash or restart, or when replaying an XLOG_HEAP2_VISIBLE
     478             :      * record that marks as frozen a page which was already all-visible.  It's
     479             :      * also quite common with records generated during index deletion
     480             :      * (original execution of the deletion can reason that a recovery conflict
     481             :      * which is sufficient for the deletion operation must take place before
     482             :      * replay of the deletion record itself).
     483             :      */
     484       21662 :     if (!TransactionIdIsValid(snapshotConflictHorizon))
     485        3672 :         return;
     486             : 
     487             :     Assert(TransactionIdIsNormal(snapshotConflictHorizon));
     488       17990 :     backends = GetConflictingVirtualXIDs(snapshotConflictHorizon,
     489             :                                          locator.dbOid);
     490       17990 :     ResolveRecoveryConflictWithVirtualXIDs(backends,
     491             :                                            PROCSIG_RECOVERY_CONFLICT_SNAPSHOT,
     492             :                                            WAIT_EVENT_RECOVERY_CONFLICT_SNAPSHOT,
     493             :                                            true);   /* report_waiting */
     494             : 
     495             :     /*
     496             :      * Note that WaitExceedsMaxStandbyDelay() is not taken into account here
     497             :      * (as opposed to ResolveRecoveryConflictWithVirtualXIDs() above). That
     498             :      * seems OK, given that this kind of conflict should not normally be
     499             :      * reached, e.g. due to using a physical replication slot.
     500             :      */
     501       17990 :     if (wal_level >= WAL_LEVEL_LOGICAL && isCatalogRel) /* catalog rels only */
     502          34 :         InvalidateObsoleteReplicationSlots(RS_INVAL_HORIZON, 0, locator.dbOid,
     503             :                                            snapshotConflictHorizon);
     504             : }
     505             : 
     506             : /*
     507             :  * Variant of ResolveRecoveryConflictWithSnapshot that works with
     508             :  * FullTransactionId values; does nothing if the horizon is too old.
     509             :  */
     510             : void
     511           0 : ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId snapshotConflictHorizon,
     512             :                                            bool isCatalogRel,
     513             :                                            RelFileLocator locator)
     514             : {
     515             :     /*
     516             :      * ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds,
     517             :      * so truncate the logged FullTransactionId.  If the logged value is very
     518             :      * old, so that XID wrap-around already happened on it, there can't be any
     519             :      * snapshots that still see it.
     520             :      */
     521           0 :     FullTransactionId nextXid = ReadNextFullTransactionId();
     522             :     uint64      diff;
     523             : 
     524           0 :     diff = U64FromFullTransactionId(nextXid) -
     525           0 :         U64FromFullTransactionId(snapshotConflictHorizon);
     526           0 :     if (diff < MaxTransactionId / 2)    /* recent enough: forward the 32-bit XID */
     527             :     {
     528             :         TransactionId truncated;
     529             : 
     530           0 :         truncated = XidFromFullTransactionId(snapshotConflictHorizon);
     531           0 :         ResolveRecoveryConflictWithSnapshot(truncated,
     532             :                                             isCatalogRel,
     533             :                                             locator);
     534             :     }
     535           0 : }
     536             : 
     537             : void
     538           2 : ResolveRecoveryConflictWithTablespace(Oid tsid)
     539             : {
     540             :     VirtualTransactionId *temp_file_users;
     541             : 
     542             :     /*
     543             :      * Standby users may be currently using this tablespace for their
     544             :      * temporary files. We only care about current users because the
     545             :      * temp_tablespaces parameter will just ignore tablespaces that no
     546             :      * longer exist.
     547             :      *
     548             :      * Ask everybody to cancel their queries immediately so we can ensure no
     549             :      * temp files remain and we can remove the tablespace. Nuke the entire
     550             :      * site from orbit, it's the only way to be sure.
     551             :      *
     552             :      * XXX: tsid is currently unused here.  We could work out the pids of
     553             :      * active backends using this tablespace by examining the temp filenames
     554             :      * in the directory, then convert those pids into VirtualXIDs before
     555             :      * attempting to cancel them.
     556             :      *
     557             :      * We don't wait for commit because drop tablespace is non-transactional.
     558             :      */
     559           2 :     temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId,
     560             :                                                 InvalidOid);
     561           2 :     ResolveRecoveryConflictWithVirtualXIDs(temp_file_users,
     562             :                                            PROCSIG_RECOVERY_CONFLICT_TABLESPACE,
     563             :                                            WAIT_EVENT_RECOVERY_CONFLICT_TABLESPACE,
     564             :                                            true);   /* report_waiting */
     565           2 : }
     566             : 
     567             : void
     568          22 : ResolveRecoveryConflictWithDatabase(Oid dbid)
     569             : {
     570             :     /*
     571             :      * We don't call ResolveRecoveryConflictWithVirtualXIDs() here since that
     572             :      * only waits for transactions and completely idle sessions would block
     573             :      * us. This is rare enough that we do this as simply as possible: no wait,
     574             :      * just force them off immediately.
     575             :      *
     576             :      * No locking is required here because we already acquired
     577             :      * AccessExclusiveLock. Anybody trying to connect while we do this will
     578             :      * block during InitPostgres() and then disconnect when they see the
     579             :      * database has been removed.
     580             :      */
     581          26 :     while (CountDBBackends(dbid) > 0)
     582             :     {
     583           4 :         CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE, true);
     584             : 
     585             :         /*
     586             :          * Wait awhile for them to die so that we avoid flooding an
     587             :          * unresponsive backend when the system is heavily loaded.
     588             :          */
     589           4 :         pg_usleep(10000);
     590             :     }
     591          22 : }
     592             : 
     593             : /*
     594             :  * ResolveRecoveryConflictWithLock is called from ProcSleep()
     595             :  * to resolve conflicts with other backends holding relation locks.
     596             :  *
     597             :  * The WaitLatch sleep normally done in ProcSleep()
     598             :  * (when not InHotStandby) is performed here, for code clarity.
     599             :  *
     600             :  * We either resolve conflicts immediately or set a timeout to wake us at
     601             :  * the limit of our patience.
     602             :  *
     603             :  * Resolve conflicts by canceling all backends holding a conflicting
     604             :  * lock.  As we are already queued to be granted the lock, no new lock
     605             :  * requests conflicting with ours will be granted in the meantime.
     606             :  *
     607             :  * We also must check for deadlocks involving the Startup process and
     608             :  * hot-standby backend processes. If deadlock_timeout is reached in
     609             :  * this function, all the backends holding the conflicting locks are
     610             :  * requested to check themselves for deadlocks.
     611             :  *
     612             :  * logging_conflict should be true if the recovery conflict has not been
     613             :  * logged yet even though logging is enabled. After deadlock_timeout is
     614             :  * reached and the request for deadlock check is sent, we wait again to
     615             :  * be signaled by the release of the lock if logging_conflict is false.
     616             :  * Otherwise we return without waiting again so that the caller can report
     617             :  * the recovery conflict. In this case, then, this function is called again
     618             :  * with logging_conflict=false (because the recovery conflict has already
     619             :  * been logged) and we will wait again for the lock to be released.
     620             :  */
     621             : void
     622           8 : ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict)
     623             : {
     624             :     TimestampTz ltime;
     625             :     TimestampTz now;
     626             : 
     627             :     Assert(InHotStandby);
     628             : 
     629           8 :     ltime = GetStandbyLimitTime();
     630           8 :     now = GetCurrentTimestamp();
     631             : 
     632             :     /*
     633             :      * Update waitStart if first time through after the startup process
     634             :      * started waiting for the lock. It should not be updated every time
     635             :      * ResolveRecoveryConflictWithLock() is called during the wait.
     636             :      *
     637             :      * Use the current time obtained for comparison with ltime as waitStart
     638             :      * (i.e., the time when this process started waiting for the lock). Since
     639             :      * getting the current time newly can cause overhead, we reuse the
     640             :      * already-obtained time to avoid that overhead.
     641             :      *
     642             :      * Note that waitStart is updated without holding the lock table's
     643             :      * partition lock, to avoid the overhead by additional lock acquisition.
     644             :      * This can cause "waitstart" in pg_locks to become NULL for a very short
     645             :      * period of time after the wait started even though "granted" is false.
     646             :      * This is OK in practice because we can assume that users are likely to
     647             :      * look at "waitstart" when waiting for the lock for a long time.
     648             :      */
     649           8 :     if (pg_atomic_read_u64(&MyProc->waitStart) == 0)
     650           2 :         pg_atomic_write_u64(&MyProc->waitStart, now);
     651             : 
     652           8 :     if (now >= ltime && ltime != 0)
     653           2 :     {
     654             :         /*
     655             :          * We're already behind, so clear a path as quickly as possible.
     656             :          */
     657             :         VirtualTransactionId *backends;
     658             : 
     659           2 :         backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
     660             : 
     661             :         /*
     662             :          * Prevent ResolveRecoveryConflictWithVirtualXIDs() from reporting
     663             :          * "waiting" in PS display by disabling its argument report_waiting
     664             :          * because the caller, WaitOnLock(), has already reported that.
     665             :          */
     666           2 :         ResolveRecoveryConflictWithVirtualXIDs(backends,
     667             :                                                PROCSIG_RECOVERY_CONFLICT_LOCK,
     668           2 :                                                PG_WAIT_LOCK | locktag.locktag_type,
     669             :                                                false);
     670             :     }
     671             :     else
     672             :     {
     673             :         /*
     674             :          * Wait (or wait again) until ltime, and check for deadlocks as well
     675             :          * if we will be waiting longer than deadlock_timeout
     676             :          */
     677             :         EnableTimeoutParams timeouts[2];
     678           6 :         int         cnt = 0;
     679             : 
     680           6 :         if (ltime != 0)
     681             :         {
     682           6 :             got_standby_lock_timeout = false;
     683           6 :             timeouts[cnt].id = STANDBY_LOCK_TIMEOUT;
     684           6 :             timeouts[cnt].type = TMPARAM_AT;
     685           6 :             timeouts[cnt].fin_time = ltime;
     686           6 :             cnt++;
     687             :         }
     688             : 
     689           6 :         got_standby_deadlock_timeout = false;
     690           6 :         timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
     691           6 :         timeouts[cnt].type = TMPARAM_AFTER;
     692           6 :         timeouts[cnt].delay_ms = DeadlockTimeout;
     693           6 :         cnt++;
     694             : 
     695           6 :         enable_timeouts(timeouts, cnt);
     696             :     }
     697             : 
     698             :     /* Wait to be signaled by the release of the Relation Lock */
     699           8 :     ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
     700             : 
     701             :     /*
     702             :      * Exit if ltime is reached. Then all the backends holding conflicting
     703             :      * locks will be canceled in the next ResolveRecoveryConflictWithLock()
     704             :      * call.
     705             :      */
     706           8 :     if (got_standby_lock_timeout)
     707           0 :         goto cleanup;
     708             : 
     709           8 :     if (got_standby_deadlock_timeout)
     710             :     {
     711             :         VirtualTransactionId *backends;
     712             : 
     713           4 :         backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
     714             : 
     715             :         /* Quick exit if there's no work to be done */
     716           4 :         if (!VirtualTransactionIdIsValid(*backends))
     717           0 :             goto cleanup;
     718             : 
     719             :         /*
     720             :          * Send signals to all the backends holding the conflicting locks, to
     721             :          * ask them to check themselves for deadlocks.
     722             :          */
     723           8 :         while (VirtualTransactionIdIsValid(*backends))
     724             :         {
     725           4 :             SignalVirtualTransaction(*backends,
     726             :                                      PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK,
     727             :                                      false);
     728           4 :             backends++;
     729             :         }
     730             : 
     731             :         /*
     732             :          * Exit if the recovery conflict has not been logged yet even though
     733             :          * logging is enabled, so that the caller can log that. Then
     734             :          * ResolveRecoveryConflictWithLock() is called again and we will wait again
     735             :          * for the lock to be released.
     736             :          */
     737           4 :         if (logging_conflict)
     738           2 :             goto cleanup;
     739             : 
     740             :         /*
     741             :          * Wait again here to be signaled by the release of the Relation Lock,
     742             :          * to prevent the subsequent ResolveRecoveryConflictWithLock() from causing
     743             :          * deadlock_timeout and sending a request for a deadlock check again.
     744             :          * Otherwise the request continues to be sent every deadlock_timeout
     745             :          * until the relation locks are released or ltime is reached.
     746             :          */
     747           2 :         got_standby_deadlock_timeout = false;
     748           2 :         ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
     749             :     }
     750             : 
     751           4 : cleanup:
     752             : 
     753             :     /*
     754             :      * Clear any timeout requests established above.  We assume here that the
     755             :      * Startup process doesn't have any other outstanding timeouts than those
     756             :      * used by this function. If that stops being true, we could cancel the
     757             :      * timeouts individually, but that'd be slower.
     758             :      */
     759           8 :     disable_all_timeouts(false);
     760           8 :     got_standby_lock_timeout = false;
     761           8 :     got_standby_deadlock_timeout = false;
     762           8 : }
     763             : 
     764             : /*
     765             :  * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup()
     766             :  * to resolve conflicts with other backends holding buffer pins.
     767             :  *
     768             :  * The ProcWaitForSignal() sleep normally done in LockBufferForCleanup()
     769             :  * (when not InHotStandby) is performed here, for code clarity.
     770             :  *
     771             :  * We either resolve conflicts immediately or set a timeout to wake us at
     772             :  * the limit of our patience.
     773             :  *
     774             :  * Resolve conflicts by sending a PROCSIG signal to all backends to check if
     775             :  * they hold one of the buffer pins that is blocking Startup process. If so,
     776             :  * those backends will take an appropriate error action, ERROR or FATAL.
     777             :  *
     778             :  * We also must check for deadlocks.  Deadlocks occur because if queries
     779             :  * wait on a lock, that must be behind an AccessExclusiveLock, which can only
     780             :  * be cleared if the Startup process replays a transaction completion record.
     781             :  * If Startup process is also waiting then that is a deadlock. The deadlock
     782             :  * can occur if the query is waiting and then the Startup sleeps, or if
     783             :  * Startup is sleeping and the query waits on a lock. We protect against
     784             :  * only the former sequence here, the latter sequence is checked prior to
     785             :  * the query sleeping, in CheckRecoveryConflictDeadlock().
     786             :  *
     787             :  * Deadlocks are extremely rare, and relatively expensive to check for,
     788             :  * so we don't do a deadlock check right away ... only if we have had to wait
     789             :  * at least deadlock_timeout.
     790             :  */
     791             : void
     792          20 : ResolveRecoveryConflictWithBufferPin(void)
     793             : {
     794             :     TimestampTz ltime;
     795             : 
     796             :     Assert(InHotStandby);
     797             : 
     798          20 :     ltime = GetStandbyLimitTime();
     799             : 
     800          20 :     if (GetCurrentTimestamp() >= ltime && ltime != 0)
     801             :     {
     802             :         /*
     803             :          * We're already behind, so clear a path as quickly as possible.
     804             :          */
     805           2 :         SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
     806             :     }
     807             :     else
     808             :     {
     809             :         /*
     810             :          * Wake up at ltime, and check for deadlocks as well if we will be
     811             :          * waiting longer than deadlock_timeout
     812             :          */
     813             :         EnableTimeoutParams timeouts[2];
     814          18 :         int         cnt = 0;
     815             : 
     816          18 :         if (ltime != 0)
     817             :         {
     818          18 :             timeouts[cnt].id = STANDBY_TIMEOUT;
     819          18 :             timeouts[cnt].type = TMPARAM_AT;
     820          18 :             timeouts[cnt].fin_time = ltime;
     821          18 :             cnt++;
     822             :         }
     823             : 
     824          18 :         got_standby_deadlock_timeout = false;
     825          18 :         timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
     826          18 :         timeouts[cnt].type = TMPARAM_AFTER;
     827          18 :         timeouts[cnt].delay_ms = DeadlockTimeout;
     828          18 :         cnt++;
     829             : 
     830          18 :         enable_timeouts(timeouts, cnt);
     831             :     }
     832             : 
     833             :     /*
     834             :      * Wait to be signaled by UnpinBuffer() or for the wait to be interrupted
     835             :      * by one of the timeouts established above.
     836             :      *
     837             :      * We assume that only UnpinBuffer() and the timeout requests established
     838             :      * above can wake us up here. WakeupRecovery() called by walreceiver or
     839             :      * SIGHUP signal handler, etc cannot do that because it uses a different
     840             :      * latch from the one that ProcWaitForSignal() waits on.
     841             :      */
     842          20 :     ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
     843             : 
     844          20 :     if (got_standby_delay_timeout)
     845           2 :         SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
     846          18 :     else if (got_standby_deadlock_timeout)
     847             :     {
     848             :         /*
     849             :          * Send out a request for hot-standby backends to check themselves for
     850             :          * deadlocks.
     851             :          *
     852             :          * XXX The subsequent ResolveRecoveryConflictWithBufferPin() will wait
     853             :          * to be signaled by UnpinBuffer() again and send a request for
     854             :          * a deadlock check if deadlock_timeout happens. This causes the
     855             :          * request to continue to be sent every deadlock_timeout until the
     856             :          * buffer is unpinned or ltime is reached. This would increase the
     857             :          * workload in the startup process and backends. In practice it may
     858             :          * not be so harmful because the period that the buffer is kept pinned
     859             :          * is basically not so long. But we should fix this?
     860             :          */
     861          12 :         SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
     862             :     }
     863             : 
     864             :     /*
     865             :      * Clear any timeout requests established above.  We assume here that the
     866             :      * Startup process doesn't have any other timeouts than what this function
     867             :      * uses.  If that stops being true, we could cancel the timeouts
     868             :      * individually, but that'd be slower.
     869             :      */
     870          20 :     disable_all_timeouts(false);
     871          20 :     got_standby_delay_timeout = false;
     872          20 :     got_standby_deadlock_timeout = false;
     873          20 : }
     874             : 
     875             : static void
     876          16 : SendRecoveryConflictWithBufferPin(ProcSignalReason reason)
     877             : {
     878             :     Assert(reason == PROCSIG_RECOVERY_CONFLICT_BUFFERPIN ||
     879             :            reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
     880             : 
     881             :     /*
     882             :      * We send the signal to all backends to ask them if they are holding the
     883             :      * buffer pin which is delaying the Startup process. We must not set the
     884             :      * conflict flag yet, since most backends will be innocent. Let the
     885             :      * SIGUSR1 handling in each backend decide their own fate.
     886             :      */
     887          16 :     CancelDBBackends(InvalidOid, reason, false);
     888          16 : }
     889             : 
     890             : /*
     891             :  * In Hot Standby, perform early deadlock detection.  We abort the lock
     892             :  * wait if we are about to sleep while holding the buffer pin that Startup
     893             :  * process is waiting for, by raising ERROR with ERRCODE_T_R_DEADLOCK_DETECTED.
     894             :  *
     895             :  * Note: this code is pessimistic, because there is no way for it to
     896             :  * determine whether an actual deadlock condition is present: the lock we
     897             :  * need to wait for might be unrelated to any held by the Startup process.
     898             :  * Sooner or later, this mechanism should get ripped out in favor of somehow
     899             :  * accounting for buffer locks in DeadLockCheck().  However, errors here
     900             :  * seem to be very low-probability in practice, so for now it's not worth
     901             :  * the trouble.
     902             :  */
     903             : void
     904           2 : CheckRecoveryConflictDeadlock(void)
     905             : {
     906             :     Assert(!InRecovery);        /* do not call in Startup process */
     907             : 
     908           2 :     if (!HoldingBufferPinThatDelaysRecovery())
     909           2 :         return;
     910             : 
     911             :     /*
     912             :      * Error message should match ProcessInterrupts() but we avoid calling
     913             :      * that because we aren't handling an interrupt at this point. Note that
     914             :      * we only cancel the current transaction here, so if we are in a
     915             :      * subtransaction and the pin is held by a parent, then the Startup
     916             :      * process will continue to wait even though we have avoided deadlock.
     917             :      */
     918           0 :     ereport(ERROR,
     919             :             (errcode(ERRCODE_T_R_DEADLOCK_DETECTED),
     920             :              errmsg("canceling statement due to conflict with recovery"),
     921             :              errdetail("User transaction caused buffer deadlock with recovery.")));
     922             : }
     923             : 
     924             : 
     925             : /* --------------------------------
     926             :  *      timeout handler routines
     927             :  * --------------------------------
     928             :  */
     929             : 
     930             : /*
     931             :  * StandbyDeadLockHandler() will be called if STANDBY_DEADLOCK_TIMEOUT is
     932             :  * exceeded; it only sets got_standby_deadlock_timeout for the waiter to check.
     933             :  */
     934             : void
     935          16 : StandbyDeadLockHandler(void)
     936             : {
     937          16 :     got_standby_deadlock_timeout = true;
     938          16 : }
     939             : 
     940             : /*
     941             :  * StandbyTimeoutHandler() will be called if STANDBY_TIMEOUT is exceeded; it only sets got_standby_delay_timeout.
     942             :  */
     943             : void
     944           2 : StandbyTimeoutHandler(void)
     945             : {
     946           2 :     got_standby_delay_timeout = true;
     947           2 : }
     948             : 
     949             : /*
     950             :  * StandbyLockTimeoutHandler() will be called if STANDBY_LOCK_TIMEOUT is exceeded; it only sets got_standby_lock_timeout.
     951             :  */
     952             : void
     953           2 : StandbyLockTimeoutHandler(void)
     954             : {
     955           2 :     got_standby_lock_timeout = true;
     956           2 : }
     957             : 
     958             : /*
     959             :  * -----------------------------------------------------
     960             :  * Locking in Recovery Mode
     961             :  * -----------------------------------------------------
     962             :  *
     963             :  * All locks are held by the Startup process using a single virtual
     964             :  * transaction. This implementation is both simpler and, in some senses,
     965             :  * more correct. The locks held mean "some original transaction held
     966             :  * this lock, so query access is not allowed at this time". So the Startup
     967             :  * process is the proxy by which the original locks are implemented.
     968             :  *
     969             :  * We only keep track of AccessExclusiveLocks, which are only ever held by
     970             :  * one transaction on one relation.
     971             :  *
     972             :  * We keep a table of known locks in the RecoveryLockHash hash table.
     973             :  * The point of that table is to let us efficiently de-duplicate locks,
     974             :  * which is important because checkpoints will re-report the same locks
     975             :  * already held.  There is also a RecoveryLockXidHash table with one entry
     976             :  * per xid, which allows us to efficiently find all the locks held by a
     977             :  * given original transaction.
     978             :  *
     979             :  * We use session locks rather than normal locks so we don't need
     980             :  * ResourceOwners.
     981             :  */
     982             : 
     983             : 
     984             : void
     985       43256 : StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
     986             : {
     987             :     RecoveryLockXidEntry *xidentry;
     988             :     RecoveryLockEntry *lockentry;
     989             :     xl_standby_lock key;
     990             :     LOCKTAG     locktag;
     991             :     bool        found;
     992             : 
     993             :     /* Nothing to do for an invalid xid, or one that already committed or aborted */
     994       86512 :     if (!TransactionIdIsValid(xid) ||
     995       86504 :         TransactionIdDidCommit(xid) ||
     996       43248 :         TransactionIdDidAbort(xid))
     997           8 :         return;
     998             : 
     999       43248 :     elog(DEBUG4, "adding recovery lock: db %u rel %u", dbOid, relOid);
    1000             : 
    1001             :     /* dbOid is InvalidOid when we are locking a shared relation. */
    1002             :     Assert(OidIsValid(relOid));
    1003             : 
    1004             :     /* Create a hash entry for this xid, if we don't have one already. */
    1005       43248 :     xidentry = hash_search(RecoveryLockXidHash, &xid, HASH_ENTER, &found);
    1006       43248 :     if (!found)
    1007             :     {
    1008             :         Assert(xidentry->xid == xid);    /* dynahash should have set this */
    1009       19318 :         xidentry->head = NULL;
    1010             :     }
    1011             : 
    1012             :     /* Create a hash entry for this lock, unless we have one already. */
    1013       43248 :     key.xid = xid;
    1014       43248 :     key.dbOid = dbOid;
    1015       43248 :     key.relOid = relOid;
    1016       43248 :     lockentry = hash_search(RecoveryLockHash, &key, HASH_ENTER, &found);
    1017       43248 :     if (!found)
    1018             :     {
    1019             :         /* It's new, so push it onto the front of the XID's list ... */
    1020       43072 :         lockentry->next = xidentry->head;
    1021       43072 :         xidentry->head = lockentry;
    1022             : 
    1023             :         /* ... and acquire the lock locally. */
    1024       43072 :         SET_LOCKTAG_RELATION(locktag, dbOid, relOid);
    1025             : 
    1026       43072 :         (void) LockAcquire(&locktag, AccessExclusiveLock, true, false);
    1027             :     }
    1028             : }
    1029             : 
    1030             : /*
    1031             :  * Release all the locks associated with this RecoveryLockXidEntry, removing each one from RecoveryLockHash.
    1032             :  */
    1033             : static void
    1034       19318 : StandbyReleaseXidEntryLocks(RecoveryLockXidEntry *xidentry)
    1035             : {
    1036             :     RecoveryLockEntry *entry;
    1037             :     RecoveryLockEntry *next;
    1038             : 
    1039       62390 :     for (entry = xidentry->head; entry != NULL; entry = next)
    1040             :     {
    1041             :         LOCKTAG     locktag;
    1042             : 
    1043       43072 :         elog(DEBUG4,
    1044             :              "releasing recovery lock: xid %u db %u rel %u",
    1045             :              entry->key.xid, entry->key.dbOid, entry->key.relOid);
    1046             :         /* Release the lock ... */
    1047       43072 :         SET_LOCKTAG_RELATION(locktag, entry->key.dbOid, entry->key.relOid);
    1048       43072 :         if (!LockRelease(&locktag, AccessExclusiveLock, true))
    1049             :         {
    1050           0 :             elog(LOG,
    1051             :                  "RecoveryLockHash contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
    1052             :                  entry->key.xid, entry->key.dbOid, entry->key.relOid);
    1053             :             Assert(false);
    1054             :         }
    1055             :         /* ... and remove the per-lock hash entry, fetching its next link first */
    1056       43072 :         next = entry->next;
    1057       43072 :         hash_search(RecoveryLockHash, entry, HASH_REMOVE, NULL);
    1058             :     }
    1059             : 
    1060       19318 :     xidentry->head = NULL;       /* just for paranoia */
    1061       19318 : }
    1062             : 
    1063             : /*
    1064             :  * Release locks for a specific XID, or all locks if the XID is invalid.
    1065             :  */
    1066             : static void
    1067       20668 : StandbyReleaseLocks(TransactionId xid)
    1068             : {
    1069             :     RecoveryLockXidEntry *entry;
    1070             : 
    1071       20668 :     if (TransactionIdIsValid(xid))
    1072             :     {
    1073       20668 :         if ((entry = hash_search(RecoveryLockXidHash, &xid, HASH_FIND, NULL)))
    1074             :         {
    1075       19318 :             StandbyReleaseXidEntryLocks(entry);
    1076       19318 :             hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
    1077             :         }
    1078             :     }
    1079             :     else
    1080           0 :         StandbyReleaseAllLocks();
    1081       20668 : }
    1082             : 
    1083             : /*
    1084             :  * Release locks for a transaction tree, starting at xid down, from
    1085             :  * RecoveryLockXidHash.
    1086             :  *
    1087             :  * Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode,
    1088             :  * to remove any AccessExclusiveLocks requested by a transaction.
                     :  *
                     :  * xid is released first, followed by each of the nsubxids entries of
                     :  * subxids[], in array order; every one is handed to StandbyReleaseLocks().
                     :  * subxids is not dereferenced when nsubxids is zero or negative.
    1089             :  */
    1090             : void
    1091       19668 : StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids)
    1092             : {
    1093             :     int         i;
    1094             : 
    1095       19668 :     StandbyReleaseLocks(xid);
    1096             : 
    1097       20668 :     for (i = 0; i < nsubxids; i++)
    1098        1000 :         StandbyReleaseLocks(subxids[i]);
    1099       19668 : }
    1100             : 
    1101             : /*
    1102             :  * Called at end of recovery and when we see a shutdown checkpoint.
                     :  * Also reached from StandbyReleaseLocks() when it is given an invalid xid.
                     :  *
                     :  * Scans every entry of RecoveryLockXidHash; for each one, releases its
                     :  * locks via StandbyReleaseXidEntryLocks() and removes the entry from the
                     :  * hash.  A DEBUG2 message is logged before the scan starts.
    1103             :  */
    1104             : void
    1105         186 : StandbyReleaseAllLocks(void)
    1106             : {
    1107             :     HASH_SEQ_STATUS status;
    1108             :     RecoveryLockXidEntry *entry;
    1109             : 
    1110         186 :     elog(DEBUG2, "release all standby locks");
    1111             : 
    1112         186 :     hash_seq_init(&status, RecoveryLockXidHash);
    1113         186 :     while ((entry = hash_seq_search(&status)))
    1114             :     {
    1115           0 :         StandbyReleaseXidEntryLocks(entry);
    1116           0 :         hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
    1117             :     }
    1118         186 : }
    1119             : 
    1120             : /*
    1121             :  * StandbyReleaseOldLocks
    1122             :  *      Release standby locks held by top-level XIDs that aren't running,
    1123             :  *      as long as they're not prepared transactions.
                     :  *
                     :  * "Not running" is decided here as: the entry's xid precedes oldxid
                     :  * according to TransactionIdPrecedes().  Entries for which
                     :  * StandbyTransactionIdIsPrepared() returns true are always kept,
                     :  * whatever their xid.
    1124             :  */
    1125             : void
    1126         508 : StandbyReleaseOldLocks(TransactionId oldxid)
    1127             : {
    1128             :     HASH_SEQ_STATUS status;
    1129             :     RecoveryLockXidEntry *entry;
    1130             : 
    1131         508 :     hash_seq_init(&status, RecoveryLockXidHash);
    1132         548 :     while ((entry = hash_seq_search(&status)))
    1133             :     {
    1134             :         Assert(TransactionIdIsValid(entry->xid));
    1135             : 
    1136             :         /* Skip if prepared transaction. */
    1137          40 :         if (StandbyTransactionIdIsPrepared(entry->xid))
    1138           0 :             continue;
    1139             : 
    1140             :         /* Skip if >= oldxid. */
    1141          40 :         if (!TransactionIdPrecedes(entry->xid, oldxid))
    1142          40 :             continue;
    1143             : 
    1144             :         /* Remove all locks and hash table entry. */
    1145           0 :         StandbyReleaseXidEntryLocks(entry);
    1146           0 :         hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
    1147             :     }
    1148         508 : }
    1149             : 
    1150             : /*
    1151             :  * --------------------------------------------------------------------
    1152             :  *      Recovery handling for Rmgr RM_STANDBY_ID
    1153             :  *
    1154             :  * These record types will only be created if XLogStandbyInfoActive()
                     :  *
                     :  * standby_redo() replays one record, dispatching on its info bits:
                     :  *   XLOG_STANDBY_LOCK  - call StandbyAcquireAccessExclusiveLock() for
                     :  *                        each of the nlocks entries in the record
                     :  *   XLOG_RUNNING_XACTS - copy the logged fields into a
                     :  *                        RunningTransactionsData, pass it to
                     :  *                        ProcArrayApplyRecoveryInfo(), then call
                     :  *                        pgstat_report_stat()
                     :  *   XLOG_INVALIDATIONS - call ProcessCommittedInvalidationMessages()
                     :  * Any other info value raises a PANIC.  Nothing at all is done while
                     :  * standbyState is STANDBY_DISABLED.
    1155             :  * --------------------------------------------------------------------
    1156             :  */
    1157             : 
    1158             : void
    1159       44892 : standby_redo(XLogReaderState *record)
    1160             : {
    1161       44892 :     uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    1162             : 
    1163             :     /* Backup blocks are not used in standby records */
    1164             :     Assert(!XLogRecHasAnyBlockRefs(record));
    1165             : 
    1166             :     /* Do nothing if we're not in hot standby mode */
    1167       44892 :     if (standbyState == STANDBY_DISABLED)
    1168         280 :         return;
    1169             : 
    1170       44612 :     if (info == XLOG_STANDBY_LOCK)
    1171             :     {
    1172       43156 :         xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record);
    1173             :         int         i;
    1174             : 
    1175       86412 :         for (i = 0; i < xlrec->nlocks; i++)
    1176       43256 :             StandbyAcquireAccessExclusiveLock(xlrec->locks[i].xid,
    1177             :                                               xlrec->locks[i].dbOid,
    1178             :                                               xlrec->locks[i].relOid);
    1179             :     }
    1180        1456 :     else if (info == XLOG_RUNNING_XACTS)
    1181             :     {
    1182         408 :         xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record);
    1183             :         RunningTransactionsData running;
    1184             : 
    1185         408 :         running.xcnt = xlrec->xcnt;
    1186         408 :         running.subxcnt = xlrec->subxcnt;
    1187         408 :         running.subxid_overflow = xlrec->subxid_overflow;
    1188         408 :         running.nextXid = xlrec->nextXid;
    1189         408 :         running.latestCompletedXid = xlrec->latestCompletedXid;
    1190         408 :         running.oldestRunningXid = xlrec->oldestRunningXid;
    1191         408 :         running.xids = xlrec->xids;
    1192             : 
    1193         408 :         ProcArrayApplyRecoveryInfo(&running);
    1194             : 
    1195             :         /*
    1196             :          * The startup process currently has no convenient way to schedule
    1197             :          * stats to be reported. XLOG_RUNNING_XACTS records are issued at a
    1198             :          * regular cadence, making this a convenient location to report stats.
    1199             :          * While these records aren't generated with wal_level=minimal, stats
    1200             :          * also cannot be accessed during WAL replay.
    1201             :          */
    1202         408 :         pgstat_report_stat(true);
    1203             :     }
    1204        1048 :     else if (info == XLOG_INVALIDATIONS)
    1205             :     {
    1206        1048 :         xl_invalidations *xlrec = (xl_invalidations *) XLogRecGetData(record);
    1207             : 
    1208        1048 :         ProcessCommittedInvalidationMessages(xlrec->msgs,
    1209             :                                              xlrec->nmsgs,
    1210        1048 :                                              xlrec->relcacheInitFileInval,
    1211             :                                              xlrec->dbId,
    1212             :                                              xlrec->tsId);
    1213             :     }
    1214             :     else
    1215           0 :         elog(PANIC, "standby_redo: unknown op code %u", info);
    1216             : }
    1217             : 
    1218             : /*
    1219             :  * Log details of the current snapshot to WAL. This allows the snapshot state
    1220             :  * to be reconstructed on the standby and for logical decoding.
    1221             :  *
    1222             :  * This is used for Hot Standby as follows:
    1223             :  *
    1224             :  * We can move directly to STANDBY_SNAPSHOT_READY at startup if we
    1225             :  * start from a shutdown checkpoint because we know nothing was running
    1226             :  * at that time and our recovery snapshot is known empty. In the more
    1227             :  * typical case of an online checkpoint we need to jump through a few
    1228             :  * hoops to get a correct recovery snapshot and this requires a two or
    1229             :  * sometimes a three stage process.
    1230             :  *
    1231             :  * The initial snapshot must contain all running xids and all current
    1232             :  * AccessExclusiveLocks at a point in time on the standby. Assembling
    1233             :  * that information while the server is running requires many and
    1234             :  * various LWLocks, so we choose to derive that information piece by
    1235             :  * piece and then re-assemble that info on the standby. When that
    1236             :  * information is fully assembled we move to STANDBY_SNAPSHOT_READY.
    1237             :  *
    1238             :  * Since locking on the primary when we derive the information is not
    1239             :  * strict, we note that there is a time window between the derivation and
    1240             :  * writing to WAL of the derived information. That allows race conditions
    1241             :  * that we must resolve, since xids and locks may enter or leave the
    1242             :  * snapshot during that window. This creates the issue that an xid or
    1243             :  * lock may start *after* the snapshot has been derived yet *before* the
    1244             :  * snapshot is logged in the running xacts WAL record. We resolve this by
    1245             :  * starting to accumulate changes at a point just prior to when we derive
    1246             :  * the snapshot on the primary, then ignore duplicates when we later apply
    1247             :  * the snapshot from the running xacts record. This is implemented during
    1248             :  * CreateCheckPoint() where we use the logical checkpoint location as
    1249             :  * our starting point and then write the running xacts record immediately
    1250             :  * before writing the main checkpoint WAL record. Since we always start
    1251             :  * up from a checkpoint and are immediately at our starting point, we
    1252             :  * unconditionally move to STANDBY_INITIALIZED. After this point we
    1253             :  * must do 4 things:
    1254             :  *  * move shared nextXid forwards as we see new xids
    1255             :  *  * extend the clog and subtrans with each new xid
    1256             :  *  * keep track of uncommitted known assigned xids
    1257             :  *  * keep track of uncommitted AccessExclusiveLocks
    1258             :  *
    1259             :  * When we see a commit/abort we must remove known assigned xids and locks
    1260             :  * from the completing transaction. Attempted removals that cannot locate
    1261             :  * an entry are expected and must not cause an error when we are in state
    1262             :  * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and
    1263             :  * KnownAssignedXidsRemove().
    1264             :  *
    1265             :  * Later, when we apply the running xact data we must be careful to ignore
    1266             :  * transactions already committed, since those commits raced ahead when
    1267             :  * making WAL entries.
    1268             :  *
    1269             :  * The loose timing also means that locks may be recorded that have a
    1270             :  * zero xid, since xids are removed from procs before locks are removed.
    1271             :  * So we must prune the lock list down to ensure we hold locks only for
    1272             :  * currently running xids, performed by StandbyReleaseOldLocks().
    1273             :  * Zero xids should no longer be possible, but we may be replaying WAL
    1274             :  * from a time when they were possible.
    1275             :  *
    1276             :  * For logical decoding only the running xacts information is needed;
    1277             :  * there's no need to look at the locking information, but it's logged anyway,
    1278             :  * as there's no independent knob to just enable logical decoding. For
    1279             :  * details of how this is used, check snapbuild.c's introductory comment.
    1280             :  *
    1281             :  *
    1282             :  * Returns the RecPtr of the last inserted record.
    1283             :  */
    1284             : XLogRecPtr
    1285        1482 : LogStandbySnapshot(void)
    1286             : {
    1287             :     XLogRecPtr  recptr;
    1288             :     RunningTransactions running;
    1289             :     xl_standby_lock *locks;
    1290             :     int         nlocks;
    1291             : 
    1292             :     Assert(XLogStandbyInfoActive());
    1293             : 
    1294             :     /*
    1295             :      * Get details of any AccessExclusiveLocks being held at the moment.
                     :      * They are logged only if there is at least one; the array returned
                     :      * by GetRunningTransactionLocks() is freed either way.
    1296             :      */
    1297        1482 :     locks = GetRunningTransactionLocks(&nlocks);
    1298        1482 :     if (nlocks > 0)
    1299          50 :         LogAccessExclusiveLocks(nlocks, locks);
    1300        1482 :     pfree(locks);
    1301             : 
    1302             :     /*
    1303             :      * Log details of all in-progress transactions. This should be the last
    1304             :      * record we write, because standby will open up when it sees this.
    1305             :      */
    1306        1482 :     running = GetRunningTransactionData();
    1307             : 
    1308             :     /*
    1309             :      * GetRunningTransactionData() acquired ProcArrayLock, we must release it.
    1310             :      * For Hot Standby this can be done before inserting the WAL record
    1311             :      * because ProcArrayApplyRecoveryInfo() rechecks the commit status using
    1312             :      * the clog. For logical decoding, though, the lock can't be released
    1313             :      * early because the clog might be "in the future" from the POV of the
    1314             :      * historic snapshot. This would allow for situations where we're waiting
    1315             :      * for the end of a transaction listed in the xl_running_xacts record
    1316             :      * which, according to the WAL, has committed before the xl_running_xacts
    1317             :      * record. Fortunately this routine isn't executed frequently, and it's
    1318             :      * only a shared lock.
    1319             :      */
    1320        1482 :     if (wal_level < WAL_LEVEL_LOGICAL)
    1321         612 :         LWLockRelease(ProcArrayLock);
    1322             : 
    1323        1482 :     recptr = LogCurrentRunningXacts(running);
    1324             : 
    1325             :     /* Release ProcArrayLock now if we kept it across the WAL insertion */
    1326        1482 :     if (wal_level >= WAL_LEVEL_LOGICAL)
    1327         870 :         LWLockRelease(ProcArrayLock);
    1328             : 
    1329             :     /* GetRunningTransactionData() acquired XidGenLock, we must release it */
    1330        1482 :     LWLockRelease(XidGenLock);
    1331             : 
    1332        1482 :     return recptr;
    1333             : }
    1334             : 
    1335             : /*
    1336             :  * Record an enhanced snapshot of running transactions into WAL.
    1337             :  *
    1338             :  * The definitions of RunningTransactionsData and xl_running_xacts are
    1339             :  * similar. We keep them separate because xl_running_xacts is a contiguous
    1340             :  * chunk of memory and never exists fully until it is assembled in WAL.
    1341             :  * The inserted records are marked as not being important for durability,
    1342             :  * to avoid triggering superfluous checkpoint / archiving activity.
                     :  *
                     :  * CurrRunningXacts supplies the counts, the xid horizon fields and the
                     :  * xids array.  The fixed-size part of the record
                     :  * (MinSizeOfXactRunningXacts bytes) is always written; the xids array,
                     :  * holding xcnt + subxcnt entries, is appended only when xcnt > 0.
                     :  *
                     :  * Returns the XLogRecPtr that XLogInsert() reported for the record.
    1343             :  */
    1344             : static XLogRecPtr
    1345        1482 : LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
    1346             : {
    1347             :     xl_running_xacts xlrec;
    1348             :     XLogRecPtr  recptr;
    1349             : 
    1350        1482 :     xlrec.xcnt = CurrRunningXacts->xcnt;
    1351        1482 :     xlrec.subxcnt = CurrRunningXacts->subxcnt;
    1352        1482 :     xlrec.subxid_overflow = CurrRunningXacts->subxid_overflow;
    1353        1482 :     xlrec.nextXid = CurrRunningXacts->nextXid;
    1354        1482 :     xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
    1355        1482 :     xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid;
    1356             : 
    1357             :     /* Header */
    1358        1482 :     XLogBeginInsert();
    1359        1482 :     XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
    1360        1482 :     XLogRegisterData((char *) (&xlrec), MinSizeOfXactRunningXacts);
    1361             : 
    1362             :     /* array of TransactionIds */
    1363        1482 :     if (xlrec.xcnt > 0)
    1364         464 :         XLogRegisterData((char *) CurrRunningXacts->xids,
    1365         464 :                          (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId));
    1366             : 
    1367        1482 :     recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS);
    1368             : 
    1369        1482 :     if (CurrRunningXacts->subxid_overflow)
    1370           4 :         elog(DEBUG2,
    1371             :              "snapshot of %d running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
    1372             :              CurrRunningXacts->xcnt,
    1373             :              LSN_FORMAT_ARGS(recptr),
    1374             :              CurrRunningXacts->oldestRunningXid,
    1375             :              CurrRunningXacts->latestCompletedXid,
    1376             :              CurrRunningXacts->nextXid);
    1377             :     else
    1378        1478 :         elog(DEBUG2,
    1379             :              "snapshot of %d+%d running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
    1380             :              CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt,
    1381             :              LSN_FORMAT_ARGS(recptr),
    1382             :              CurrRunningXacts->oldestRunningXid,
    1383             :              CurrRunningXacts->latestCompletedXid,
    1384             :              CurrRunningXacts->nextXid);
    1385             : 
    1386             :     /*
    1387             :      * Ensure running_xacts information is synced to disk not too far in the
    1388             :      * future. We don't want to stall anything though (i.e. use XLogFlush()),
    1389             :      * so we let the wal writer do it during normal operation.
    1390             :      * XLogSetAsyncXactLSN() conveniently will mark the LSN as to-be-synced
    1391             :      * and nudge the WALWriter into action if sleeping. Check
    1392             :      * XLogBackgroundFlush() for details why a record might not be flushed
    1393             :      * without it.
    1394             :      */
    1395        1482 :     XLogSetAsyncXactLSN(recptr);
    1396             : 
    1397        1482 :     return recptr;
    1398             : }
    1399             : 
    1400             : /*
    1401             :  * Wholesale logging of AccessExclusiveLocks. Other lock types need not be
    1402             :  * logged, as described in backend/storage/lmgr/README.
                     :  *
                     :  * Writes a single XLOG_STANDBY_LOCK record: the xl_standby_locks header
                     :  * up to (but not including) its locks member, followed by the nlocks
                     :  * entries of locks[].  The record is flagged XLOG_MARK_UNIMPORTANT, and
                     :  * the position returned by XLogInsert() is deliberately discarded.
    1403             :  */
    1404             : static void
    1405      163860 : LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks)
    1406             : {
    1407             :     xl_standby_locks xlrec;
    1408             : 
    1409      163860 :     xlrec.nlocks = nlocks;
    1410             : 
    1411      163860 :     XLogBeginInsert();
    1412      163860 :     XLogRegisterData((char *) &xlrec, offsetof(xl_standby_locks, locks));
    1413      163860 :     XLogRegisterData((char *) locks, nlocks * sizeof(xl_standby_lock));
    1414      163860 :     XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
    1415             : 
    1416      163860 :     (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK);
    1417      163860 : }
    1418             : 
    1419             : /*
    1420             :  * Individual logging of AccessExclusiveLocks for use during LockAcquire()
                     :  *
                     :  * Logs one lock on relation relOid of database dbOid, tagged with the xid
                     :  * returned by GetCurrentTransactionId(), by passing a one-element array
                     :  * to LogAccessExclusiveLocks().  Afterwards
                     :  * XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK is set in MyXactFlags.
    1421             :  */
    1422             : void
    1423      163810 : LogAccessExclusiveLock(Oid dbOid, Oid relOid)
    1424             : {
    1425             :     xl_standby_lock xlrec;
    1426             : 
    1427      163810 :     xlrec.xid = GetCurrentTransactionId();
    1428             : 
    1429      163810 :     xlrec.dbOid = dbOid;
    1430      163810 :     xlrec.relOid = relOid;
    1431             : 
    1432      163810 :     LogAccessExclusiveLocks(1, &xlrec);
    1433      163810 :     MyXactFlags |= XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK;
    1434      163810 : }
    1435             : 
    1436             : /*
    1437             :  * Prepare to log an AccessExclusiveLock, for use during LockAcquire()
                     :  *
                     :  * The only work done is the GetCurrentTransactionId() call, whose return
                     :  * value is discarded; the comment in the body explains why the call is
                     :  * needed.
    1438             :  */
    1439             : void
    1440      164234 : LogAccessExclusiveLockPrepare(void)
    1441             : {
    1442             :     /*
    1443             :      * Ensure that a TransactionId has been assigned to this transaction, for
    1444             :      * two reasons, both related to lock release on the standby. First, we
    1445             :      * must assign an xid so that RecordTransactionCommit() and
    1446             :      * RecordTransactionAbort() do not optimise away the transaction
    1447             :      * completion record which recovery relies upon to release locks. It's a
    1448             :      * hack, but for a corner case not worth adding code for into the main
    1449             :      * commit path. Second, we must assign an xid before the lock is recorded
    1450             :      * in shared memory, otherwise a concurrently executing
    1451             :      * GetRunningTransactionLocks() might see a lock associated with an
    1452             :      * InvalidTransactionId which we later assert cannot happen.
    1453             :      */
    1454      164234 :     (void) GetCurrentTransactionId();
    1455      164234 : }
    1456             : 
    1457             : /*
    1458             :  * Emit WAL for invalidations. This currently is only used for commits without
    1459             :  * an xid but which contain invalidations.
                     :  *
                     :  * The XLOG_INVALIDATIONS record carries MyDatabaseId,
                     :  * MyDatabaseTableSpace, the relcacheInitFileInval flag and nmsgs in its
                     :  * fixed part (MinSizeOfInvalidations bytes), followed by the nmsgs
                     :  * entries of msgs[].  The header struct is zeroed with memset() before
                     :  * its fields are assigned.
    1460             :  */
    1461             : void
    1462       13940 : LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs,
    1463             :                         bool relcacheInitFileInval)
    1464             : {
    1465             :     xl_invalidations xlrec;
    1466             : 
    1467             :     /* prepare record */
    1468       13940 :     memset(&xlrec, 0, sizeof(xlrec));
    1469       13940 :     xlrec.dbId = MyDatabaseId;
    1470       13940 :     xlrec.tsId = MyDatabaseTableSpace;
    1471       13940 :     xlrec.relcacheInitFileInval = relcacheInitFileInval;
    1472       13940 :     xlrec.nmsgs = nmsgs;
    1473             : 
    1474             :     /* perform insertion */
    1475       13940 :     XLogBeginInsert();
    1476       13940 :     XLogRegisterData((char *) (&xlrec), MinSizeOfInvalidations);
    1477       13940 :     XLogRegisterData((char *) msgs,
    1478             :                      nmsgs * sizeof(SharedInvalidationMessage));
    1479       13940 :     XLogInsert(RM_STANDBY_ID, XLOG_INVALIDATIONS);
    1480       13940 : }
    1481             : 
    1482             : /*
                     :  * Return the description of recovery conflict
                     :  *
                     :  * Maps a PROCSIG_RECOVERY_CONFLICT_* reason to a message string, each of
                     :  * which is passed through _().  A reason with no case of its own falls
                     :  * into the default branch and yields the "unknown reason" string.
                     :  */
    1483             : static const char *
    1484          20 : get_recovery_conflict_desc(ProcSignalReason reason)
    1485             : {
    1486          20 :     const char *reasonDesc = _("unknown reason");
    1487             : 
    1488          20 :     switch (reason)
    1489             :     {
    1490           8 :         case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
    1491           8 :             reasonDesc = _("recovery conflict on buffer pin");
    1492           8 :             break;
    1493           4 :         case PROCSIG_RECOVERY_CONFLICT_LOCK:
    1494           4 :             reasonDesc = _("recovery conflict on lock");
    1495           4 :             break;
    1496           4 :         case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
    1497           4 :             reasonDesc = _("recovery conflict on tablespace");
    1498           4 :             break;
    1499           4 :         case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
    1500           4 :             reasonDesc = _("recovery conflict on snapshot");
    1501           4 :             break;
    1502           0 :         case PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT:
    1503           0 :             reasonDesc = _("recovery conflict on replication slot");
    1504           0 :             break;
    1505           0 :         case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
    1506           0 :             reasonDesc = _("recovery conflict on buffer deadlock");
    1507           0 :             break;
    1508           0 :         case PROCSIG_RECOVERY_CONFLICT_DATABASE:
    1509           0 :             reasonDesc = _("recovery conflict on database");
    1510           0 :             break;
    1511           0 :         default:
    1512           0 :             break;
    1513             :     }
    1514             : 
    1515          20 :     return reasonDesc;
    1516             : }

Generated by: LCOV version 1.14