LCOV - code coverage report
Current view: top level - src/backend/storage/ipc - standby.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 91.3 % 381 348
Test Date: 2026-04-06 13:16:11 Functions: 100.0 % 31 31
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * standby.c
       4              :  *    Misc functions used in Hot Standby mode.
       5              :  *
       6              :  *  All functions for handling RM_STANDBY_ID, which relate to
       7              :  *  AccessExclusiveLocks and starting snapshots for Hot Standby mode.
       8              :  *  Plus conflict recovery processing.
       9              :  *
      10              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      11              :  * Portions Copyright (c) 1994, Regents of the University of California
      12              :  *
      13              :  * IDENTIFICATION
      14              :  *    src/backend/storage/ipc/standby.c
      15              :  *
      16              :  *-------------------------------------------------------------------------
      17              :  */
      18              : #include "postgres.h"
      19              : #include "access/transam.h"
      20              : #include "access/twophase.h"
      21              : #include "access/xact.h"
      22              : #include "access/xloginsert.h"
      23              : #include "access/xlogrecovery.h"
      24              : #include "access/xlogutils.h"
      25              : #include "miscadmin.h"
      26              : #include "pgstat.h"
      27              : #include "replication/slot.h"
      28              : #include "storage/bufmgr.h"
      29              : #include "storage/proc.h"
      30              : #include "storage/procarray.h"
      31              : #include "storage/sinvaladt.h"
      32              : #include "storage/standby.h"
      33              : #include "utils/hsearch.h"
      34              : #include "utils/injection_point.h"
      35              : #include "utils/ps_status.h"
      36              : #include "utils/timeout.h"
      37              : #include "utils/timestamp.h"
      38              : #include "utils/wait_event.h"
      39              : 
      40              : /* User-settable GUC parameters */
      41              : int         max_standby_archive_delay = 30 * 1000;
      42              : int         max_standby_streaming_delay = 30 * 1000;
      43              : bool        log_recovery_conflict_waits = false;
      44              : 
      45              : /*
      46              :  * Keep track of all the exclusive locks owned by original transactions.
      47              :  * For each known exclusive lock, there is a RecoveryLockEntry in the
      48              :  * RecoveryLockHash hash table.  All RecoveryLockEntrys belonging to a
      49              :  * given XID are chained together so that we can find them easily.
      50              :  * For each original transaction that is known to have any such locks,
      51              :  * there is a RecoveryLockXidEntry in the RecoveryLockXidHash hash table,
      52              :  * which stores the head of the chain of its locks.
      53              :  */
      54              : typedef struct RecoveryLockEntry
      55              : {
      56              :     xl_standby_lock key;        /* hash key: xid, dbOid, relOid */
      57              :     struct RecoveryLockEntry *next; /* chain link */
      58              : } RecoveryLockEntry;
      59              : 
      60              : typedef struct RecoveryLockXidEntry
      61              : {
      62              :     TransactionId xid;          /* hash key -- must be first */
      63              :     struct RecoveryLockEntry *head; /* chain head */
      64              : } RecoveryLockXidEntry;
      65              : 
      66              : static HTAB *RecoveryLockHash = NULL;
      67              : static HTAB *RecoveryLockXidHash = NULL;
      68              : 
      69              : /* Flags set by timeout handlers */
      70              : static volatile sig_atomic_t got_standby_deadlock_timeout = false;
      71              : static volatile sig_atomic_t got_standby_delay_timeout = false;
      72              : static volatile sig_atomic_t got_standby_lock_timeout = false;
      73              : 
      74              : static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
      75              :                                                    RecoveryConflictReason reason,
      76              :                                                    uint32 wait_event_info,
      77              :                                                    bool report_waiting);
      78              : static void SendRecoveryConflictWithBufferPin(RecoveryConflictReason reason);
      79              : static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts);
      80              : static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);
      81              : static const char *get_recovery_conflict_desc(RecoveryConflictReason reason);
      82              : 
      83              : /*
      84              :  * InitRecoveryTransactionEnvironment
      85              :  *      Initialize tracking of our primary's in-progress transactions.
      86              :  *
      87              :  * We need to issue shared invalidations and hold locks. Holding locks
      88              :  * means others may want to wait on us, so we need to make a lock table
      89              :  * vxact entry like a real transaction. We could create and delete
      90              :  * lock table entries for each transaction but it's simpler just to create
      91              :  * one permanent entry and leave it there all the time. Locks are then
      92              :  * acquired and released as needed. Yes, this means you can see the
      93              :  * Startup process in pg_locks once we have run this.
      94              :  */
      95              : void
      96          116 : InitRecoveryTransactionEnvironment(void)
      97              : {
      98              :     VirtualTransactionId vxid;
      99              :     HASHCTL     hash_ctl;
     100              : 
     101              :     Assert(RecoveryLockHash == NULL);   /* don't run this twice */
     102              : 
     103              :     /*
     104              :      * Initialize the hash tables for tracking the locks held by each
     105              :      * transaction.
     106              :      */
     107          116 :     hash_ctl.keysize = sizeof(xl_standby_lock);
     108          116 :     hash_ctl.entrysize = sizeof(RecoveryLockEntry);
     109          116 :     RecoveryLockHash = hash_create("RecoveryLockHash",
     110              :                                    64,
     111              :                                    &hash_ctl,
     112              :                                    HASH_ELEM | HASH_BLOBS);
     113          116 :     hash_ctl.keysize = sizeof(TransactionId);
     114          116 :     hash_ctl.entrysize = sizeof(RecoveryLockXidEntry);
     115          116 :     RecoveryLockXidHash = hash_create("RecoveryLockXidHash",
     116              :                                       64,
     117              :                                       &hash_ctl,
     118              :                                       HASH_ELEM | HASH_BLOBS);
     119              : 
     120              :     /*
     121              :      * Initialize shared invalidation management for Startup process, being
     122              :      * careful to register ourselves as a sendOnly process so we don't need to
     123              :      * read messages, nor will we get signaled when the queue starts filling
     124              :      * up.
     125              :      */
     126          116 :     SharedInvalBackendInit(true);
     127              : 
     128              :     /*
     129              :      * Lock a virtual transaction id for Startup process.
     130              :      *
     131              :      * We need to do GetNextLocalTransactionId() because
     132              :      * SharedInvalBackendInit() leaves localTransactionId invalid and the lock
     133              :      * manager doesn't like that at all.
     134              :      *
     135              :      * Note that we don't need to run XactLockTableInsert() because nobody
     136              :      * needs to wait on xids. That sounds a little strange, but table locks
     137              :      * are held by vxids and row level locks are held by xids. All queries
     138              :      * hold AccessShareLocks so never block while we write or lock new rows.
     139              :      */
     140          116 :     MyProc->vxid.procNumber = MyProcNumber;
     141          116 :     vxid.procNumber = MyProcNumber;
     142          116 :     vxid.localTransactionId = GetNextLocalTransactionId();
     143          116 :     VirtualXactLockTableInsert(vxid);
     144              : 
     145          116 :     standbyState = STANDBY_INITIALIZED;
     146          116 : }
     147              : 
     148              : /*
     149              :  * ShutdownRecoveryTransactionEnvironment
     150              :  *      Shut down transaction tracking
     151              :  *
     152              :  * Prepare to switch from hot standby mode to normal operation. Shut down
     153              :  * recovery-time transaction tracking.
     154              :  *
     155              :  * This must be called even in shutdown of startup process if transaction
     156              :  * tracking has been initialized. Otherwise some locks the tracked
     157              :  * transactions were holding will not be released and may interfere with
     158              :  * the processes still running (but will exit soon) at the exit of
     159              :  * startup process.
     160              :  */
     161              : void
     162          171 : ShutdownRecoveryTransactionEnvironment(void)
     163              : {
     164              :     /*
     165              :      * Do nothing if RecoveryLockHash is NULL because that means that
     166              :      * transaction tracking has not yet been initialized or has already been
     167              :      * shut down.  This makes it safe to have possibly-redundant calls of this
     168              :      * function during process exit.
     169              :      */
     170          171 :     if (RecoveryLockHash == NULL)
     171           55 :         return;
     172              : 
     173              :     /* Mark all tracked in-progress transactions as finished. */
     174          116 :     ExpireAllKnownAssignedTransactionIds();
     175              : 
     176              :     /* Release all locks the tracked transactions were holding */
     177          116 :     StandbyReleaseAllLocks();
     178              : 
     179              :     /* Destroy the lock hash tables. */
     180          116 :     hash_destroy(RecoveryLockHash);
     181          116 :     hash_destroy(RecoveryLockXidHash);
     182          116 :     RecoveryLockHash = NULL;
     183          116 :     RecoveryLockXidHash = NULL;
     184              : 
     185              :     /* Cleanup our VirtualTransaction */
     186          116 :     VirtualXactLockTableCleanup();
     187              : }
     188              : 
     189              : 
     190              : /*
     191              :  * -----------------------------------------------------
     192              :  *      Standby wait timers and backend cancel logic
     193              :  * -----------------------------------------------------
     194              :  */
     195              : 
     196              : /*
     197              :  * Determine the cutoff time at which we want to start canceling conflicting
     198              :  * transactions.  Returns zero (a time safely in the past) if we are willing
     199              :  * to wait forever.
     200              :  */
     201              : static TimestampTz
     202           28 : GetStandbyLimitTime(void)
     203              : {
     204              :     TimestampTz rtime;
     205              :     bool        fromStream;
     206              : 
     207              :     /*
     208              :      * The cutoff time is the last WAL data receipt time plus the appropriate
     209              :      * delay variable.  Delay of -1 means wait forever.
     210              :      */
     211           28 :     GetXLogReceiptTime(&rtime, &fromStream);
     212           28 :     if (fromStream)
     213              :     {
     214           28 :         if (max_standby_streaming_delay < 0)
     215            0 :             return 0;           /* wait forever */
     216           28 :         return TimestampTzPlusMilliseconds(rtime, max_standby_streaming_delay);
     217              :     }
     218              :     else
     219              :     {
     220            0 :         if (max_standby_archive_delay < 0)
     221            0 :             return 0;           /* wait forever */
     222            0 :         return TimestampTzPlusMilliseconds(rtime, max_standby_archive_delay);
     223              :     }
     224              : }
     225              : 
     226              : #define STANDBY_INITIAL_WAIT_US  1000
     227              : static int  standbyWait_us = STANDBY_INITIAL_WAIT_US;
     228              : 
     229              : /*
     230              :  * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs.
     231              :  * We wait here for a while then return. If we decide we can't wait any
     232              :  * more then we return true; if we can wait some more we return false.
     233              :  */
     234              : static bool
     235           15 : WaitExceedsMaxStandbyDelay(uint32 wait_event_info)
     236              : {
     237              :     TimestampTz ltime;
     238              : 
     239           15 :     CHECK_FOR_INTERRUPTS();
     240              : 
     241              :     /* Are we past the limit time? */
     242           15 :     ltime = GetStandbyLimitTime();
     243           15 :     if (ltime && GetCurrentTimestamp() >= ltime)
     244            3 :         return true;
     245              : 
     246              :     /*
     247              :      * Sleep a bit (this is essential to avoid busy-waiting).
     248              :      */
     249           12 :     pgstat_report_wait_start(wait_event_info);
     250           12 :     pg_usleep(standbyWait_us);
     251           12 :     pgstat_report_wait_end();
     252              : 
     253              :     /*
     254              :      * Progressively increase the sleep times, but not to more than 1s, since
     255              :      * pg_usleep isn't interruptible on some platforms.
     256              :      */
     257           12 :     standbyWait_us *= 2;
     258           12 :     if (standbyWait_us > 1000000)
     259            0 :         standbyWait_us = 1000000;
     260              : 
     261           12 :     return false;
     262              : }
     263              : 
     264              : /*
     265              :  * Log the recovery conflict.
     266              :  *
     267              :  * wait_start is the timestamp when the caller started to wait.
     268              :  * now is the timestamp when this function was called.
     269              :  * wait_list is the list of virtual transaction ids assigned to
     270              :  * conflicting processes. still_waiting indicates whether
     271              :  * the startup process is still waiting for the recovery conflict
     272              :  * to be resolved or not.
     273              :  */
     274              : void
     275           10 : LogRecoveryConflict(RecoveryConflictReason reason, TimestampTz wait_start,
     276              :                     TimestampTz now, VirtualTransactionId *wait_list,
     277              :                     bool still_waiting)
     278              : {
     279              :     long        secs;
     280              :     int         usecs;
     281              :     long        msecs;
     282              :     StringInfoData buf;
     283           10 :     int         nprocs = 0;
     284              : 
     285              :     /*
     286              :      * There must be no conflicting processes when the recovery conflict has
     287              :      * already been resolved.
     288              :      */
     289              :     Assert(still_waiting || wait_list == NULL);
     290              : 
     291           10 :     TimestampDifference(wait_start, now, &secs, &usecs);
     292           10 :     msecs = secs * 1000 + usecs / 1000;
     293           10 :     usecs = usecs % 1000;
     294              : 
     295           10 :     if (wait_list)
     296              :     {
     297              :         VirtualTransactionId *vxids;
     298              : 
     299              :         /* Construct a comma-separated string of the conflicting processes' PIDs */
     300            3 :         vxids = wait_list;
     301            6 :         while (VirtualTransactionIdIsValid(*vxids))
     302              :         {
     303            3 :             PGPROC     *proc = ProcNumberGetProc(vxids->procNumber);
     304              : 
     305              :             /* proc can be NULL if the target backend is not active */
     306            3 :             if (proc)
     307              :             {
     308            3 :                 if (nprocs == 0)
     309              :                 {
     310            3 :                     initStringInfo(&buf);
     311            3 :                     appendStringInfo(&buf, "%d", proc->pid);
     312              :                 }
     313              :                 else
     314            0 :                     appendStringInfo(&buf, ", %d", proc->pid);
     315              : 
     316            3 :                 nprocs++;
     317              :             }
     318              : 
     319            3 :             vxids++;
     320              :         }
     321              :     }
     322              : 
     323              :     /*
     324              :      * If wait_list is specified, report the list of PIDs of active
     325              :      * conflicting backends in a detail message. Note that if none of the
     326              :      * backends in the list is active, no detail message is logged.
     327              :      */
     328           10 :     if (still_waiting)
     329              :     {
     330            5 :         ereport(LOG,
     331              :                 errmsg("recovery still waiting after %ld.%03d ms: %s",
     332              :                        msecs, usecs, get_recovery_conflict_desc(reason)),
     333              :                 nprocs > 0 ? errdetail_log_plural("Conflicting process: %s.",
     334              :                                                   "Conflicting processes: %s.",
     335              :                                                   nprocs, buf.data) : 0);
     336              :     }
     337              :     else
     338              :     {
     339            5 :         ereport(LOG,
     340              :                 errmsg("recovery finished waiting after %ld.%03d ms: %s",
     341              :                        msecs, usecs, get_recovery_conflict_desc(reason)));
     342              :     }
     343              : 
     344           10 :     if (nprocs > 0)
     345            3 :         pfree(buf.data);
     346           10 : }
     347              : 
     348              : /*
     349              :  * This is the main executioner for any query backend that conflicts with
     350              :  * recovery processing. Judgement has already been passed on it within
     351              :  * a specific rmgr. Here we just issue the orders to the procs. The procs
     352              :  * then throw the required error as instructed.
     353              :  *
     354              :  * If report_waiting is true, "waiting" is reported in PS display and the
     355              :  * wait for recovery conflict is reported in the log, if necessary. If
     356              :  * the caller is responsible for reporting them, report_waiting should be
     357              :  * false. Otherwise, both the caller and this function report the same
     358              :  * thing unexpectedly.
     359              :  */
     360              : static void
     361        16250 : ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
     362              :                                        RecoveryConflictReason reason,
     363              :                                        uint32 wait_event_info,
     364              :                                        bool report_waiting)
     365              : {
     366        16250 :     TimestampTz waitStart = 0;
     367        16250 :     bool        waiting = false;
     368        16250 :     bool        logged_recovery_conflict = false;
     369              : 
     370              :     /* Fast exit, to avoid a kernel call if there's no work to be done. */
     371        16250 :     if (!VirtualTransactionIdIsValid(*waitlist))
     372        16247 :         return;
     373              : 
     374              :     /* Set the wait start timestamp for reporting */
     375            3 :     if (report_waiting && (log_recovery_conflict_waits || update_process_title))
     376            2 :         waitStart = GetCurrentTimestamp();
     377              : 
     378            6 :     while (VirtualTransactionIdIsValid(*waitlist))
     379              :     {
     380              :         /* reset standbyWait_us for each xact we wait for */
     381            3 :         standbyWait_us = STANDBY_INITIAL_WAIT_US;
     382              : 
     383              :         /* wait until the virtual xid is gone */
     384           18 :         while (!VirtualXactLock(*waitlist, false))
     385              :         {
     386              :             /* Is it time to kill it? */
     387           15 :             if (WaitExceedsMaxStandbyDelay(wait_event_info))
     388              :             {
     389              :                 bool        signaled;
     390              : 
     391              :                 /*
     392              :                  * Now find out who to throw out of the balloon.
     393              :                  */
     394              :                 Assert(VirtualTransactionIdIsValid(*waitlist));
     395            3 :                 signaled = SignalRecoveryConflictWithVirtualXID(*waitlist, reason);
     396              : 
     397              :                 /*
     398              :                  * Wait a little bit for it to die so that we avoid flooding
     399              :                  * an unresponsive backend when the system is heavily loaded.
     400              :                  */
     401            3 :                 if (signaled)
     402            3 :                     pg_usleep(5000L);
     403              :             }
     404              : 
     405           15 :             if (waitStart != 0 && (!logged_recovery_conflict || !waiting))
     406              :             {
     407           14 :                 TimestampTz now = 0;
     408              :                 bool        maybe_log_conflict;
     409              :                 bool        maybe_update_title;
     410              : 
     411           14 :                 maybe_log_conflict = (log_recovery_conflict_waits && !logged_recovery_conflict);
     412           14 :                 maybe_update_title = (update_process_title && !waiting);
     413              : 
     414              :                 /* Get the current timestamp if we have not reported yet */
     415           14 :                 if (maybe_log_conflict || maybe_update_title)
     416           14 :                     now = GetCurrentTimestamp();
     417              : 
     418              :                 /*
     419              :                  * Report via ps if we have been waiting for more than 500
     420              :                  * msec (should that be configurable?)
     421              :                  */
     422           28 :                 if (maybe_update_title &&
     423           14 :                     TimestampDifferenceExceeds(waitStart, now, 500))
     424              :                 {
     425            0 :                     set_ps_display_suffix("waiting");
     426            0 :                     waiting = true;
     427              :                 }
     428              : 
     429              :                 /*
     430              :                  * Emit the log message if the startup process is waiting
     431              :                  * longer than deadlock_timeout for recovery conflict.
     432              :                  */
     433           22 :                 if (maybe_log_conflict &&
     434            8 :                     TimestampDifferenceExceeds(waitStart, now, DeadlockTimeout))
     435              :                 {
     436            2 :                     LogRecoveryConflict(reason, waitStart, now, waitlist, true);
     437            2 :                     logged_recovery_conflict = true;
     438              :                 }
     439              :             }
     440              :         }
     441              : 
     442              :         /* The virtual transaction is gone now, wait for the next one */
     443            3 :         waitlist++;
     444              :     }
     445              : 
     446              :     /*
     447              :      * Emit the log message if recovery conflict was resolved but the startup
     448              :      * process waited longer than deadlock_timeout for it.
     449              :      */
     450            3 :     if (logged_recovery_conflict)
     451            2 :         LogRecoveryConflict(reason, waitStart, GetCurrentTimestamp(),
     452              :                             NULL, false);
     453              : 
     454              :     /* reset ps display to remove the suffix if we added one */
     455            3 :     if (waiting)
     456            0 :         set_ps_display_remove_suffix();
     457              : 
     458              : }
     459              : 
     460              : /*
     461              :  * Generate whatever recovery conflicts are needed to eliminate snapshots that
     462              :  * might see XIDs <= snapshotConflictHorizon as still running.
     463              :  *
     464              :  * snapshotConflictHorizon cutoffs are our standard approach to generating
     465              :  * granular recovery conflicts.  Note that InvalidTransactionId values are
     466              :  * interpreted as "definitely don't need any conflicts" here, which is a
     467              :  * general convention that WAL records can (and often do) depend on.
     468              :  */
     469              : void
     470        17062 : ResolveRecoveryConflictWithSnapshot(TransactionId snapshotConflictHorizon,
     471              :                                     bool isCatalogRel,
     472              :                                     RelFileLocator locator)
     473              : {
     474              :     VirtualTransactionId *backends;
     475              : 
     476              :     /*
     477              :      * If we get passed InvalidTransactionId then we do nothing (no conflict).
     478              :      *
     479              :      * This can happen whenever the changes in the WAL record do not affect
     480              :      * visibility on a standby. For example: a record that only freezes an
     481              :      * xmax from a locker.
     482              :      *
     483              :      * It's also quite common with records generated during index deletion
     484              :      * (original execution of the deletion can reason that a recovery conflict
     485              :      * which is sufficient for the deletion operation must take place before
     486              :      * replay of the deletion record itself).
     487              :      */
     488        17062 :     if (!TransactionIdIsValid(snapshotConflictHorizon))
     489          814 :         return;
     490              : 
     491              :     Assert(TransactionIdIsNormal(snapshotConflictHorizon));
     492        16248 :     backends = GetConflictingVirtualXIDs(snapshotConflictHorizon,
     493              :                                          locator.dbOid);
     494        16248 :     ResolveRecoveryConflictWithVirtualXIDs(backends,
     495              :                                            RECOVERY_CONFLICT_SNAPSHOT,
     496              :                                            WAIT_EVENT_RECOVERY_CONFLICT_SNAPSHOT,
     497              :                                            true);
     498              : 
     499              :     /*
     500              :      * Note that WaitExceedsMaxStandbyDelay() is not taken into account here
     501              :      * (as opposed to ResolveRecoveryConflictWithVirtualXIDs() above). That
     502              :      * seems OK, given that this kind of conflict should not normally be
     503              :      * reached, e.g. due to using a physical replication slot.
     504              :      */
     505        16248 :     if (IsLogicalDecodingEnabled() && isCatalogRel)
     506           15 :         InvalidateObsoleteReplicationSlots(RS_INVAL_HORIZON, 0, locator.dbOid,
     507              :                                            snapshotConflictHorizon);
     508              : }
     509              : 
     510              : /*
     511              :  * Variant of ResolveRecoveryConflictWithSnapshot that works with
     512              :  * FullTransactionId values
     513              :  */
     514              : void
     515           53 : ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId snapshotConflictHorizon,
     516              :                                            bool isCatalogRel,
     517              :                                            RelFileLocator locator)
     518              : {
     519              :     /*
     520              :      * ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds,
     521              :      * so truncate the logged FullTransactionId.  If the logged value is very
     522              :      * old, so that XID wrap-around already happened on it, there can't be any
     523              :      * snapshots that still see it.
     524              :      */
     525           53 :     FullTransactionId nextXid = ReadNextFullTransactionId();
     526              :     uint64      diff;
     527              : 
     528           53 :     diff = U64FromFullTransactionId(nextXid) -
     529           53 :         U64FromFullTransactionId(snapshotConflictHorizon);
     530           53 :     if (diff < MaxTransactionId / 2)
     531              :     {
     532              :         TransactionId truncated;
     533              : 
     534           53 :         truncated = XidFromFullTransactionId(snapshotConflictHorizon);
     535           53 :         ResolveRecoveryConflictWithSnapshot(truncated,
     536              :                                             isCatalogRel,
     537              :                                             locator);
     538              :     }
     539           53 : }
     540              : 
     541              : void
     542            1 : ResolveRecoveryConflictWithTablespace(Oid tsid)
     543              : {
     544              :     VirtualTransactionId *temp_file_users;
     545              : 
     546              :     /*
     547              :      * Standby users may be currently using this tablespace for their
     548              :      * temporary files. We only care about current users because
     549              :      * temp_tablespaces parameter will just ignore tablespaces that no longer
     550              :      * exist.
     551              :      *
     552              :      * Ask everybody to cancel their queries immediately so we can ensure no
     553              :      * temp files remain and we can remove the tablespace. Nuke the entire
     554              :      * site from orbit, it's the only way to be sure.
     555              :      *
     556              :      * XXX: We could work out the pids of active backends using this
     557              :      * tablespace by examining the temp filenames in the directory. We would
     558              :      * then convert the pids into VirtualXIDs before attempting to cancel
     559              :      * them.
     560              :      *
     561              :      * We don't wait for commit because drop tablespace is non-transactional.
     562              :      */
     563            1 :     temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId,
     564              :                                                 InvalidOid);
     565            1 :     ResolveRecoveryConflictWithVirtualXIDs(temp_file_users,
     566              :                                            RECOVERY_CONFLICT_TABLESPACE,
     567              :                                            WAIT_EVENT_RECOVERY_CONFLICT_TABLESPACE,
     568              :                                            true);
     569            1 : }
     570              : 
     571              : void
     572           16 : ResolveRecoveryConflictWithDatabase(Oid dbid)
     573              : {
     574              :     /*
     575              :      * We don't do ResolveRecoveryConflictWithVirtualXIDs() here since that
     576              :      * only waits for transactions and completely idle sessions would block
     577              :      * us. This is rare enough that we do this as simply as possible: no wait,
     578              :      * just force them off immediately.
     579              :      *
     580              :      * No locking is required here because we already acquired
     581              :      * AccessExclusiveLock. Anybody trying to connect while we do this will
     582              :      * block during InitPostgres() and then disconnect when they see the
     583              :      * database has been removed.
     584              :      */
     585           18 :     while (CountDBBackends(dbid) > 0)
     586              :     {
     587            2 :         SignalRecoveryConflictWithDatabase(dbid, RECOVERY_CONFLICT_DATABASE);
     588              : 
     589              :         /*
     590              :          * Wait awhile for them to die so that we avoid flooding an
     591              :          * unresponsive backend when system is heavily loaded.
     592              :          */
     593            2 :         pg_usleep(10000);
     594              :     }
     595           16 : }
     596              : 
     597              : /*
     598              :  * ResolveRecoveryConflictWithLock is called from ProcSleep()
     599              :  * to resolve conflicts with other backends holding relation locks.
     600              :  *
     601              :  * The WaitLatch sleep normally done in ProcSleep()
     602              :  * (when not InHotStandby) is performed here, for code clarity.
     603              :  *
     604              :  * We either resolve conflicts immediately or set a timeout to wake us at
     605              :  * the limit of our patience.
     606              :  *
     607              :  * Resolve conflicts by canceling all backends holding a conflicting
     608              :  * lock.  As we are already queued to be granted the lock, no new lock
     609              :  * requests conflicting with ours will be granted in the meantime.
     610              :  *
     611              :  * We also must check for deadlocks involving the Startup process and
     612              :  * hot-standby backend processes. If deadlock_timeout is reached in
     613              :  * this function, all the backends holding the conflicting locks are
     614              :  * requested to check themselves for deadlocks.
     615              :  *
     616              :  * logging_conflict should be true if the recovery conflict has not been
     617              :  * logged yet even though logging is enabled. After deadlock_timeout is
     618              :  * reached and the request for deadlock check is sent, we wait again to
     619              :  * be signaled by the release of the lock if logging_conflict is false.
     620              :  * Otherwise we return without waiting again so that the caller can report
     621              :  * the recovery conflict. In this case, then, this function is called again
     622              :  * with logging_conflict=false (because the recovery conflict has already
     623              :  * been logged) and we will wait again for the lock to be released.
     624              :  */
     625              : void
     626            3 : ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict)
     627              : {
     628              :     TimestampTz ltime;
     629              :     TimestampTz now;
     630              : 
     631              :     Assert(InHotStandby);
     632              : 
     633            3 :     ltime = GetStandbyLimitTime();
     634            3 :     now = GetCurrentTimestamp();
     635              : 
     636              :     /*
     637              :      * Update waitStart if first time through after the startup process
     638              :      * started waiting for the lock. It should not be updated every time
     639              :      * ResolveRecoveryConflictWithLock() is called during the wait.
     640              :      *
     641              :      * Use the current time obtained for comparison with ltime as waitStart
     642              :      * (i.e., the time when this process started waiting for the lock). Since
     643              :      * getting the current time newly can cause overhead, we reuse the
     644              :      * already-obtained time to avoid that overhead.
     645              :      *
     646              :      * Note that waitStart is updated without holding the lock table's
     647              :      * partition lock, to avoid the overhead by additional lock acquisition.
     648              :      * This can cause "waitstart" in pg_locks to become NULL for a very short
     649              :      * period of time after the wait started even though "granted" is false.
     650              :      * This is OK in practice because we can assume that users are likely to
     651              :      * look at "waitstart" when waiting for the lock for a long time.
     652              :      */
     653            3 :     if (pg_atomic_read_u64(&MyProc->waitStart) == 0)
     654            1 :         pg_atomic_write_u64(&MyProc->waitStart, now);
     655              : 
     656            3 :     if (now >= ltime && ltime != 0)
     657            1 :     {
     658              :         /*
     659              :          * We're already behind, so clear a path as quickly as possible.
     660              :          */
     661              :         VirtualTransactionId *backends;
     662              : 
     663            1 :         backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
     664              : 
     665              :         /*
     666              :          * Prevent ResolveRecoveryConflictWithVirtualXIDs() from reporting
     667              :          * "waiting" in PS display by disabling its argument report_waiting
     668              :          * because the caller, WaitOnLock(), has already reported that.
     669              :          */
     670            1 :         ResolveRecoveryConflictWithVirtualXIDs(backends,
     671              :                                                RECOVERY_CONFLICT_LOCK,
     672            1 :                                                PG_WAIT_LOCK | locktag.locktag_type,
     673              :                                                false);
     674              :     }
     675              :     else
     676              :     {
     677              :         /*
     678              :          * Wait (or wait again) until ltime, and check for deadlocks as well
     679              :          * if we will be waiting longer than deadlock_timeout
     680              :          */
     681              :         EnableTimeoutParams timeouts[2];
     682            2 :         int         cnt = 0;
     683              : 
     684            2 :         if (ltime != 0)
     685              :         {
     686            2 :             got_standby_lock_timeout = false;
     687            2 :             timeouts[cnt].id = STANDBY_LOCK_TIMEOUT;
     688            2 :             timeouts[cnt].type = TMPARAM_AT;
     689            2 :             timeouts[cnt].fin_time = ltime;
     690            2 :             cnt++;
     691              :         }
     692              : 
     693            2 :         got_standby_deadlock_timeout = false;
     694            2 :         timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
     695            2 :         timeouts[cnt].type = TMPARAM_AFTER;
     696            2 :         timeouts[cnt].delay_ms = DeadlockTimeout;
     697            2 :         cnt++;
     698              : 
     699            2 :         enable_timeouts(timeouts, cnt);
     700              :     }
     701              : 
     702              :     /* Wait to be signaled by the release of the Relation Lock */
     703            3 :     ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
     704              : 
     705              :     /*
     706              :      * Exit if ltime is reached. Then all the backends holding conflicting
     707              :      * locks will be canceled in the next ResolveRecoveryConflictWithLock()
     708              :      * call.
     709              :      */
     710            3 :     if (got_standby_lock_timeout)
     711            0 :         goto cleanup;
     712              : 
     713            3 :     if (got_standby_deadlock_timeout)
     714              :     {
     715              :         VirtualTransactionId *backends;
     716              : 
     717            2 :         backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
     718              : 
     719              :         /* Quick exit if there's no work to be done */
     720            2 :         if (!VirtualTransactionIdIsValid(*backends))
     721            0 :             goto cleanup;
     722              : 
     723              :         /*
     724              :          * Send signals to all the backends holding the conflicting locks, to
     725              :          * ask them to check themselves for deadlocks.
     726              :          */
     727            4 :         while (VirtualTransactionIdIsValid(*backends))
     728              :         {
     729            2 :             (void) SignalRecoveryConflictWithVirtualXID(*backends,
     730              :                                                         RECOVERY_CONFLICT_STARTUP_DEADLOCK);
     731            2 :             backends++;
     732              :         }
     733              : 
     734              :         /*
     735              :          * Exit if the recovery conflict has not been logged yet even though
     736              :          * logging is enabled, so that the caller can log that. Then
     737              :          * ResolveRecoveryConflictWithLock() is called again and we will wait
     738              :          * again for the lock to be released.
     739              :          */
     740            2 :         if (logging_conflict)
     741            1 :             goto cleanup;
     742              : 
     743              :         /*
     744              :          * Wait again here to be signaled by the release of the Relation Lock,
     745              :          * to prevent the next ResolveRecoveryConflictWithLock() from causing
     746              :          * deadlock_timeout and sending a request for deadlocks check again.
     747              :          * Otherwise the request continues to be sent every deadlock_timeout
     748              :          * until the relation locks are released or ltime is reached.
     749              :          */
     750            1 :         got_standby_deadlock_timeout = false;
     751            1 :         ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
     752              :     }
     753              : 
     754            1 : cleanup:
     755              : 
     756              :     /*
     757              :      * Clear any timeout requests established above.  We assume here that the
     758              :      * Startup process doesn't have any other outstanding timeouts than those
     759              :      * used by this function. If that stops being true, we could cancel the
     760              :      * timeouts individually, but that'd be slower.
     761              :      */
     762            3 :     disable_all_timeouts(false);
     763            3 :     got_standby_lock_timeout = false;
     764            3 :     got_standby_deadlock_timeout = false;
     765            3 : }
     766              : 
     767              : /*
     768              :  * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup()
     769              :  * to resolve conflicts with other backends holding buffer pins.
     770              :  *
     771              :  * The ProcWaitForSignal() sleep normally done in LockBufferForCleanup()
     772              :  * (when not InHotStandby) is performed here, for code clarity.
     773              :  *
     774              :  * We either resolve conflicts immediately or set a timeout to wake us at
     775              :  * the limit of our patience.
     776              :  *
     777              :  * Resolve conflicts by sending a PROCSIG signal to all backends to check if
     778              :  * they hold one of the buffer pins that is blocking Startup process. If so,
     779              :  * those backends will take an appropriate error action, ERROR or FATAL.
     780              :  *
     781              :  * We also must check for deadlocks.  Deadlocks occur because if queries
     782              :  * wait on a lock, that must be behind an AccessExclusiveLock, which can only
     783              :  * be cleared if the Startup process replays a transaction completion record.
     784              :  * If Startup process is also waiting then that is a deadlock. The deadlock
     785              :  * can occur if the query is waiting and then the Startup sleeps, or if
     786              :  * Startup is sleeping and the query waits on a lock. We protect against
     787              :  * only the former sequence here, the latter sequence is checked prior to
     788              :  * the query sleeping, in CheckRecoveryConflictDeadlock().
     789              :  *
     790              :  * Deadlocks are extremely rare, and relatively expensive to check for,
     791              :  * so we don't do a deadlock check right away ... only if we have had to wait
     792              :  * at least deadlock_timeout.
     793              :  */
     794              : void
     795           10 : ResolveRecoveryConflictWithBufferPin(void)
     796              : {
     797              :     TimestampTz ltime;
     798              : 
     799              :     Assert(InHotStandby);
     800              : 
     801           10 :     ltime = GetStandbyLimitTime();
     802              : 
     803           10 :     if (GetCurrentTimestamp() >= ltime && ltime != 0)
     804              :     {
     805              :         /*
     806              :          * We're already behind, so clear a path as quickly as possible.
     807              :          */
     808            1 :         SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN);
     809              :     }
     810              :     else
     811              :     {
     812              :         /*
     813              :          * Wake up at ltime, and check for deadlocks as well if we will be
     814              :          * waiting longer than deadlock_timeout
     815              :          */
     816              :         EnableTimeoutParams timeouts[2];
     817            9 :         int         cnt = 0;
     818              : 
     819            9 :         if (ltime != 0)
     820              :         {
     821            9 :             timeouts[cnt].id = STANDBY_TIMEOUT;
     822            9 :             timeouts[cnt].type = TMPARAM_AT;
     823            9 :             timeouts[cnt].fin_time = ltime;
     824            9 :             cnt++;
     825              :         }
     826              : 
     827            9 :         got_standby_deadlock_timeout = false;
     828            9 :         timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
     829            9 :         timeouts[cnt].type = TMPARAM_AFTER;
     830            9 :         timeouts[cnt].delay_ms = DeadlockTimeout;
     831            9 :         cnt++;
     832              : 
     833            9 :         enable_timeouts(timeouts, cnt);
     834              :     }
     835              : 
     836              :     /*
     837              :      * Wait to be signaled by UnpinBuffer() or for the wait to be interrupted
     838              :      * by one of the timeouts established above.
     839              :      *
     840              :      * We assume that only UnpinBuffer() and the timeout requests established
     841              :      * above can wake us up here. WakeupRecovery() called by walreceiver or
     842              :      * SIGHUP signal handler, etc. cannot do that because it uses a different
     843              :      * latch from the one that ProcWaitForSignal() waits on.
     844              :      */
     845           10 :     ProcWaitForSignal(WAIT_EVENT_BUFFER_CLEANUP);
     846              : 
     847           10 :     if (got_standby_delay_timeout)
     848            1 :         SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN);
     849            9 :     else if (got_standby_deadlock_timeout)
     850              :     {
     851              :         /*
     852              :          * Send out a request for hot-standby backends to check themselves for
     853              :          * deadlocks.
     854              :          *
     855              :          * XXX The subsequent ResolveRecoveryConflictWithBufferPin() will wait
     856              :          * to be signaled by UnpinBuffer() again and send a request for
     857              :          * deadlocks check if deadlock_timeout happens. This causes the
     858              :          * request to continue to be sent every deadlock_timeout until the
     859              :          * buffer is unpinned or ltime is reached. This would increase the
     860              :          * workload in the startup process and backends. In practice it may
     861              :          * not be so harmful because the period that the buffer is kept pinned
     862              :          * is basically not so long. But we should fix this?
     863              :          */
     864            6 :         SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK);
     865              :     }
     866              : 
     867              :     /*
     868              :      * Clear any timeout requests established above.  We assume here that the
     869              :      * Startup process doesn't have any other timeouts than what this function
     870              :      * uses.  If that stops being true, we could cancel the timeouts
     871              :      * individually, but that'd be slower.
     872              :      */
     873           10 :     disable_all_timeouts(false);
     874           10 :     got_standby_delay_timeout = false;
     875           10 :     got_standby_deadlock_timeout = false;
     876           10 : }
     877              : 
     878              : static void
     879            8 : SendRecoveryConflictWithBufferPin(RecoveryConflictReason reason)
     880              : {
     881              :     Assert(reason == RECOVERY_CONFLICT_BUFFERPIN ||
     882              :            reason == RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK);
     883              : 
     884              :     /*
     885              :      * We send signal to all backends to ask them if they are holding the
     886              :      * buffer pin which is delaying the Startup process. Most of them will be
     887              :      * innocent, but we let the SIGUSR1 handling in each backend decide their
     888              :      * own fate.
     889              :      */
     890            8 :     SignalRecoveryConflictWithDatabase(InvalidOid, reason);
     891            8 : }
     892              : 
     893              : /*
     894              :  * In Hot Standby perform early deadlock detection.  We abort the lock
     895              :  * wait if we are about to sleep while holding the buffer pin that Startup
     896              :  * process is waiting for.
     897              :  *
     898              :  * Note: this code is pessimistic, because there is no way for it to
     899              :  * determine whether an actual deadlock condition is present: the lock we
     900              :  * need to wait for might be unrelated to any held by the Startup process.
     901              :  * Sooner or later, this mechanism should get ripped out in favor of somehow
     902              :  * accounting for buffer locks in DeadLockCheck().  However, errors here
     903              :  * seem to be very low-probability in practice, so for now it's not worth
     904              :  * the trouble.
     905              :  */
     906              : void
     907            1 : CheckRecoveryConflictDeadlock(void)
     908              : {
     909              :     Assert(!InRecovery);        /* do not call in Startup process */
     910              : 
     911            1 :     if (!HoldingBufferPinThatDelaysRecovery())
     912            1 :         return;
     913              : 
     914              :     /*
     915              :      * Error message should match ProcessInterrupts() but we avoid calling
     916              :      * that because we aren't handling an interrupt at this point. Note that
     917              :      * we only cancel the current transaction here, so if we are in a
     918              :      * subtransaction and the pin is held by a parent, then the Startup
     919              :      * process will continue to wait even though we have avoided deadlock.
     920              :      */
     921            0 :     ereport(ERROR,
     922              :             (errcode(ERRCODE_T_R_DEADLOCK_DETECTED),
     923              :              errmsg("canceling statement due to conflict with recovery"),
     924              :              errdetail("User transaction caused buffer deadlock with recovery.")));
     925              : }
     926              : 
     927              : 
     928              : /* --------------------------------
     929              :  *      timeout handler routines
     930              :  * --------------------------------
     931              :  */
     932              : 
     933              : /*
     934              :  * StandbyDeadLockHandler() will be called if STANDBY_DEADLOCK_TIMEOUT is
     935              :  * exceeded.
     936              :  */
     937              : void
     938            9 : StandbyDeadLockHandler(void)
     939              : {
     940            9 :     got_standby_deadlock_timeout = true;
     941            9 : }
     942              : 
     943              : /*
     944              :  * StandbyTimeoutHandler() will be called if STANDBY_TIMEOUT is exceeded.
     945              :  */
     946              : void
     947            1 : StandbyTimeoutHandler(void)
     948              : {
     949            1 :     got_standby_delay_timeout = true;
     950            1 : }
     951              : 
     952              : /*
     953              :  * StandbyLockTimeoutHandler() will be called if STANDBY_LOCK_TIMEOUT is exceeded.
     954              :  */
     955              : void
     956            1 : StandbyLockTimeoutHandler(void)
     957              : {
     958            1 :     got_standby_lock_timeout = true;
     959            1 : }
     960              : 
     961              : /*
     962              :  * -----------------------------------------------------
     963              :  * Locking in Recovery Mode
     964              :  * -----------------------------------------------------
     965              :  *
     966              :  * All locks are held by the Startup process using a single virtual
     967              :  * transaction. This implementation is both simpler and in some senses,
     968              :  * more correct. The locks held mean "some original transaction held
     969              :  * this lock, so query access is not allowed at this time". So the Startup
     970              :  * process is the proxy by which the original locks are implemented.
     971              :  *
     972              :  * We only keep track of AccessExclusiveLocks, which are only ever held by
     973              :  * one transaction on one relation.
     974              :  *
     975              :  * We keep a table of known locks in the RecoveryLockHash hash table.
     976              :  * The point of that table is to let us efficiently de-duplicate locks,
     977              :  * which is important because checkpoints will re-report the same locks
     978              :  * already held.  There is also a RecoveryLockXidHash table with one entry
     979              :  * per xid, which allows us to efficiently find all the locks held by a
     980              :  * given original transaction.
     981              :  *
     982              :  * We use session locks rather than normal locks so we don't need
     983              :  * ResourceOwners.
     984              :  */
     985              : 
     986              : 
     987              : void
     988        28659 : StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
     989              : {
     990              :     RecoveryLockXidEntry *xidentry;
     991              :     RecoveryLockEntry *lockentry;
     992              :     xl_standby_lock key;
     993              :     LOCKTAG     locktag;
     994              :     bool        found;
     995              : 
     996              :     /* Already processed? */
     997        57318 :     if (!TransactionIdIsValid(xid) ||
     998        57281 :         TransactionIdDidCommit(xid) ||
     999        28622 :         TransactionIdDidAbort(xid))
    1000           41 :         return;
    1001              : 
    1002        28618 :     elog(DEBUG4, "adding recovery lock: db %u rel %u", dbOid, relOid);
    1003              : 
    1004              :     /* dbOid is InvalidOid when we are locking a shared relation. */
    1005              :     Assert(OidIsValid(relOid));
    1006              : 
    1007              :     /* Create a hash entry for this xid, if we don't have one already. */
    1008        28618 :     xidentry = hash_search(RecoveryLockXidHash, &xid, HASH_ENTER, &found);
    1009        28618 :     if (!found)
    1010              :     {
    1011              :         Assert(xidentry->xid == xid);    /* dynahash should have set this */
    1012        11556 :         xidentry->head = NULL;
    1013              :     }
    1014              : 
    1015              :     /* Create a hash entry for this lock, unless we have one already. */
    1016        28618 :     key.xid = xid;
    1017        28618 :     key.dbOid = dbOid;
    1018        28618 :     key.relOid = relOid;
    1019        28618 :     lockentry = hash_search(RecoveryLockHash, &key, HASH_ENTER, &found);
    1020        28618 :     if (!found)
    1021              :     {
    1022              :         /* It's new, so link it into the XID's list ... */
    1023        27250 :         lockentry->next = xidentry->head;
    1024        27250 :         xidentry->head = lockentry;
    1025              : 
    1026              :         /* ... and acquire the lock locally. */
    1027        27250 :         SET_LOCKTAG_RELATION(locktag, dbOid, relOid);
    1028              : 
    1029        27250 :         (void) LockAcquire(&locktag, AccessExclusiveLock, true, false);
    1030              :     }
    1031              : }
    1032              : 
    1033              : /*
    1034              :  * Release all the locks associated with this RecoveryLockXidEntry.
                      :  *
                      :  * Walks the singly linked list of RecoveryLockEntry items that starts at
                      :  * xidentry->head.  For each item the AccessExclusiveLock on the relation
                      :  * named by the item's key is released with LockRelease(), and the item
                      :  * is then removed from RecoveryLockHash.  The successor pointer is
                      :  * fetched before the removal, so the loop does not read an item after
                      :  * it has been taken out of the hash.
                      :  *
                      :  * A failed LockRelease() is reported at LOG level and trips an
                      :  * assertion; the hash entry is removed regardless.
                      :  *
                      :  * The xidentry itself is not removed here, only its head pointer is
                      :  * reset.  The callers visible below remove it from RecoveryLockXidHash
                      :  * right after this function returns.
    1035              :  */
    1036              : static void
    1037        11556 : StandbyReleaseXidEntryLocks(RecoveryLockXidEntry *xidentry)
    1038              : {
    1039              :     RecoveryLockEntry *entry;
    1040              :     RecoveryLockEntry *next;
    1041              : 
    1042        38806 :     for (entry = xidentry->head; entry != NULL; entry = next)
    1043              :     {
    1044              :         LOCKTAG     locktag;
    1045              : 
    1046        27250 :         elog(DEBUG4,
    1047              :              "releasing recovery lock: xid %u db %u rel %u",
    1048              :              entry->key.xid, entry->key.dbOid, entry->key.relOid);
    1049              :         /* Release the lock ... */
    1050        27250 :         SET_LOCKTAG_RELATION(locktag, entry->key.dbOid, entry->key.relOid);
    1051        27250 :         if (!LockRelease(&locktag, AccessExclusiveLock, true))
    1052              :         {
    1053            0 :             elog(LOG,
    1054              :                  "RecoveryLockHash contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
    1055              :                  entry->key.xid, entry->key.dbOid, entry->key.relOid);
    1056              :             Assert(false);
    1057              :         }
    1058              :         /* ... and remove the per-lock hash entry */
    1059        27250 :         next = entry->next;
    1060        27250 :         hash_search(RecoveryLockHash, entry, HASH_REMOVE, NULL);
    1061              :     }
    1062              : 
    1063        11556 :     xidentry->head = NULL;       /* just for paranoia */
    1064        11556 : }
    1065              : 
    1066              : /*
    1067              :  * Release locks for specific XID, or all locks if it's InvalidXid.
                      :  *
                      :  * For a valid xid the matching RecoveryLockXidHash entry is looked up.
                      :  * If there is none, nothing happens and no error is raised.  If there
                      :  * is one, its locks are released through StandbyReleaseXidEntryLocks()
                      :  * and the entry itself is then removed from RecoveryLockXidHash.
                      :  *
                      :  * An invalid xid is forwarded to StandbyReleaseAllLocks().
    1068              :  */
    1069              : static void
    1070        12235 : StandbyReleaseLocks(TransactionId xid)
    1071              : {
    1072              :     RecoveryLockXidEntry *entry;
    1073              : 
    1074        12235 :     if (TransactionIdIsValid(xid))
    1075              :     {
    1076        12235 :         if ((entry = hash_search(RecoveryLockXidHash, &xid, HASH_FIND, NULL)))
    1077              :         {
    1078        11556 :             StandbyReleaseXidEntryLocks(entry);
    1079        11556 :             hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
    1080              :         }
    1081              :     }
    1082              :     else
    1083            0 :         StandbyReleaseAllLocks();
    1084        12235 : }
    1085              : 
    1086              : /*
    1087              :  * Release locks for a transaction tree, starting at xid down, from
    1088              :  * RecoveryLockXidHash.
    1089              :  *
    1090              :  * Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode,
    1091              :  * to remove any AccessExclusiveLocks requested by a transaction.
                      :  *
                      :  * The top-level xid is handled first, followed by the first nsubxids
                      :  * elements of subxids[] in array order.  Each xid goes through
                      :  * StandbyReleaseLocks() separately, so an xid without recorded locks
                      :  * is simply skipped.  subxids is not dereferenced when nsubxids is
                      :  * zero or negative.
    1092              :  */
    1093              : void
    1094        11735 : StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids)
    1095              : {
    1096              :     int         i;
    1097              : 
    1098        11735 :     StandbyReleaseLocks(xid);
    1099              : 
    1100        12235 :     for (i = 0; i < nsubxids; i++)
    1101          500 :         StandbyReleaseLocks(subxids[i]);
    1102        11735 : }
    1103              : 
    1104              : /*
    1105              :  * Called at end of recovery and when we see a shutdown checkpoint.
                      :  *
                      :  * Emits a DEBUG2 message, then scans RecoveryLockXidHash sequentially.
                      :  * For every entry found, the locks are released through
                      :  * StandbyReleaseXidEntryLocks() and the entry is removed from the hash
                      :  * while the scan is still in progress.
                      :  *
                      :  * NOTE(review): this relies on the hash implementation permitting
                      :  * removal of the element most recently returned by hash_seq_search()
                      :  * during a scan -- confirm against the dynahash documentation.
    1106              :  */
    1107              : void
    1108          116 : StandbyReleaseAllLocks(void)
    1109              : {
    1110              :     HASH_SEQ_STATUS status;
    1111              :     RecoveryLockXidEntry *entry;
    1112              : 
    1113          116 :     elog(DEBUG2, "release all standby locks");
    1114              : 
    1115          116 :     hash_seq_init(&status, RecoveryLockXidHash);
    1116          116 :     while ((entry = hash_seq_search(&status)))
    1117              :     {
    1118            0 :         StandbyReleaseXidEntryLocks(entry);
    1119            0 :         hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
    1120              :     }
    1121          116 : }
    1122              : 
    1123              : /*
    1124              :  * StandbyReleaseOldLocks
    1125              :  *      Release standby locks held by top-level XIDs that aren't running,
    1126              :  *      as long as they're not prepared transactions.
    1127              :  *
    1128              :  * This is needed to prune the locks of crashed transactions, which didn't
    1129              :  * write an ABORT/COMMIT record.
                      :  *
                      :  * Every entry of RecoveryLockXidHash is visited.  An entry is released
                      :  * and removed only if both of the following hold:
                      :  *  * StandbyTransactionIdIsPrepared() reports false for its xid
                      :  *  * its xid precedes oldxid according to TransactionIdPrecedes()
                      :  * All other entries are left untouched.  The prepared-transaction test
                      :  * runs first, so it is evaluated for every entry regardless of its xid.
    1130              :  */
    1131              : void
    1132          832 : StandbyReleaseOldLocks(TransactionId oldxid)
    1133              : {
    1134              :     HASH_SEQ_STATUS status;
    1135              :     RecoveryLockXidEntry *entry;
    1136              : 
    1137          832 :     hash_seq_init(&status, RecoveryLockXidHash);
    1138         1123 :     while ((entry = hash_seq_search(&status)))
    1139              :     {
    1140              :         Assert(TransactionIdIsValid(entry->xid));
    1141              : 
    1142              :         /* Skip if prepared transaction. */
    1143          291 :         if (StandbyTransactionIdIsPrepared(entry->xid))
    1144            0 :             continue;
    1145              : 
    1146              :         /* Skip if >= oldxid. */
    1147          291 :         if (!TransactionIdPrecedes(entry->xid, oldxid))
    1148          291 :             continue;
    1149              : 
    1150              :         /* Remove all locks and hash table entry. */
    1151            0 :         StandbyReleaseXidEntryLocks(entry);
    1152            0 :         hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
    1153              :     }
    1154          832 : }
    1155              : 
    1156              : /*
    1157              :  * --------------------------------------------------------------------
    1158              :  *      Recovery handling for Rmgr RM_STANDBY_ID
    1159              :  *
    1160              :  * These record types will only be created if XLogStandbyInfoActive()
                      :  *
                      :  * standby_redo() returns without doing anything when standbyState is
                      :  * STANDBY_DISABLED.  Otherwise it dispatches on the info bits of the
                      :  * record, with the XLR_INFO_MASK bits masked off:
                      :  *  * XLOG_STANDBY_LOCK: acquire every lock listed in the record via
                      :  *    StandbyAcquireAccessExclusiveLock()
                      :  *  * XLOG_RUNNING_XACTS: copy the record fields into a
                      :  *    RunningTransactionsData, pass it to ProcArrayApplyRecoveryInfo(),
                      :  *    then report stats
                      :  *  * XLOG_INVALIDATIONS: hand the logged messages to
                      :  *    ProcessCommittedInvalidationMessages()
                      :  * Any other info value raises PANIC.
    1161              :  * --------------------------------------------------------------------
    1162              :  */
    1163              : 
    1164              : void
    1165        29033 : standby_redo(XLogReaderState *record)
    1166              : {
    1167        29033 :     uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    1168              : 
    1169              :     /* Backup blocks are not used in standby records */
    1170              :     Assert(!XLogRecHasAnyBlockRefs(record));
    1171              : 
    1172              :     /* Do nothing if we're not in hot standby mode */
    1173        29033 :     if (standbyState == STANDBY_DISABLED)
    1174          162 :         return;
    1175              : 
    1176        28871 :     if (info == XLOG_STANDBY_LOCK)
    1177              :     {
    1178        27415 :         xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record);
    1179              :         int         i;
    1180              : 
    1181        56074 :         for (i = 0; i < xlrec->nlocks; i++)
    1182        28659 :             StandbyAcquireAccessExclusiveLock(xlrec->locks[i].xid,
    1183              :                                               xlrec->locks[i].dbOid,
    1184              :                                               xlrec->locks[i].relOid);
    1185              :     }
    1186         1456 :     else if (info == XLOG_RUNNING_XACTS)
    1187              :     {
    1188          768 :         xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record);
    1189              :         RunningTransactionsData running;
    1190              : 
    1191          768 :         running.xcnt = xlrec->xcnt;
    1192          768 :         running.subxcnt = xlrec->subxcnt;
    1193          768 :         running.subxid_status = xlrec->subxid_overflow ? SUBXIDS_MISSING : SUBXIDS_IN_ARRAY;
    1194          768 :         running.nextXid = xlrec->nextXid;
    1195          768 :         running.latestCompletedXid = xlrec->latestCompletedXid;
    1196          768 :         running.oldestRunningXid = xlrec->oldestRunningXid;
    1197          768 :         running.xids = xlrec->xids;
    1198              : 
    1199          768 :         ProcArrayApplyRecoveryInfo(&running);
    1200              : 
    1201              :         /*
    1202              :          * The startup process currently has no convenient way to schedule
    1203              :          * stats to be reported. XLOG_RUNNING_XACTS records are issued at a
    1204              :          * regular cadence, making this a convenient location to report stats.
    1205              :          * While these records aren't generated with wal_level=minimal, stats
    1206              :          * also cannot be accessed during WAL replay.
    1207              :          */
    1208          768 :         pgstat_report_stat(true);
    1209              :     }
    1210          688 :     else if (info == XLOG_INVALIDATIONS)
    1211              :     {
    1212          688 :         xl_invalidations *xlrec = (xl_invalidations *) XLogRecGetData(record);
    1213              : 
    1214          688 :         ProcessCommittedInvalidationMessages(xlrec->msgs,
    1215              :                                              xlrec->nmsgs,
    1216          688 :                                              xlrec->relcacheInitFileInval,
    1217              :                                              xlrec->dbId,
    1218              :                                              xlrec->tsId);
    1219              :     }
    1220              :     else
    1221            0 :         elog(PANIC, "standby_redo: unknown op code %u", info);
    1222              : }
    1223              : 
    1224              : /*
    1225              :  * Log details of the current snapshot to WAL. This allows the snapshot state
    1226              :  * to be reconstructed on the standby and for logical decoding.
    1227              :  *
    1228              :  * This is used for Hot Standby as follows:
    1229              :  *
    1230              :  * We can move directly to STANDBY_SNAPSHOT_READY at startup if we
    1231              :  * start from a shutdown checkpoint because we know nothing was running
    1232              :  * at that time and our recovery snapshot is known empty. In the more
    1233              :  * typical case of an online checkpoint we need to jump through a few
    1234              :  * hoops to get a correct recovery snapshot and this requires a two or
    1235              :  * sometimes a three stage process.
    1236              :  *
    1237              :  * The initial snapshot must contain all running xids and all current
    1238              :  * AccessExclusiveLocks at a point in time on the standby. Assembling
    1239              :  * that information while the server is running requires many and
    1240              :  * various LWLocks, so we choose to derive that information piece by
    1241              :  * piece and then re-assemble that info on the standby. When that
    1242              :  * information is fully assembled we move to STANDBY_SNAPSHOT_READY.
    1243              :  *
    1244              :  * Since locking on the primary when we derive the information is not
    1245              :  * strict, we note that there is a time window between the derivation and
    1246              :  * writing to WAL of the derived information. That allows race conditions
    1247              :  * that we must resolve, since xids and locks may enter or leave the
    1248              :  * snapshot during that window. This creates the issue that an xid or
    1249              :  * lock may start *after* the snapshot has been derived yet *before* the
    1250              :  * snapshot is logged in the running xacts WAL record. We resolve this by
    1251              :  * starting to accumulate changes at a point just prior to when we derive
    1252              :  * the snapshot on the primary, then ignore duplicates when we later apply
    1253              :  * the snapshot from the running xacts record. This is implemented during
    1254              :  * CreateCheckPoint() where we use the logical checkpoint location as
    1255              :  * our starting point and then write the running xacts record immediately
    1256              :  * before writing the main checkpoint WAL record. Since we always start
    1257              :  * up from a checkpoint and are immediately at our starting point, we
    1258              :  * unconditionally move to STANDBY_INITIALIZED. After this point we
    1259              :  * must do 4 things:
    1260              :  *  * move shared nextXid forwards as we see new xids
    1261              :  *  * extend the clog and subtrans with each new xid
    1262              :  *  * keep track of uncommitted known assigned xids
    1263              :  *  * keep track of uncommitted AccessExclusiveLocks
    1264              :  *
    1265              :  * When we see a commit/abort we must remove known assigned xids and locks
    1266              :  * from the completing transaction. Attempted removals that cannot locate
    1267              :  * an entry are expected and must not cause an error when we are in state
    1268              :  * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and
    1269              :  * KnownAssignedXidsRemove().
    1270              :  *
    1271              :  * Later, when we apply the running xact data we must be careful to ignore
    1272              :  * transactions already committed, since those commits raced ahead when
    1273              :  * making WAL entries.
    1274              :  *
    1275              :  * For logical decoding only the running xacts information is needed;
    1276              :  * there's no need to look at the locking information, but it's logged anyway,
    1277              :  * as there's no independent knob to just enable logical decoding. For
    1278              :  * details of how this is used, check snapbuild.c's introductory comment.
    1279              :  *
    1280              :  *
    1281              :  * Returns the RecPtr of the last inserted record.
    1282              :  */
    1283              : XLogRecPtr
    1284         1509 : LogStandbySnapshot(void)
    1285              : {
    1286              :     XLogRecPtr  recptr;
    1287              :     RunningTransactions running;
    1288              :     xl_standby_lock *locks;
    1289              :     int         nlocks;
    1290         1509 :     bool        logical_decoding_enabled = IsLogicalDecodingEnabled();
    1291              : 
    1292              :     Assert(XLogStandbyInfoActive());
    1293              : 
    1294              : #ifdef USE_INJECTION_POINTS
    1295         1509 :     if (IS_INJECTION_POINT_ATTACHED("skip-log-running-xacts"))
    1296              :     {
    1297              :         /*
    1298              :          * This record could move slot's xmin forward during decoding, leading
    1299              :          * to unpredictable results, so skip it when requested by the test.
                      :          * No WAL is written on this path; the caller gets the value of
                      :          * GetInsertRecPtr() instead.
    1300              :          */
    1301            0 :         return GetInsertRecPtr();
    1302              :     }
    1303              : #endif
    1304              : 
    1305              :     /*
    1306              :      * Get details of any AccessExclusiveLocks being held at the moment.
                      :      * They are logged as a separate record, and only when at least one
                      :      * lock was reported; the returned array is freed in either case.
    1307              :      */
    1308         1509 :     locks = GetRunningTransactionLocks(&nlocks);
    1309         1509 :     if (nlocks > 0)
    1310          154 :         LogAccessExclusiveLocks(nlocks, locks);
    1311         1509 :     pfree(locks);
    1312              : 
    1313              :     /*
    1314              :      * Log details of all in-progress transactions. This should be the last
    1315              :      * record we write, because standby will open up when it sees this.
    1316              :      */
    1317         1509 :     running = GetRunningTransactionData();
    1318              : 
    1319              :     /*
    1320              :      * GetRunningTransactionData() acquired ProcArrayLock, we must release it.
    1321              :      * For Hot Standby this can be done before inserting the WAL record
    1322              :      * because ProcArrayApplyRecoveryInfo() rechecks the commit status using
    1323              :      * the clog. For logical decoding, though, the lock can't be released
    1324              :      * early because the clog might be "in the future" from the POV of the
    1325              :      * historic snapshot. This would allow for situations where we're waiting
    1326              :      * for the end of a transaction listed in the xl_running_xacts record
    1327              :      * which, according to the WAL, has committed before the xl_running_xacts
    1328              :      * record. Fortunately this routine isn't executed frequently, and it's
    1329              :      * only a shared lock.
                      :      *
                      :      * The decision uses the value of IsLogicalDecodingEnabled() sampled
                      :      * once at function entry, so exactly one of the two ProcArrayLock
                      :      * releases below is executed.
    1330              :      */
    1331         1509 :     if (!logical_decoding_enabled)
    1332          950 :         LWLockRelease(ProcArrayLock);
    1333              : 
    1334         1509 :     recptr = LogCurrentRunningXacts(running);
    1335              : 
    1336              :     /* Release lock if we kept it longer ... */
    1337         1509 :     if (logical_decoding_enabled)
    1338          559 :         LWLockRelease(ProcArrayLock);
    1339              : 
    1340              :     /* GetRunningTransactionData() acquired XidGenLock, we must release it */
    1341         1509 :     LWLockRelease(XidGenLock);
    1342              : 
    1343         1509 :     return recptr;
    1344              : }
    1345              : 
    1346              : /*
    1347              :  * Record an enhanced snapshot of running transactions into WAL.
    1348              :  *
    1349              :  * The definitions of RunningTransactionsData and xl_running_xacts are
    1350              :  * similar. We keep them separate because xl_running_xacts is a contiguous
    1351              :  * chunk of memory and never exists fully until it is assembled in WAL.
    1352              :  * The inserted records are marked as not being important for durability,
    1353              :  * to avoid triggering superfluous checkpoint / archiving activity.
                      :  *
                      :  * The record consists of the first MinSizeOfXactRunningXacts bytes of a
                      :  * local xl_running_xacts, followed by (xcnt + subxcnt) TransactionIds
                      :  * taken from CurrRunningXacts->xids.  The array part is registered only
                      :  * when xcnt is greater than zero.  subxid_overflow is set whenever
                      :  * subxid_status is anything other than SUBXIDS_IN_ARRAY.
                      :  *
                      :  * After insertion a DEBUG2 message describing the snapshot is emitted
                      :  * and the record's LSN is passed to XLogSetAsyncXactLSN().
                      :  *
                      :  * Returns the value returned by XLogInsert() for the record.
    1354              :  */
    1355              : static XLogRecPtr
    1356         1509 : LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
    1357              : {
    1358              :     xl_running_xacts xlrec;
    1359              :     XLogRecPtr  recptr;
    1360              : 
    1361         1509 :     xlrec.xcnt = CurrRunningXacts->xcnt;
    1362         1509 :     xlrec.subxcnt = CurrRunningXacts->subxcnt;
    1363         1509 :     xlrec.subxid_overflow = (CurrRunningXacts->subxid_status != SUBXIDS_IN_ARRAY);
    1364         1509 :     xlrec.nextXid = CurrRunningXacts->nextXid;
    1365         1509 :     xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
    1366         1509 :     xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid;
    1367              : 
    1368              :     /* Header */
    1369         1509 :     XLogBeginInsert();
    1370         1509 :     XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
    1371         1509 :     XLogRegisterData(&xlrec, MinSizeOfXactRunningXacts);
    1372              : 
    1373              :     /* array of TransactionIds */
    1374         1509 :     if (xlrec.xcnt > 0)
    1375          494 :         XLogRegisterData(CurrRunningXacts->xids,
    1376          494 :                          (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId));
    1377              : 
    1378         1509 :     recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS);
    1379              : 
    1380         1509 :     if (xlrec.subxid_overflow)
    1381            1 :         elog(DEBUG2,
    1382              :              "snapshot of %d running transactions overflowed (lsn %X/%08X oldest xid %u latest complete %u next xid %u)",
    1383              :              CurrRunningXacts->xcnt,
    1384              :              LSN_FORMAT_ARGS(recptr),
    1385              :              CurrRunningXacts->oldestRunningXid,
    1386              :              CurrRunningXacts->latestCompletedXid,
    1387              :              CurrRunningXacts->nextXid);
    1388              :     else
    1389         1508 :         elog(DEBUG2,
    1390              :              "snapshot of %d+%d running transaction ids (lsn %X/%08X oldest xid %u latest complete %u next xid %u)",
    1391              :              CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt,
    1392              :              LSN_FORMAT_ARGS(recptr),
    1393              :              CurrRunningXacts->oldestRunningXid,
    1394              :              CurrRunningXacts->latestCompletedXid,
    1395              :              CurrRunningXacts->nextXid);
    1396              : 
    1397              :     /*
    1398              :      * Ensure running_xacts information is synced to disk not too far in the
    1399              :      * future. We don't want to stall anything though (i.e. use XLogFlush()),
    1400              :      * so we let the wal writer do it during normal operation.
    1401              :      * XLogSetAsyncXactLSN() conveniently will mark the LSN as to-be-synced
    1402              :      * and nudge the WALWriter into action if sleeping. Check
    1403              :      * XLogBackgroundFlush() for details why a record might not be flushed
    1404              :      * without it.
    1405              :      */
    1406         1509 :     XLogSetAsyncXactLSN(recptr);
    1407              : 
    1408         1509 :     return recptr;
    1409              : }
    1410              : 
    1411              : /*
    1412              :  * Wholesale logging of AccessExclusiveLocks. Other lock types need not be
    1413              :  * logged, as described in backend/storage/lmgr/README.
                      :  *
                      :  * Writes a single XLOG_STANDBY_LOCK record for resource manager
                      :  * RM_STANDBY_ID.  The payload is the part of xl_standby_locks that
                      :  * precedes its locks member (carrying nlocks), followed by the nlocks
                      :  * elements of the locks array supplied by the caller.  The record is
                      :  * flagged XLOG_MARK_UNIMPORTANT and the result of XLogInsert() is
                      :  * discarded.
    1414              :  */
    1415              : static void
    1416       142758 : LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks)
    1417              : {
    1418              :     xl_standby_locks xlrec;
    1419              : 
    1420       142758 :     xlrec.nlocks = nlocks;
    1421              : 
    1422       142758 :     XLogBeginInsert();
    1423       142758 :     XLogRegisterData(&xlrec, offsetof(xl_standby_locks, locks));
    1424       142758 :     XLogRegisterData(locks, nlocks * sizeof(xl_standby_lock));
    1425       142758 :     XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
    1426              : 
    1427       142758 :     (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK);
    1428       142758 : }
    1429              : 
    1430              : /*
    1431              :  * Individual logging of AccessExclusiveLocks for use during LockAcquire()
                      :  *
                      :  * Builds one xl_standby_lock from the result of
                      :  * GetCurrentTransactionId() and the given database and relation OIDs,
                      :  * logs it through LogAccessExclusiveLocks() as a one-element array, and
                      :  * then sets XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK in MyXactFlags.
    1432              :  */
    1433              : void
    1434       142604 : LogAccessExclusiveLock(Oid dbOid, Oid relOid)
    1435              : {
    1436              :     xl_standby_lock xlrec;
    1437              : 
    1438       142604 :     xlrec.xid = GetCurrentTransactionId();
    1439              : 
    1440       142604 :     xlrec.dbOid = dbOid;
    1441       142604 :     xlrec.relOid = relOid;
    1442              : 
    1443       142604 :     LogAccessExclusiveLocks(1, &xlrec);
    1444       142604 :     MyXactFlags |= XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK;
    1445       142604 : }
    1446              : 
    1447              : /*
    1448              :  * Prepare to log an AccessExclusiveLock, for use during LockAcquire()
                      :  *
                      :  * The only action taken is a call to GetCurrentTransactionId(), whose
                      :  * result is discarded.  No WAL is written by this function.
    1449              :  */
    1450              : void
    1451       142849 : LogAccessExclusiveLockPrepare(void)
    1452              : {
    1453              :     /*
    1454              :      * Ensure that a TransactionId has been assigned to this transaction, for
    1455              :      * two reasons, both related to lock release on the standby. First, we
    1456              :      * must assign an xid so that RecordTransactionCommit() and
    1457              :      * RecordTransactionAbort() do not optimise away the transaction
    1458              :      * completion record which recovery relies upon to release locks. It's a
    1459              :      * hack, but for a corner case not worth adding code for into the main
    1460              :      * commit path. Second, we must assign an xid before the lock is recorded
    1461              :      * in shared memory, otherwise a concurrently executing
    1462              :      * GetRunningTransactionLocks() might see a lock associated with an
    1463              :      * InvalidTransactionId which we later assert cannot happen.
    1464              :      */
    1465       142849 :     (void) GetCurrentTransactionId();
    1466       142849 : }
    1467              : 
    1468              : /*
    1469              :  * Emit WAL for invalidations. This currently is only used for commits without
    1470              :  * an xid but which contain invalidations.
                      :  *
                      :  * Writes one XLOG_INVALIDATIONS record for RM_STANDBY_ID.  Its payload is
                      :  * the first MinSizeOfInvalidations bytes of an xl_invalidations filled
                      :  * with MyDatabaseId, MyDatabaseTableSpace, relcacheInitFileInval and
                      :  * nmsgs, followed by the nmsgs messages passed in msgs.
                      :  *
                      :  * The local struct is zeroed before its fields are assigned.
                      :  * NOTE(review): presumably this keeps uninitialized padding bytes out of
                      :  * the record -- confirm.
                      :  *
                      :  * No XLogSetRecordFlags() call is made here, unlike in
                      :  * LogCurrentRunningXacts() and LogAccessExclusiveLocks().
    1471              :  */
    1472              : void
    1473        11285 : LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs,
    1474              :                         bool relcacheInitFileInval)
    1475              : {
    1476              :     xl_invalidations xlrec;
    1477              : 
    1478              :     /* prepare record */
    1479        11285 :     memset(&xlrec, 0, sizeof(xlrec));
    1480        11285 :     xlrec.dbId = MyDatabaseId;
    1481        11285 :     xlrec.tsId = MyDatabaseTableSpace;
    1482        11285 :     xlrec.relcacheInitFileInval = relcacheInitFileInval;
    1483        11285 :     xlrec.nmsgs = nmsgs;
    1484              : 
    1485              :     /* perform insertion */
    1486        11285 :     XLogBeginInsert();
    1487        11285 :     XLogRegisterData(&xlrec, MinSizeOfInvalidations);
    1488        11285 :     XLogRegisterData(msgs,
    1489              :                      nmsgs * sizeof(SharedInvalidationMessage));
    1490        11285 :     XLogInsert(RM_STANDBY_ID, XLOG_INVALIDATIONS);
    1491        11285 : }
    1492              : 
    1493              : /*
                      :  * Return the description of recovery conflict
                      :  *
                      :  * Each RecoveryConflictReason handled by the switch maps to a fixed
                      :  * message that is passed through _() before being returned.  A reason
                      :  * value matching none of the cases yields the initial text assigned
                      :  * to reasonDesc.  The switch deliberately has no default label in the
                      :  * code as written, so that fallback is the only handling of unlisted
                      :  * values.
                      :  */
    1494              : static const char *
    1495           10 : get_recovery_conflict_desc(RecoveryConflictReason reason)
    1496              : {
    1497           10 :     const char *reasonDesc = _("unknown reason");
    1498              : 
    1499           10 :     switch (reason)
    1500              :     {
    1501            4 :         case RECOVERY_CONFLICT_BUFFERPIN:
    1502            4 :             reasonDesc = _("recovery conflict on buffer pin");
    1503            4 :             break;
    1504            2 :         case RECOVERY_CONFLICT_LOCK:
    1505            2 :             reasonDesc = _("recovery conflict on lock");
    1506            2 :             break;
    1507            2 :         case RECOVERY_CONFLICT_TABLESPACE:
    1508            2 :             reasonDesc = _("recovery conflict on tablespace");
    1509            2 :             break;
    1510            2 :         case RECOVERY_CONFLICT_SNAPSHOT:
    1511            2 :             reasonDesc = _("recovery conflict on snapshot");
    1512            2 :             break;
    1513            0 :         case RECOVERY_CONFLICT_LOGICALSLOT:
    1514            0 :             reasonDesc = _("recovery conflict on replication slot");
    1515            0 :             break;
    1516            0 :         case RECOVERY_CONFLICT_STARTUP_DEADLOCK:
    1517            0 :             reasonDesc = _("recovery conflict on deadlock");
    1518            0 :             break;
    1519            0 :         case RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK:
    1520            0 :             reasonDesc = _("recovery conflict on buffer deadlock");
    1521            0 :             break;
    1522            0 :         case RECOVERY_CONFLICT_DATABASE:
    1523            0 :             reasonDesc = _("recovery conflict on database");
    1524            0 :             break;
    1525              :     }
    1526              : 
    1527           10 :     return reasonDesc;
    1528              : }
        

Generated by: LCOV version 2.0-1