LCOV - code coverage report
Current view: top level - src/backend/storage/ipc - standby.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 91.6 % 381 349
Test Date: 2026-03-17 07:15:15 Functions: 100.0 % 31 31
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * standby.c
       4              :  *    Misc functions used in Hot Standby mode.
       5              :  *
       6              :  *  All functions for handling RM_STANDBY_ID, which relate to
       7              :  *  AccessExclusiveLocks and starting snapshots for Hot Standby mode.
       8              :  *  Plus conflict recovery processing.
       9              :  *
      10              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      11              :  * Portions Copyright (c) 1994, Regents of the University of California
      12              :  *
      13              :  * IDENTIFICATION
      14              :  *    src/backend/storage/ipc/standby.c
      15              :  *
      16              :  *-------------------------------------------------------------------------
      17              :  */
      18              : #include "postgres.h"
      19              : #include "access/transam.h"
      20              : #include "access/twophase.h"
      21              : #include "access/xact.h"
      22              : #include "access/xloginsert.h"
      23              : #include "access/xlogrecovery.h"
      24              : #include "access/xlogutils.h"
      25              : #include "miscadmin.h"
      26              : #include "pgstat.h"
      27              : #include "replication/slot.h"
      28              : #include "storage/bufmgr.h"
      29              : #include "storage/proc.h"
      30              : #include "storage/procarray.h"
      31              : #include "storage/sinvaladt.h"
      32              : #include "storage/standby.h"
      33              : #include "utils/hsearch.h"
      34              : #include "utils/injection_point.h"
      35              : #include "utils/ps_status.h"
      36              : #include "utils/timeout.h"
      37              : #include "utils/timestamp.h"
      38              : #include "utils/wait_event.h"
      39              : 
      40              : /* User-settable GUC parameters (delays are in milliseconds; negative = wait forever) */
      41              : int         max_standby_archive_delay = 30 * 1000;  /* used by GetStandbyLimitTime when !fromStream */
      42              : int         max_standby_streaming_delay = 30 * 1000;    /* used by GetStandbyLimitTime when fromStream */
      43              : bool        log_recovery_conflict_waits = false;    /* log conflict waits longer than DeadlockTimeout */
      44              : 
      45              : /*
      46              :  * Keep track of all the exclusive locks owned by original transactions.
      47              :  * For each known exclusive lock, there is a RecoveryLockEntry in the
      48              :  * RecoveryLockHash hash table.  All RecoveryLockEntrys belonging to a
      49              :  * given XID are chained together so that we can find them easily.
      50              :  * For each original transaction that is known to have any such locks,
      51              :  * there is a RecoveryLockXidEntry in the RecoveryLockXidHash hash table,
      52              :  * which stores the head of the chain of its locks.
      53              :  */
      54              : typedef struct RecoveryLockEntry
      55              : {
      56              :     xl_standby_lock key;        /* hash key: xid, dbOid, relOid */
      57              :     struct RecoveryLockEntry *next; /* chain link: next lock of the same xid */
      58              : } RecoveryLockEntry;
      59              : 
      60              : typedef struct RecoveryLockXidEntry
      61              : {
      62              :     TransactionId xid;          /* hash key -- must be first */
      63              :     struct RecoveryLockEntry *head; /* chain head: first lock entry for xid */
      64              : } RecoveryLockXidEntry;
      65              : 
      66              : static HTAB *RecoveryLockHash = NULL;   /* RecoveryLockEntry table; NULL unless tracking is initialized */
      67              : static HTAB *RecoveryLockXidHash = NULL;    /* RecoveryLockXidEntry table; created/destroyed with the above */
      68              : 
      69              : /* Flags set by timeout handlers */
      70              : static volatile sig_atomic_t got_standby_deadlock_timeout = false;
      71              : static volatile sig_atomic_t got_standby_delay_timeout = false;
      72              : static volatile sig_atomic_t got_standby_lock_timeout = false;
      73              : 
      74              : static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
      75              :                                                    RecoveryConflictReason reason,
      76              :                                                    uint32 wait_event_info,
      77              :                                                    bool report_waiting);
      78              : static void SendRecoveryConflictWithBufferPin(RecoveryConflictReason reason);
      79              : static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts);
      80              : static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);
      81              : static const char *get_recovery_conflict_desc(RecoveryConflictReason reason);
      82              : 
      83              : /*
      84              :  * InitRecoveryTransactionEnvironment
      85              :  *      Initialize tracking of our primary's in-progress transactions.
      86              :  *
      87              :  * We need to issue shared invalidations and hold locks. Holding locks
      88              :  * means others may want to wait on us, so we need to make a lock table
      89              :  * vxact entry like a real transaction. We could create and delete
      90              :  * lock table entries for each transaction but it's simpler just to create
      91              :  * one permanent entry and leave it there all the time. Locks are then
      92              :  * acquired and released as needed. Yes, this means you can see the
      93              :  * Startup process in pg_locks once we have run this.
      94              :  */
      95              : void
      96          115 : InitRecoveryTransactionEnvironment(void)
      97              : {
      98              :     VirtualTransactionId vxid;
      99              :     HASHCTL     hash_ctl;
     100              : 
     101              :     Assert(RecoveryLockHash == NULL);   /* don't run this twice */
     102              : 
     103              :     /*
     104              :      * Initialize the hash tables for tracking the locks held by each
     105              :      * transaction.
     106              :      */
     107          115 :     hash_ctl.keysize = sizeof(xl_standby_lock);
     108          115 :     hash_ctl.entrysize = sizeof(RecoveryLockEntry);
     109          115 :     RecoveryLockHash = hash_create("RecoveryLockHash",
     110              :                                    64,
     111              :                                    &hash_ctl,
     112              :                                    HASH_ELEM | HASH_BLOBS);
     113          115 :     hash_ctl.keysize = sizeof(TransactionId);   /* hash_ctl is reused for the per-XID table */
     114          115 :     hash_ctl.entrysize = sizeof(RecoveryLockXidEntry);
     115          115 :     RecoveryLockXidHash = hash_create("RecoveryLockXidHash",
     116              :                                       64,
     117              :                                       &hash_ctl,
     118              :                                       HASH_ELEM | HASH_BLOBS);
     119              : 
     120              :     /*
     121              :      * Initialize shared invalidation management for Startup process, being
     122              :      * careful to register ourselves as a sendOnly process so we don't need to
     123              :      * read messages, nor will we get signaled when the queue starts filling
     124              :      * up.
     125              :      */
     126          115 :     SharedInvalBackendInit(true);   /* true = sendOnly */
     127              : 
     128              :     /*
     129              :      * Lock a virtual transaction id for Startup process.
     130              :      *
     131              :      * We need to do GetNextLocalTransactionId() because
     132              :      * SharedInvalBackendInit() leaves localTransactionId invalid and the lock
     133              :      * manager doesn't like that at all.
     134              :      *
     135              :      * Note that we don't need to run XactLockTableInsert() because nobody
     136              :      * needs to wait on xids. That sounds a little strange, but table locks
     137              :      * are held by vxids and row level locks are held by xids. All queries
     138              :      * hold AccessShareLocks so never block while we write or lock new rows.
     139              :      */
     140          115 :     MyProc->vxid.procNumber = MyProcNumber;
     141          115 :     vxid.procNumber = MyProcNumber;
     142          115 :     vxid.localTransactionId = GetNextLocalTransactionId();
     143          115 :     VirtualXactLockTableInsert(vxid);
     144              : 
     145          115 :     standbyState = STANDBY_INITIALIZED; /* hash tables and vxid lock are set up */
     146          115 : }
     147              : 
     148              : /*
     149              :  * ShutdownRecoveryTransactionEnvironment
     150              :  *      Shut down transaction tracking
     151              :  *
     152              :  * Prepare to switch from hot standby mode to normal operation. Shut down
     153              :  * recovery-time transaction tracking.
     154              :  *
     155              :  * This must be called even when the startup process is shutting down, if
     156              :  * transaction tracking has been initialized. Otherwise some locks the tracked
     157              :  * transactions were holding will not be released and may interfere with
     158              :  * processes that are still running (though about to exit) when the startup
     159              :  * process exits.
     160              :  */
     161              : void
     162          170 : ShutdownRecoveryTransactionEnvironment(void)
     163              : {
     164              :     /*
     165              :      * Do nothing if RecoveryLockHash is NULL because that means that
     166              :      * transaction tracking has not yet been initialized or has already been
     167              :      * shut down.  This makes it safe to have possibly-redundant calls of this
     168              :      * function during process exit.
     169              :      */
     170          170 :     if (RecoveryLockHash == NULL)
     171           55 :         return;
     172              : 
     173              :     /* Mark all tracked in-progress transactions as finished. */
     174          115 :     ExpireAllKnownAssignedTransactionIds();
     175              : 
     176              :     /* Release all locks the tracked transactions were holding */
     177          115 :     StandbyReleaseAllLocks();
     178              : 
     179              :     /* Destroy the lock hash tables. */
     180          115 :     hash_destroy(RecoveryLockHash);
     181          115 :     hash_destroy(RecoveryLockXidHash);
     182          115 :     RecoveryLockHash = NULL;    /* makes a repeated call take the early return above */
     183          115 :     RecoveryLockXidHash = NULL;
     184              : 
     185              :     /* Cleanup our VirtualTransaction */
     186          115 :     VirtualXactLockTableCleanup();
     187              : }
     188              : 
     189              : 
     190              : /*
     191              :  * -----------------------------------------------------
     192              :  *      Standby wait timers and backend cancel logic
     193              :  * -----------------------------------------------------
     194              :  */
     195              : 
     196              : /*
     197              :  * Determine the cutoff time at which we want to start canceling conflicting
     198              :  * transactions.  Returns zero (a time safely in the past) if we are willing
     199              :  * to wait forever, i.e. the applicable max_standby_*_delay GUC is negative.
     200              :  */
     201              : static TimestampTz
     202           28 : GetStandbyLimitTime(void)
     203              : {
     204              :     TimestampTz rtime;
     205              :     bool        fromStream;
     206              : 
     207              :     /*
     208              :      * The cutoff time is the last WAL data receipt time plus the appropriate
     209              :      * delay variable.  Delay of -1 means wait forever.
     210              :      */
     211           28 :     GetXLogReceiptTime(&rtime, &fromStream);
     212           28 :     if (fromStream)
     213              :     {
     214           28 :         if (max_standby_streaming_delay < 0)
     215            0 :             return 0;           /* wait forever */
     216           28 :         return TimestampTzPlusMilliseconds(rtime, max_standby_streaming_delay);
     217              :     }
     218              :     else
     219              :     {
     220            0 :         if (max_standby_archive_delay < 0)
     221            0 :             return 0;           /* wait forever */
     222            0 :         return TimestampTzPlusMilliseconds(rtime, max_standby_archive_delay);
     223              :     }
     224              : }
     225              : 
     226              : #define STANDBY_INITIAL_WAIT_US  1000   /* first sleep length; doubled after every sleep */
     227              : static int  standbyWait_us = STANDBY_INITIAL_WAIT_US;
     228              : 
     229              : /*
     230              :  * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs.
     231              :  * Returns true, without sleeping, once the standby limit time has passed;
     232              :  * otherwise sleeps for standbyWait_us microseconds and returns false.
     233              :  */
     234              : static bool
     235           16 : WaitExceedsMaxStandbyDelay(uint32 wait_event_info)
     236              : {
     237              :     TimestampTz ltime;
     238              : 
     239           16 :     CHECK_FOR_INTERRUPTS();
     240              : 
     241              :     /* Are we past the limit time? */
     242           16 :     ltime = GetStandbyLimitTime();
     243           16 :     if (ltime && GetCurrentTimestamp() >= ltime)    /* ltime == 0 means no limit */
     244            4 :         return true;
     245              : 
     246              :     /*
     247              :      * Sleep a bit (this is essential to avoid busy-waiting).
     248              :      */
     249           12 :     pgstat_report_wait_start(wait_event_info);
     250           12 :     pg_usleep(standbyWait_us);
     251           12 :     pgstat_report_wait_end();
     252              : 
     253              :     /*
     254              :      * Progressively increase the sleep times, but not to more than 1s, since
     255              :      * pg_usleep isn't interruptible on some platforms.
     256              :      */
     257           12 :     standbyWait_us *= 2;
     258           12 :     if (standbyWait_us > 1000000)
     259            0 :         standbyWait_us = 1000000;
     260              : 
     261           12 :     return false;
     262              : }
     263              : 
     264              : /*
     265              :  * Log the recovery conflict.
     266              :  *
     267              :  * wait_start is the timestamp when the caller started to wait.
     268              :  * now is the timestamp when this function has been called.
     269              :  * wait_list is the list of virtual transaction ids assigned to
     270              :  * conflicting processes. still_waiting indicates whether
     271              :  * the startup process is still waiting for the recovery conflict
     272              :  * to be resolved or not.
     273              :  */
     274              : void
     275           10 : LogRecoveryConflict(RecoveryConflictReason reason, TimestampTz wait_start,
     276              :                     TimestampTz now, VirtualTransactionId *wait_list,
     277              :                     bool still_waiting)
     278              : {
     279              :     long        secs;
     280              :     int         usecs;
     281              :     long        msecs;
     282              :     StringInfoData buf;
     283           10 :     int         nprocs = 0;
     284              : 
     285              :     /*
     286              :      * There must be no conflicting processes when the recovery conflict has
     287              :      * already been resolved.
     288              :      */
     289              :     Assert(still_waiting || wait_list == NULL);
     290              : 
     291           10 :     TimestampDifference(wait_start, now, &secs, &usecs);
     292           10 :     msecs = secs * 1000 + usecs / 1000; /* whole milliseconds waited */
     293           10 :     usecs = usecs % 1000;   /* leftover microseconds, printed as the %03d fraction */
     294              : 
     295           10 :     if (wait_list)
     296              :     {
     297              :         VirtualTransactionId *vxids;
     298              : 
     299              :         /* Build a comma-separated list of the conflicting processes' PIDs */
     300            3 :         vxids = wait_list;
     301            6 :         while (VirtualTransactionIdIsValid(*vxids))
     302              :         {
     303            3 :             PGPROC     *proc = ProcNumberGetProc(vxids->procNumber);
     304              : 
     305              :             /* proc can be NULL if the target backend is not active */
     306            3 :             if (proc)
     307              :             {
     308            3 :                 if (nprocs == 0)
     309              :                 {
     310            3 :                     initStringInfo(&buf);   /* buf is set up lazily, on the first live proc */
     311            3 :                     appendStringInfo(&buf, "%d", proc->pid);
     312              :                 }
     313              :                 else
     314            0 :                     appendStringInfo(&buf, ", %d", proc->pid);
     315              : 
     316            3 :                 nprocs++;
     317              :             }
     318              : 
     319            3 :             vxids++;
     320              :         }
     321              :     }
     322              : 
     323              :     /*
     324              :      * If wait_list is specified, report the list of PIDs of active
     325              :      * conflicting backends in a detail message. Note that if all the backends
     326              :      * in the list are not active, no detail message is logged.
     327              :      */
     328           10 :     if (still_waiting)
     329              :     {
     330            5 :         ereport(LOG,
     331              :                 errmsg("recovery still waiting after %ld.%03d ms: %s",
     332              :                        msecs, usecs, get_recovery_conflict_desc(reason)),
     333              :                 nprocs > 0 ? errdetail_log_plural("Conflicting process: %s.",
     334              :                                                   "Conflicting processes: %s.",
     335              :                                                   nprocs, buf.data) : 0);
     336              :     }
     337              :     else
     338              :     {
     339            5 :         ereport(LOG,
     340              :                 errmsg("recovery finished waiting after %ld.%03d ms: %s",
     341              :                        msecs, usecs, get_recovery_conflict_desc(reason)));
     342              :     }
     343              : 
     344           10 :     if (nprocs > 0)     /* buf was initialized only if a live proc was found */
     345            3 :         pfree(buf.data);
     346           10 : }
     347              : 
     348              : /*
     349              :  * This is the main executioner for any query backend that conflicts with
     350              :  * recovery processing. Judgement has already been passed on it within
     351              :  * a specific rmgr. Here we just issue the orders to the procs. The procs
     352              :  * then throw the required error as instructed.
     353              :  *
     354              :  * If report_waiting is true, "waiting" is reported in PS display and the
     355              :  * wait for recovery conflict is reported in the log, if necessary. If
     356              :  * the caller is responsible for reporting them, report_waiting should be
     357              :  * false. Otherwise, both the caller and this function report the same
     358              :  * thing unexpectedly.
     359              :  */
     360              : static void
     361        15044 : ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
     362              :                                        RecoveryConflictReason reason,
     363              :                                        uint32 wait_event_info,
     364              :                                        bool report_waiting)
     365              : {
     366        15044 :     TimestampTz waitStart = 0;  /* stays 0 when no wait reporting is wanted */
     367        15044 :     bool        waiting = false;    /* "waiting" ps suffix has been set */
     368        15044 :     bool        logged_recovery_conflict = false;   /* "still waiting" message was logged */
     369              : 
     370              :     /* Fast exit, to avoid a kernel call if there's no work to be done. */
     371        15044 :     if (!VirtualTransactionIdIsValid(*waitlist))
     372        15041 :         return;
     373              : 
     374              :     /* Set the wait start timestamp for reporting */
     375            3 :     if (report_waiting && (log_recovery_conflict_waits || update_process_title))
     376            2 :         waitStart = GetCurrentTimestamp();
     377              : 
     378            6 :     while (VirtualTransactionIdIsValid(*waitlist))
     379              :     {
     380              :         /* reset standbyWait_us for each xact we wait for */
     381            3 :         standbyWait_us = STANDBY_INITIAL_WAIT_US;
     382              : 
     383              :         /* wait until the virtual xid is gone */
     384           19 :         while (!VirtualXactLock(*waitlist, false))
     385              :         {
     386              :             /* Is it time to kill it? */
     387           16 :             if (WaitExceedsMaxStandbyDelay(wait_event_info))
     388              :             {
     389              :                 bool        signaled;
     390              : 
     391              :                 /*
     392              :                  * Now find out who to throw out of the balloon.
     393              :                  */
     394              :                 Assert(VirtualTransactionIdIsValid(*waitlist));
     395            4 :                 signaled = SignalRecoveryConflictWithVirtualXID(*waitlist, reason);
     396              : 
     397              :                 /*
     398              :                  * Wait a little bit for it to die so that we avoid flooding
     399              :                  * an unresponsive backend when system is heavily loaded.
     400              :                  */
     401            4 :                 if (signaled)
     402            4 :                     pg_usleep(5000L);   /* 5 ms */
     403              :             }
     404              : 
     405           16 :             if (waitStart != 0 && (!logged_recovery_conflict || !waiting))
     406              :             {
     407           15 :                 TimestampTz now = 0;
     408              :                 bool        maybe_log_conflict;
     409              :                 bool        maybe_update_title;
     410              : 
     411           15 :                 maybe_log_conflict = (log_recovery_conflict_waits && !logged_recovery_conflict);
     412           15 :                 maybe_update_title = (update_process_title && !waiting);
     413              : 
     414              :                 /* Get the current timestamp only if something is still left to report */
     415           15 :                 if (maybe_log_conflict || maybe_update_title)
     416           15 :                     now = GetCurrentTimestamp();
     417              : 
     418              :                 /*
     419              :                  * Report via ps if we have been waiting for more than 500
     420              :                  * msec (should that be configurable?)
     421              :                  */
     422           30 :                 if (maybe_update_title &&
     423           15 :                     TimestampDifferenceExceeds(waitStart, now, 500))
     424              :                 {
     425            0 :                     set_ps_display_suffix("waiting");
     426            0 :                     waiting = true;
     427              :                 }
     428              : 
     429              :                 /*
     430              :                  * Emit the log message if the startup process is waiting
     431              :                  * longer than deadlock_timeout for recovery conflict.
     432              :                  */
     433           22 :                 if (maybe_log_conflict &&
     434            7 :                     TimestampDifferenceExceeds(waitStart, now, DeadlockTimeout))
     435              :                 {
     436            2 :                     LogRecoveryConflict(reason, waitStart, now, waitlist, true);    /* list starts at the vxid currently waited on */
     437            2 :                     logged_recovery_conflict = true;
     438              :                 }
     439              :             }
     440              :         }
     441              : 
     442              :         /* The virtual transaction is gone now, wait for the next one */
     443            3 :         waitlist++;
     444              :     }
     445              : 
     446              :     /*
     447              :      * Emit the log message if recovery conflict was resolved but the startup
     448              :      * process waited longer than deadlock_timeout for it.
     449              :      */
     450            3 :     if (logged_recovery_conflict)
     451            2 :         LogRecoveryConflict(reason, waitStart, GetCurrentTimestamp(),
     452              :                             NULL, false);
     453              : 
     454              :     /* reset ps display to remove the suffix if we added one */
     455            3 :     if (waiting)
     456            0 :         set_ps_display_remove_suffix();
     457              : 
     458              : }
     459              : 
     460              : /*
     461              :  * Generate whatever recovery conflicts are needed to eliminate snapshots that
     462              :  * might see XIDs <= snapshotConflictHorizon as still running.
     463              :  *
     464              :  * snapshotConflictHorizon cutoffs are our standard approach to generating
     465              :  * granular recovery conflicts.  Note that InvalidTransactionId values are
     466              :  * interpreted as "definitely don't need any conflicts" here, which is a
     467              :  * general convention that WAL records can (and often do) depend on.
     468              :  */
     469              : void
     470        18005 : ResolveRecoveryConflictWithSnapshot(TransactionId snapshotConflictHorizon,
     471              :                                     bool isCatalogRel,
     472              :                                     RelFileLocator locator)
     473              : {
     474              :     VirtualTransactionId *backends;
     475              : 
     476              :     /*
     477              :      * If we get passed InvalidTransactionId then we do nothing (no conflict).
     478              :      *
     479              :      * This can happen when replaying already-applied WAL records after a
     480              :      * standby crash or restart, or when replaying an XLOG_HEAP2_VISIBLE
     481              :      * record that marks as frozen a page which was already all-visible.  It's
     482              :      * also quite common with records generated during index deletion
     483              :      * (original execution of the deletion can reason that a recovery conflict
     484              :      * which is sufficient for the deletion operation must take place before
     485              :      * replay of the deletion record itself).
     486              :      */
     487        18005 :     if (!TransactionIdIsValid(snapshotConflictHorizon))
     488         2963 :         return;
     489              : 
     490              :     Assert(TransactionIdIsNormal(snapshotConflictHorizon));
     491        15042 :     backends = GetConflictingVirtualXIDs(snapshotConflictHorizon,
     492              :                                          locator.dbOid);
     493        15042 :     ResolveRecoveryConflictWithVirtualXIDs(backends,
     494              :                                            RECOVERY_CONFLICT_SNAPSHOT,
     495              :                                            WAIT_EVENT_RECOVERY_CONFLICT_SNAPSHOT,
     496              :                                            true);   /* report_waiting */
     497              : 
     498              :     /*
     499              :      * Note that WaitExceedsMaxStandbyDelay() is not taken into account here
     500              :      * (as opposed to ResolveRecoveryConflictWithVirtualXIDs() above). That
     501              :      * seems OK, given that this kind of conflict should not normally be
     502              :      * reached, e.g. due to using a physical replication slot.
     503              :      */
     504        15042 :     if (IsLogicalDecodingEnabled() && isCatalogRel)
     505           17 :         InvalidateObsoleteReplicationSlots(RS_INVAL_HORIZON, 0, locator.dbOid,
     506              :                                            snapshotConflictHorizon);
     507              : }
     508              : 
     509              : /*
     510              :  * Variant of ResolveRecoveryConflictWithSnapshot that works with
     511              :  * FullTransactionId values
     512              :  */
     513              : void
     514           75 : ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId snapshotConflictHorizon,
     515              :                                            bool isCatalogRel,
     516              :                                            RelFileLocator locator)
     517              : {
     518              :     /*
     519              :      * ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds,
     520              :      * so truncate the logged FullTransactionId.  If the logged value is very
     521              :      * old, so that XID wrap-around already happened on it, there can't be any
     522              :      * snapshots that still see it.
     523              :      */
     524           75 :     FullTransactionId nextXid = ReadNextFullTransactionId();
     525              :     uint64      diff;
     526              : 
     527           75 :     diff = U64FromFullTransactionId(nextXid) -
     528           75 :         U64FromFullTransactionId(snapshotConflictHorizon);
     529           75 :     if (diff < MaxTransactionId / 2)    /* otherwise the horizon is too old: do nothing */
     530              :     {
     531              :         TransactionId truncated;
     532              : 
     533           75 :         truncated = XidFromFullTransactionId(snapshotConflictHorizon);
     534           75 :         ResolveRecoveryConflictWithSnapshot(truncated,
     535              :                                             isCatalogRel,
     536              :                                             locator);
     537              :     }
     538           75 : }
     539              : 
     540              : void
     541            1 : ResolveRecoveryConflictWithTablespace(Oid tsid)
     542              : {
     543              :     VirtualTransactionId *temp_file_users;
     544              : 
     545              :     /*
     546              :      * Standby users may be currently using this tablespace for their
     547              :      * temporary files. We only care about current users because the
     548              :      * temp_tablespaces parameter will just ignore tablespaces that no longer
     549              :      * exist.
     550              :      *
     551              :      * Ask everybody to cancel their queries immediately so we can ensure no
     552              :      * temp files remain and we can remove the tablespace. Nuke the entire
     553              :      * site from orbit, it's the only way to be sure.
     554              :      *
     555              :      * XXX: We could work out the pids of active backends using this
     556              :      * tablespace by examining the temp filenames in the directory. We would
     557              :      * then convert the pids into VirtualXIDs before attempting to cancel
     558              :      * them.
     559              :      *
     560              :      * We don't wait for commit because drop tablespace is non-transactional.
     561              :      */
     562            1 :     temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId,
     563              :                                                 InvalidOid);
     564            1 :     ResolveRecoveryConflictWithVirtualXIDs(temp_file_users,
     565              :                                            RECOVERY_CONFLICT_TABLESPACE,
     566              :                                            WAIT_EVENT_RECOVERY_CONFLICT_TABLESPACE,
     567              :                                            true);
     568            1 : }
     569              : 
     570              : void
     571           14 : ResolveRecoveryConflictWithDatabase(Oid dbid)
     572              : {
     573              :     /*
     574              :      * We don't do ResolveRecoveryConflictWithVirtualXIDs() here since that
     575              :      * only waits for transactions and completely idle sessions would block
     576              :      * us. This is rare enough that we do this as simply as possible: no wait,
     577              :      * just force them off immediately.
     578              :      *
     579              :      * No locking is required here because we already acquired
     580              :      * AccessExclusiveLock. Anybody trying to connect while we do this will
     581              :      * block during InitPostgres() and then disconnect when they see the
     582              :      * database has been removed.
     583              :      */
     584           16 :     while (CountDBBackends(dbid) > 0)
     585              :     {
     586            2 :         SignalRecoveryConflictWithDatabase(dbid, RECOVERY_CONFLICT_DATABASE);
     587              : 
     588              :         /*
     589              :          * Wait awhile for them to die so that we avoid flooding an
     590              :          * unresponsive backend when system is heavily loaded.
     591              :          */
     592            2 :         pg_usleep(10000);
     593              :     }
     594           14 : }
     595              : 
     596              : /*
     597              :  * ResolveRecoveryConflictWithLock is called from ProcSleep()
     598              :  * to resolve conflicts with other backends holding relation locks.
     599              :  *
     600              :  * The WaitLatch sleep normally done in ProcSleep()
     601              :  * (when not InHotStandby) is performed here, for code clarity.
     602              :  *
     603              :  * We either resolve conflicts immediately or set a timeout to wake us at
     604              :  * the limit of our patience.
     605              :  *
     606              :  * Resolve conflicts by canceling all backends holding a conflicting
     607              :  * lock.  As we are already queued to be granted the lock, no new lock
     608              :  * requests conflicting with ours will be granted in the meantime.
     609              :  *
     610              :  * We also must check for deadlocks involving the Startup process and
     611              :  * hot-standby backend processes. If deadlock_timeout is reached in
     612              :  * this function, all the backends holding the conflicting locks are
     613              :  * requested to check themselves for deadlocks.
     614              :  *
     615              :  * logging_conflict should be true if the recovery conflict has not been
     616              :  * logged yet even though logging is enabled. After deadlock_timeout is
     617              :  * reached and the request for deadlock check is sent, we wait again to
     618              :  * be signaled by the release of the lock if logging_conflict is false.
     619              :  * Otherwise we return without waiting again so that the caller can report
     620              :  * the recovery conflict. In that case, this function is then called again
     621              :  * with logging_conflict=false (because the recovery conflict has already
     622              :  * been logged) and we will wait again for the lock to be released.
     623              :  */
     624              : void
     625            3 : ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict)
     626              : {
     627              :     TimestampTz ltime;
     628              :     TimestampTz now;
     629              : 
     630              :     Assert(InHotStandby);
     631              : 
     632            3 :     ltime = GetStandbyLimitTime();
     633            3 :     now = GetCurrentTimestamp();
     634              : 
     635              :     /*
     636              :      * Update waitStart if first time through after the startup process
     637              :      * started waiting for the lock. It should not be updated every time
     638              :      * ResolveRecoveryConflictWithLock() is called during the wait.
     639              :      *
     640              :      * Use the current time obtained for comparison with ltime as waitStart
     641              :      * (i.e., the time when this process started waiting for the lock). Since
     642              :      * fetching the current time again would add overhead, we reuse the
     643              :      * already-obtained value instead.
     644              :      *
     645              :      * Note that waitStart is updated without holding the lock table's
     646              :      * partition lock, to avoid the overhead of an additional lock acquisition.
     647              :      * This can cause "waitstart" in pg_locks to become NULL for a very short
     648              :      * period of time after the wait started even though "granted" is false.
     649              :      * This is OK in practice because we can assume that users are likely to
     650              :      * look at "waitstart" when waiting for the lock for a long time.
     651              :      */
     652            3 :     if (pg_atomic_read_u64(&MyProc->waitStart) == 0)
     653            1 :         pg_atomic_write_u64(&MyProc->waitStart, now);
     654              : 
     655            3 :     if (now >= ltime && ltime != 0)
     656            1 :     {
     657              :         /*
     658              :          * We're already behind, so clear a path as quickly as possible.
     659              :          */
     660              :         VirtualTransactionId *backends;
     661              : 
     662            1 :         backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
     663              : 
     664              :         /*
     665              :          * Prevent ResolveRecoveryConflictWithVirtualXIDs() from reporting
     666              :          * "waiting" in PS display by disabling its argument report_waiting
     667              :          * because the caller, WaitOnLock(), has already reported that.
     668              :          */
     669            1 :         ResolveRecoveryConflictWithVirtualXIDs(backends,
     670              :                                                RECOVERY_CONFLICT_LOCK,
     671            1 :                                                PG_WAIT_LOCK | locktag.locktag_type,
     672              :                                                false);
     673              :     }
     674              :     else
     675              :     {
     676              :         /*
     677              :          * Wait (or wait again) until ltime, and check for deadlocks as well
     678              :          * if we will be waiting longer than deadlock_timeout
     679              :          */
     680              :         EnableTimeoutParams timeouts[2];
     681            2 :         int         cnt = 0;
     682              : 
     683            2 :         if (ltime != 0)
     684              :         {
     685            2 :             got_standby_lock_timeout = false;
     686            2 :             timeouts[cnt].id = STANDBY_LOCK_TIMEOUT;
     687            2 :             timeouts[cnt].type = TMPARAM_AT;
     688            2 :             timeouts[cnt].fin_time = ltime;
     689            2 :             cnt++;
     690              :         }
     691              : 
     692            2 :         got_standby_deadlock_timeout = false;
     693            2 :         timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
     694            2 :         timeouts[cnt].type = TMPARAM_AFTER;
     695            2 :         timeouts[cnt].delay_ms = DeadlockTimeout;
     696            2 :         cnt++;
     697              : 
     698            2 :         enable_timeouts(timeouts, cnt);
     699              :     }
     700              : 
     701              :     /* Wait to be signaled by the release of the Relation Lock */
     702            3 :     ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
     703              : 
     704              :     /*
     705              :      * Exit if ltime is reached. Then all the backends holding conflicting
     706              :      * locks will be canceled in the next ResolveRecoveryConflictWithLock()
     707              :      * call.
     708              :      */
     709            3 :     if (got_standby_lock_timeout)
     710            0 :         goto cleanup;
     711              : 
     712            3 :     if (got_standby_deadlock_timeout)
     713              :     {
     714              :         VirtualTransactionId *backends;
     715              : 
     716            2 :         backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
     717              : 
     718              :         /* Quick exit if there's no work to be done */
     719            2 :         if (!VirtualTransactionIdIsValid(*backends))
     720            0 :             goto cleanup;
     721              : 
     722              :         /*
     723              :          * Send signals to all the backends holding the conflicting locks, to
     724              :          * ask them to check themselves for deadlocks.
     725              :          */
     726            4 :         while (VirtualTransactionIdIsValid(*backends))
     727              :         {
     728            2 :             (void) SignalRecoveryConflictWithVirtualXID(*backends,
     729              :                                                         RECOVERY_CONFLICT_STARTUP_DEADLOCK);
     730            2 :             backends++;
     731              :         }
     732              : 
     733              :         /*
     734              :          * Exit if the recovery conflict has not been logged yet even though
     735              :          * logging is enabled, so that the caller can log that. Then
     736              :          * ResolveRecoveryConflictWithLock() is called again and we will wait
     737              :          * again for the lock to be released.
     738              :          */
     739            2 :         if (logging_conflict)
     740            1 :             goto cleanup;
     741              : 
     742              :         /*
     743              :          * Wait again here to be signaled by the release of the Relation Lock,
     744              :          * to prevent the subsequent ResolveRecoveryConflictWithLock() call
     745              :          * from hitting deadlock_timeout and sending a deadlock-check request
     746              :          * again.  Otherwise the request would be sent every deadlock_timeout
     747              :          * until the relation locks are released or ltime is reached.
     748              :          */
     749            1 :         got_standby_deadlock_timeout = false;
     750            1 :         ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
     751              :     }
     752              : 
     753            1 : cleanup:
     754              : 
     755              :     /*
     756              :      * Clear any timeout requests established above.  We assume here that the
     757              :      * Startup process doesn't have any other outstanding timeouts than those
     758              :      * used by this function. If that stops being true, we could cancel the
     759              :      * timeouts individually, but that'd be slower.
     760              :      */
     761            3 :     disable_all_timeouts(false);
     762            3 :     got_standby_lock_timeout = false;
     763            3 :     got_standby_deadlock_timeout = false;
     764            3 : }
     765              : 
     766              : /*
     767              :  * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup()
     768              :  * to resolve conflicts with other backends holding buffer pins.
     769              :  *
     770              :  * The ProcWaitForSignal() sleep normally done in LockBufferForCleanup()
     771              :  * (when not InHotStandby) is performed here, for code clarity.
     772              :  *
     773              :  * We either resolve conflicts immediately or set a timeout to wake us at
     774              :  * the limit of our patience.
     775              :  *
     776              :  * Resolve conflicts by sending a PROCSIG signal to all backends to check if
     777              :  * they hold one of the buffer pins that is blocking Startup process. If so,
     778              :  * those backends will take an appropriate error action, ERROR or FATAL.
     779              :  *
     780              :  * We also must check for deadlocks.  Deadlocks occur because if queries
     781              :  * wait on a lock, that must be behind an AccessExclusiveLock, which can only
     782              :  * be cleared if the Startup process replays a transaction completion record.
     783              :  * If Startup process is also waiting then that is a deadlock. The deadlock
     784              :  * can occur if the query is waiting and then the Startup sleeps, or if
     785              :  * Startup is sleeping and the query waits on a lock. We protect against
     786              :  * only the former sequence here, the latter sequence is checked prior to
     787              :  * the query sleeping, in CheckRecoveryConflictDeadlock().
     788              :  *
     789              :  * Deadlocks are extremely rare, and relatively expensive to check for,
     790              :  * so we don't do a deadlock check right away ... only if we have had to wait
     791              :  * at least deadlock_timeout.
     792              :  */
     793              : void
     794            9 : ResolveRecoveryConflictWithBufferPin(void)
     795              : {
     796              :     TimestampTz ltime;
     797              : 
     798              :     Assert(InHotStandby);
     799              : 
     800            9 :     ltime = GetStandbyLimitTime();
     801              : 
     802            9 :     if (GetCurrentTimestamp() >= ltime && ltime != 0)
     803              :     {
     804              :         /*
     805              :          * We're already behind, so clear a path as quickly as possible.
     806              :          */
     807            1 :         SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN);
     808              :     }
     809              :     else
     810              :     {
     811              :         /*
     812              :          * Wake up at ltime, and check for deadlocks as well if we will be
     813              :          * waiting longer than deadlock_timeout
     814              :          */
     815              :         EnableTimeoutParams timeouts[2];
     816            8 :         int         cnt = 0;
     817              : 
     818            8 :         if (ltime != 0)
     819              :         {
     820            8 :             timeouts[cnt].id = STANDBY_TIMEOUT;
     821            8 :             timeouts[cnt].type = TMPARAM_AT;
     822            8 :             timeouts[cnt].fin_time = ltime;
     823            8 :             cnt++;
     824              :         }
     825              : 
     826            8 :         got_standby_deadlock_timeout = false;
     827            8 :         timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
     828            8 :         timeouts[cnt].type = TMPARAM_AFTER;
     829            8 :         timeouts[cnt].delay_ms = DeadlockTimeout;
     830            8 :         cnt++;
     831              : 
     832            8 :         enable_timeouts(timeouts, cnt);
     833              :     }
     834              : 
     835              :     /*
     836              :      * Wait to be signaled by UnpinBuffer() or for the wait to be interrupted
     837              :      * by one of the timeouts established above.
     838              :      *
     839              :      * We assume that only UnpinBuffer() and the timeout requests established
     840              :      * above can wake us up here. WakeupRecovery() called by walreceiver or
     841              :      * the SIGHUP signal handler, etc. cannot do that because it uses a
     842              :      * different latch from the one that ProcWaitForSignal() waits on.
     843              :      */
     844            9 :     ProcWaitForSignal(WAIT_EVENT_BUFFER_CLEANUP);
     845              : 
     846            9 :     if (got_standby_delay_timeout)
     847            1 :         SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN);
     848            8 :     else if (got_standby_deadlock_timeout)
     849              :     {
     850              :         /*
     851              :          * Send out a request for hot-standby backends to check themselves for
     852              :          * deadlocks.
     853              :          *
     854              :          * XXX The subsequent ResolveRecoveryConflictWithBufferPin() will wait
     855              :          * to be signaled by UnpinBuffer() again and send a request for
     856              :          * deadlocks check if deadlock_timeout happens. This causes the
     857              :          * request to continue to be sent every deadlock_timeout until the
     858              :          * buffer is unpinned or ltime is reached. This would increase the
     859              :          * workload in the startup process and backends. In practice it may
     860              :          * not be so harmful because the period that the buffer is kept pinned
     861              :          * is basically not so long. But should we fix this?
     862              :          */
     863            5 :         SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK);
     864              :     }
     865              : 
     866              :     /*
     867              :      * Clear any timeout requests established above.  We assume here that the
     868              :      * Startup process doesn't have any other timeouts than what this function
     869              :      * uses.  If that stops being true, we could cancel the timeouts
     870              :      * individually, but that'd be slower.
     871              :      */
     872            9 :     disable_all_timeouts(false);
     873            9 :     got_standby_delay_timeout = false;
     874            9 :     got_standby_deadlock_timeout = false;
     875            9 : }
     876              : 
     877              : static void
     878            7 : SendRecoveryConflictWithBufferPin(RecoveryConflictReason reason)
     879              : {
     880              :     Assert(reason == RECOVERY_CONFLICT_BUFFERPIN ||
     881              :            reason == RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK);
     882              : 
     883              :     /*
     884              :      * We send signal to all backends to ask them if they are holding the
     885              :      * buffer pin which is delaying the Startup process. Most of them will be
     886              :      * innocent, but we let the SIGUSR1 handling in each backend decide their
     887              :      * own fate.
     888              :      */
     889            7 :     SignalRecoveryConflictWithDatabase(InvalidOid, reason);
     890            7 : }
     891              : 
     892              : /*
     893              :  * In Hot Standby perform early deadlock detection.  We abort the lock
     894              :  * wait if we are about to sleep while holding the buffer pin that Startup
     895              :  * process is waiting for.
     896              :  *
     897              :  * Note: this code is pessimistic, because there is no way for it to
     898              :  * determine whether an actual deadlock condition is present: the lock we
     899              :  * need to wait for might be unrelated to any held by the Startup process.
     900              :  * Sooner or later, this mechanism should get ripped out in favor of somehow
     901              :  * accounting for buffer locks in DeadLockCheck().  However, errors here
     902              :  * seem to be very low-probability in practice, so for now it's not worth
     903              :  * the trouble.
     904              :  */
     905              : void
     906            1 : CheckRecoveryConflictDeadlock(void)
     907              : {
     908              :     Assert(!InRecovery);        /* do not call in Startup process */
     909              : 
     910            1 :     if (!HoldingBufferPinThatDelaysRecovery())
     911            1 :         return;
     912              : 
     913              :     /*
     914              :      * Error message should match ProcessInterrupts() but we avoid calling
     915              :      * that because we aren't handling an interrupt at this point. Note that
     916              :      * we only cancel the current transaction here, so if we are in a
     917              :      * subtransaction and the pin is held by a parent, then the Startup
     918              :      * process will continue to wait even though we have avoided deadlock.
     919              :      */
     920            0 :     ereport(ERROR,
     921              :             (errcode(ERRCODE_T_R_DEADLOCK_DETECTED),
     922              :              errmsg("canceling statement due to conflict with recovery"),
     923              :              errdetail("User transaction caused buffer deadlock with recovery.")));
     924              : }
     925              : 
     926              : 
     927              : /* --------------------------------
     928              :  *      timeout handler routines
     929              :  * --------------------------------
     930              :  */
     931              : 
     932              : /*
     933              :  * StandbyDeadLockHandler() will be called if STANDBY_DEADLOCK_TIMEOUT is
     934              :  * exceeded.
     935              :  */
     936              : void
     937            7 : StandbyDeadLockHandler(void)
     938              : {
     939            7 :     got_standby_deadlock_timeout = true;
     940            7 : }
     941              : 
     942              : /*
     943              :  * StandbyTimeoutHandler() will be called if STANDBY_TIMEOUT is exceeded.
     944              :  */
     945              : void
     946            1 : StandbyTimeoutHandler(void)
     947              : {
     948            1 :     got_standby_delay_timeout = true;
     949            1 : }
     950              : 
     951              : /*
     952              :  * StandbyLockTimeoutHandler() will be called if STANDBY_LOCK_TIMEOUT is exceeded.
     953              :  */
     954              : void
     955            1 : StandbyLockTimeoutHandler(void)
     956              : {
     957            1 :     got_standby_lock_timeout = true;
     958            1 : }
     959              : 
     960              : /*
     961              :  * -----------------------------------------------------
     962              :  * Locking in Recovery Mode
     963              :  * -----------------------------------------------------
     964              :  *
     965              :  * All locks are held by the Startup process using a single virtual
     966              :  * transaction. This implementation is both simpler and in some senses,
     967              :  * more correct. The locks held mean "some original transaction held
     968              :  * this lock, so query access is not allowed at this time". So the Startup
     969              :  * process is the proxy by which the original locks are implemented.
     970              :  *
     971              :  * We only keep track of AccessExclusiveLocks, which are only ever held by
     972              :  * one transaction on one relation.
     973              :  *
     974              :  * We keep a table of known locks in the RecoveryLockHash hash table.
     975              :  * The point of that table is to let us efficiently de-duplicate locks,
     976              :  * which is important because checkpoints will re-report the same locks
     977              :  * already held.  There is also a RecoveryLockXidHash table with one entry
     978              :  * per xid, which allows us to efficiently find all the locks held by a
     979              :  * given original transaction.
     980              :  *
     981              :  * We use session locks rather than normal locks so we don't need
     982              :  * ResourceOwners.
     983              :  */
     984              : 
     985              : 
     986              : void
     987        27567 : StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
     988              : {
     989              :     RecoveryLockXidEntry *xidentry;
     990              :     RecoveryLockEntry *lockentry;
     991              :     xl_standby_lock key;
     992              :     LOCKTAG     locktag;
     993              :     bool        found;
     994              : 
     995              :     /* Already processed? */
     996        55134 :     if (!TransactionIdIsValid(xid) ||
     997        55119 :         TransactionIdDidCommit(xid) ||
     998        27552 :         TransactionIdDidAbort(xid))
     999           15 :         return;
    1000              : 
    1001        27552 :     elog(DEBUG4, "adding recovery lock: db %u rel %u", dbOid, relOid);
    1002              : 
    1003              :     /* dbOid is InvalidOid when we are locking a shared relation. */
    1004              :     Assert(OidIsValid(relOid));
    1005              : 
    1006              :     /* Create a hash entry for this xid, if we don't have one already. */
    1007        27552 :     xidentry = hash_search(RecoveryLockXidHash, &xid, HASH_ENTER, &found);
    1008        27552 :     if (!found)
    1009              :     {
    1010              :         Assert(xidentry->xid == xid);    /* dynahash should have set this */
    1011        11344 :         xidentry->head = NULL;
    1012              :     }
    1013              : 
    1014              :     /* Create a hash entry for this lock, unless we have one already. */
    1015        27552 :     key.xid = xid;
    1016        27552 :     key.dbOid = dbOid;
    1017        27552 :     key.relOid = relOid;
    1018        27552 :     lockentry = hash_search(RecoveryLockHash, &key, HASH_ENTER, &found);
    1019        27552 :     if (!found)
    1020              :     {
    1021              :         /* It's new, so link it into the XID's list ... */
    1022        26471 :         lockentry->next = xidentry->head;
    1023        26471 :         xidentry->head = lockentry;
    1024              : 
    1025              :         /* ... and acquire the lock locally. */
    1026        26471 :         SET_LOCKTAG_RELATION(locktag, dbOid, relOid);
    1027              : 
    1028        26471 :         (void) LockAcquire(&locktag, AccessExclusiveLock, true, false);
    1029              :     }
    1030              : }
    1031              : 
    1032              : /*
    1033              :  * Release all the locks associated with this RecoveryLockXidEntry.
                      :  *
                      :  * Walks the list starting at xidentry->head.  For each element the
                      :  * AccessExclusiveLock on (dbOid, relOid) is released and the element is
                      :  * removed from RecoveryLockHash.  The xidentry itself is not removed from
                      :  * its hash table here; the callers in this file do that afterwards.
    1034              :  */
    1035              : static void
    1036        11344 : StandbyReleaseXidEntryLocks(RecoveryLockXidEntry *xidentry)
    1037              : {
    1038              :     RecoveryLockEntry *entry;
    1039              :     RecoveryLockEntry *next;
    1040              : 
    1041        37815 :     for (entry = xidentry->head; entry != NULL; entry = next)
    1042              :     {
    1043              :         LOCKTAG     locktag;
    1044              : 
    1045        26471 :         elog(DEBUG4,
    1046              :              "releasing recovery lock: xid %u db %u rel %u",
    1047              :              entry->key.xid, entry->key.dbOid, entry->key.relOid);
    1048              :         /* Release the lock ... */
    1049        26471 :         SET_LOCKTAG_RELATION(locktag, entry->key.dbOid, entry->key.relOid);
    1050        26471 :         if (!LockRelease(&locktag, AccessExclusiveLock, true))
    1051              :         {
    1052            0 :             elog(LOG,
    1053              :                  "RecoveryLockHash contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
    1054              :                  entry->key.xid, entry->key.dbOid, entry->key.relOid);
    1055              :             Assert(false);  /* hash table and lock manager disagree */
    1056              :         }
    1057              :         /* ... and remove the per-lock hash entry (fetch its successor first) */
    1058        26471 :         next = entry->next;
    1059        26471 :         hash_search(RecoveryLockHash, entry, HASH_REMOVE, NULL);
    1060              :     }
    1061              : 
    1062        11344 :     xidentry->head = NULL;       /* just for paranoia */
    1063        11344 : }
    1064              : 
    1065              : /*
    1066              :  * Release locks for a specific XID, or all locks if xid is not a valid
                      :  * XID (i.e. InvalidTransactionId).
                      :  *
                      :  * A valid XID that has no entry in RecoveryLockXidHash is silently
                      :  * ignored.
    1067              :  */
    1068              : static void
    1069        12024 : StandbyReleaseLocks(TransactionId xid)
    1070              : {
    1071              :     RecoveryLockXidEntry *entry;
    1072              : 
    1073        12024 :     if (TransactionIdIsValid(xid))
    1074              :     {
    1075        12024 :         if ((entry = hash_search(RecoveryLockXidHash, &xid, HASH_FIND, NULL)))
    1076              :         {
    1077        11344 :             StandbyReleaseXidEntryLocks(entry); /* per-lock entries */
    1078        11344 :             hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL); /* per-XID entry */
    1079              :         }
    1080              :     }
    1081              :     else
    1082            0 :         StandbyReleaseAllLocks();
    1083        12024 : }
    1084              : 
    1085              : /*
    1086              :  * Release locks for a transaction tree from RecoveryLockXidHash: first
    1087              :  * those of xid itself, then those of each of the nsubxids entries of subxids[].
    1088              :  *
    1089              :  * Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode,
    1090              :  * to remove any AccessExclusiveLocks requested by a transaction.
    1091              :  */
    1092              : void
    1093        11524 : StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids)
    1094              : {
    1095              :     int         i;
    1096              : 
    1097        11524 :     StandbyReleaseLocks(xid);
    1098              : 
    1099        12024 :     for (i = 0; i < nsubxids; i++)
    1100          500 :         StandbyReleaseLocks(subxids[i]);
    1101        11524 : }
    1102              : 
    1103              : /*
    1104              :  * Called at end of recovery and when we see a shutdown checkpoint.
                      :  *
                      :  * Scans RecoveryLockXidHash; for every entry found, its locks are
                      :  * released via StandbyReleaseXidEntryLocks() and the entry is then
                      :  * removed from RecoveryLockXidHash.
    1105              :  */
    1106              : void
    1107          115 : StandbyReleaseAllLocks(void)
    1108              : {
    1109              :     HASH_SEQ_STATUS status;
    1110              :     RecoveryLockXidEntry *entry;
    1111              : 
    1112          115 :     elog(DEBUG2, "release all standby locks");
    1113              : 
    1114          115 :     hash_seq_init(&status, RecoveryLockXidHash);
    1115          115 :     while ((entry = hash_seq_search(&status)))
    1116              :     {
    1117            0 :         StandbyReleaseXidEntryLocks(entry);
    1118            0 :         hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
    1119              :     }
    1120          115 : }
    1121              : 
    1122              : /*
    1123              :  * StandbyReleaseOldLocks
    1124              :  *      Release standby locks held by top-level XIDs that aren't running,
    1125              :  *      as long as they're not prepared transactions.
    1126              :  *
    1127              :  * This is needed to prune the locks of crashed transactions, which didn't
    1128              :  * write an ABORT/COMMIT record.
                      :  *
                      :  * oldxid is the cutoff: only entries whose XID precedes oldxid (per
                      :  * TransactionIdPrecedes) are released; all others are left in place.
    1129              :  */
    1130              : void
    1131          834 : StandbyReleaseOldLocks(TransactionId oldxid)
    1132              : {
    1133              :     HASH_SEQ_STATUS status;
    1134              :     RecoveryLockXidEntry *entry;
    1135              : 
    1136          834 :     hash_seq_init(&status, RecoveryLockXidHash);
    1137         1135 :     while ((entry = hash_seq_search(&status)))
    1138              :     {
    1139              :         Assert(TransactionIdIsValid(entry->xid));
    1140              : 
    1141              :         /* Skip if prepared transaction. */
    1142          301 :         if (StandbyTransactionIdIsPrepared(entry->xid))
    1143            0 :             continue;
    1144              : 
    1145              :         /* Skip if >= oldxid, i.e. entry->xid does not precede the cutoff. */
    1146          301 :         if (!TransactionIdPrecedes(entry->xid, oldxid))
    1147          301 :             continue;
    1148              : 
    1149              :         /* Remove all locks and hash table entry. */
    1150            0 :         StandbyReleaseXidEntryLocks(entry);
    1151            0 :         hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
    1152              :     }
    1153          834 : }
    1154              : 
    1155              : /*
    1156              :  * --------------------------------------------------------------------
    1157              :  *      Recovery handling for Rmgr RM_STANDBY_ID
    1158              :  *
    1159              :  * These record types will only be created if XLogStandbyInfoActive()
    1160              :  * --------------------------------------------------------------------
    1161              :  */
    1162              : 
    1163              : void
    1164        28241 : standby_redo(XLogReaderState *record)
    1165              : {
    1166        28241 :     uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; /* mask off the XLR_INFO_MASK bits */
    1167              : 
    1168              :     /* Backup blocks are not used in standby records */
    1169              :     Assert(!XLogRecHasAnyBlockRefs(record));
    1170              : 
    1171              :     /* Do nothing if we're not in hot standby mode */
    1172        28241 :     if (standbyState == STANDBY_DISABLED)
    1173          154 :         return;
    1174              : 
    1175        28087 :     if (info == XLOG_STANDBY_LOCK)  /* acquire every lock listed in the record */
    1176              :     {
    1177        26643 :         xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record);
    1178              :         int         i;
    1179              : 
    1180        54210 :         for (i = 0; i < xlrec->nlocks; i++)
    1181        27567 :             StandbyAcquireAccessExclusiveLock(xlrec->locks[i].xid,
    1182              :                                               xlrec->locks[i].dbOid,
    1183              :                                               xlrec->locks[i].relOid);
    1184              :     }
    1185         1444 :     else if (info == XLOG_RUNNING_XACTS)    /* rebuild RunningTransactionsData and apply it */
    1186              :     {
    1187          768 :         xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record);
    1188              :         RunningTransactionsData running;
    1189              : 
    1190          768 :         running.xcnt = xlrec->xcnt;
    1191          768 :         running.subxcnt = xlrec->subxcnt;
    1192          768 :         running.subxid_status = xlrec->subxid_overflow ? SUBXIDS_MISSING : SUBXIDS_IN_ARRAY;
    1193          768 :         running.nextXid = xlrec->nextXid;
    1194          768 :         running.latestCompletedXid = xlrec->latestCompletedXid;
    1195          768 :         running.oldestRunningXid = xlrec->oldestRunningXid;
    1196          768 :         running.xids = xlrec->xids; /* points into the record data, not copied */
    1197              : 
    1198          768 :         ProcArrayApplyRecoveryInfo(&running);
    1199              : 
    1200              :         /*
    1201              :          * The startup process currently has no convenient way to schedule
    1202              :          * stats to be reported. XLOG_RUNNING_XACTS records are issued at a
    1203              :          * regular cadence, making this a convenient location to report stats.
    1204              :          * While these records aren't generated with wal_level=minimal, stats
    1205              :          * also cannot be accessed during WAL replay.
    1206              :          */
    1207          768 :         pgstat_report_stat(true);
    1208              :     }
    1209          676 :     else if (info == XLOG_INVALIDATIONS)    /* replay the logged invalidation messages */
    1210              :     {
    1211          676 :         xl_invalidations *xlrec = (xl_invalidations *) XLogRecGetData(record);
    1212              : 
    1213          676 :         ProcessCommittedInvalidationMessages(xlrec->msgs,
    1214              :                                              xlrec->nmsgs,
    1215          676 :                                              xlrec->relcacheInitFileInval,
    1216              :                                              xlrec->dbId,
    1217              :                                              xlrec->tsId);
    1218              :     }
    1219              :     else
    1220            0 :         elog(PANIC, "standby_redo: unknown op code %u", info);
    1221              : }
    1222              : 
    1223              : /*
    1224              :  * Log details of the current snapshot to WAL. This allows the snapshot state
    1225              :  * to be reconstructed on the standby and for logical decoding.
    1226              :  *
    1227              :  * This is used for Hot Standby as follows:
    1228              :  *
    1229              :  * We can move directly to STANDBY_SNAPSHOT_READY at startup if we
    1230              :  * start from a shutdown checkpoint because we know nothing was running
    1231              :  * at that time and our recovery snapshot is known empty. In the more
    1232              :  * typical case of an online checkpoint we need to jump through a few
    1233              :  * hoops to get a correct recovery snapshot and this requires a two or
    1234              :  * sometimes a three stage process.
    1235              :  *
    1236              :  * The initial snapshot must contain all running xids and all current
    1237              :  * AccessExclusiveLocks at a point in time on the standby. Assembling
    1238              :  * that information while the server is running requires many and
    1239              :  * various LWLocks, so we choose to derive that information piece by
    1240              :  * piece and then re-assemble that info on the standby. When that
    1241              :  * information is fully assembled we move to STANDBY_SNAPSHOT_READY.
    1242              :  *
    1243              :  * Since locking on the primary when we derive the information is not
    1244              :  * strict, we note that there is a time window between the derivation and
    1245              :  * writing to WAL of the derived information. That allows race conditions
    1246              :  * that we must resolve, since xids and locks may enter or leave the
    1247              :  * snapshot during that window. This creates the issue that an xid or
    1248              :  * lock may start *after* the snapshot has been derived yet *before* the
    1249              :  * snapshot is logged in the running xacts WAL record. We resolve this by
    1250              :  * starting to accumulate changes at a point just prior to when we derive
    1251              :  * the snapshot on the primary, then ignore duplicates when we later apply
    1252              :  * the snapshot from the running xacts record. This is implemented during
    1253              :  * CreateCheckPoint() where we use the logical checkpoint location as
    1254              :  * our starting point and then write the running xacts record immediately
    1255              :  * before writing the main checkpoint WAL record. Since we always start
    1256              :  * up from a checkpoint and are immediately at our starting point, we
    1257              :  * unconditionally move to STANDBY_INITIALIZED. After this point we
    1258              :  * must do 4 things:
    1259              :  *  * move shared nextXid forwards as we see new xids
    1260              :  *  * extend the clog and subtrans with each new xid
    1261              :  *  * keep track of uncommitted known assigned xids
    1262              :  *  * keep track of uncommitted AccessExclusiveLocks
    1263              :  *
    1264              :  * When we see a commit/abort we must remove known assigned xids and locks
    1265              :  * from the completing transaction. Attempted removals that cannot locate
    1266              :  * an entry are expected and must not cause an error when we are in state
    1267              :  * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and
    1268              :  * KnownAssignedXidsRemove().
    1269              :  *
    1270              :  * Later, when we apply the running xact data we must be careful to ignore
    1271              :  * transactions already committed, since those commits raced ahead when
    1272              :  * making WAL entries.
    1273              :  *
    1274              :  * For logical decoding only the running xacts information is needed;
    1275              :  * there's no need to look at the locking information, but it's logged anyway,
    1276              :  * as there's no independent knob to just enable logical decoding. For
    1277              :  * details of how this is used, check snapbuild.c's introductory comment.
    1278              :  *
    1279              :  *
    1280              :  * Returns the RecPtr of the last inserted record.
    1281              :  */
    1282              : XLogRecPtr
    1283         1461 : LogStandbySnapshot(void)
    1284              : {
    1285              :     XLogRecPtr  recptr;
    1286              :     RunningTransactions running;
    1287              :     xl_standby_lock *locks;
    1288              :     int         nlocks;
    1289         1461 :     bool        logical_decoding_enabled = IsLogicalDecodingEnabled();
    1290              : 
    1291              :     Assert(XLogStandbyInfoActive());
    1292              : 
    1293              : #ifdef USE_INJECTION_POINTS
    1294         1461 :     if (IS_INJECTION_POINT_ATTACHED("skip-log-running-xacts"))
    1295              :     {
    1296              :         /*
    1297              :          * This record could move slot's xmin forward during decoding, leading
    1298              :          * to unpredictable results, so skip it when requested by the test.
    1299              :          * Nothing is logged in that case; the current insert pointer is returned.
    1300              :          */
    1300            1 :         return GetInsertRecPtr();
    1301              :     }
    1302              : #endif
    1303              : 
    1304              :     /*
    1305              :      * Get details of any AccessExclusiveLocks being held at the moment, and
    1306              :      * log them if there are any.
    1306              :      */
    1307         1460 :     locks = GetRunningTransactionLocks(&nlocks);
    1308         1460 :     if (nlocks > 0)
    1309          163 :         LogAccessExclusiveLocks(nlocks, locks);
    1310         1460 :     pfree(locks);   /* done with the array returned above */
    1311              : 
    1312              :     /*
    1313              :      * Log details of all in-progress transactions. This should be the last
    1314              :      * record we write, because standby will open up when it sees this.
    1315              :      */
    1316         1460 :     running = GetRunningTransactionData();
    1317              : 
    1318              :     /*
    1319              :      * GetRunningTransactionData() acquired ProcArrayLock, we must release it.
    1320              :      * For Hot Standby this can be done before inserting the WAL record
    1321              :      * because ProcArrayApplyRecoveryInfo() rechecks the commit status using
    1322              :      * the clog. For logical decoding, though, the lock can't be released
    1323              :      * early because the clog might be "in the future" from the POV of the
    1324              :      * historic snapshot. This would allow for situations where we're waiting
    1325              :      * for the end of a transaction listed in the xl_running_xacts record
    1326              :      * which, according to the WAL, has committed before the xl_running_xacts
    1327              :      * record. Fortunately this routine isn't executed frequently, and it's
    1328              :      * only a shared lock.
    1329              :      */
    1330         1460 :     if (!logical_decoding_enabled)
    1331          910 :         LWLockRelease(ProcArrayLock);
    1332              : 
    1333         1460 :     recptr = LogCurrentRunningXacts(running);
    1334              : 
    1335              :     /* Release lock if we kept it longer ... */
    1336         1460 :     if (logical_decoding_enabled)
    1337          550 :         LWLockRelease(ProcArrayLock);
    1338              : 
    1339              :     /* GetRunningTransactionData() acquired XidGenLock, we must release it */
    1340         1460 :     LWLockRelease(XidGenLock);
    1341              : 
    1342         1460 :     return recptr;
    1343              : }
    1344              : 
    1345              : /*
    1346              :  * Record an enhanced snapshot of running transactions into WAL.
    1347              :  *
    1348              :  * The definitions of RunningTransactionsData and xl_running_xacts are
    1349              :  * similar. We keep them separate because xl_running_xacts is a contiguous
    1350              :  * chunk of memory and never exists fully until it is assembled in WAL.
    1351              :  * The inserted records are marked as not being important for durability,
    1352              :  * to avoid triggering superfluous checkpoint / archiving activity.
                      :  *
                      :  * Returns the value returned by XLogInsert() for the XLOG_RUNNING_XACTS
                      :  * record.
    1353              :  */
    1354              : static XLogRecPtr
    1355         1460 : LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
    1356              : {
    1357              :     xl_running_xacts xlrec;
    1358              :     XLogRecPtr  recptr;
    1359              : 
    1360         1460 :     xlrec.xcnt = CurrRunningXacts->xcnt;
    1361         1460 :     xlrec.subxcnt = CurrRunningXacts->subxcnt;
    1362         1460 :     xlrec.subxid_overflow = (CurrRunningXacts->subxid_status != SUBXIDS_IN_ARRAY);
    1363         1460 :     xlrec.nextXid = CurrRunningXacts->nextXid;
    1364         1460 :     xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
    1365         1460 :     xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid;
    1366              : 
    1367              :     /* Header: the first MinSizeOfXactRunningXacts bytes of xlrec */
    1368         1460 :     XLogBeginInsert();
    1369         1460 :     XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
    1370         1460 :     XLogRegisterData(&xlrec, MinSizeOfXactRunningXacts);
    1371              : 
    1372              :     /* array of TransactionIds (xcnt + subxcnt entries), only if xcnt > 0 */
    1373         1460 :     if (xlrec.xcnt > 0)
    1374          475 :         XLogRegisterData(CurrRunningXacts->xids,
    1375          475 :                          (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId));
    1376              : 
    1377         1460 :     recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS);
    1378              : 
    1379         1460 :     if (xlrec.subxid_overflow)
    1380            1 :         elog(DEBUG2,
    1381              :              "snapshot of %d running transactions overflowed (lsn %X/%08X oldest xid %u latest complete %u next xid %u)",
    1382              :              CurrRunningXacts->xcnt,
    1383              :              LSN_FORMAT_ARGS(recptr),
    1384              :              CurrRunningXacts->oldestRunningXid,
    1385              :              CurrRunningXacts->latestCompletedXid,
    1386              :              CurrRunningXacts->nextXid);
    1387              :     else
    1388         1459 :         elog(DEBUG2,
    1389              :              "snapshot of %d+%d running transaction ids (lsn %X/%08X oldest xid %u latest complete %u next xid %u)",
    1390              :              CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt,
    1391              :              LSN_FORMAT_ARGS(recptr),
    1392              :              CurrRunningXacts->oldestRunningXid,
    1393              :              CurrRunningXacts->latestCompletedXid,
    1394              :              CurrRunningXacts->nextXid);
    1395              : 
    1396              :     /*
    1397              :      * Ensure running_xacts information is synced to disk not too far in the
    1398              :      * future. We don't want to stall anything though (i.e. use XLogFlush()),
    1399              :      * so we let the wal writer do it during normal operation.
    1400              :      * XLogSetAsyncXactLSN() conveniently will mark the LSN as to-be-synced
    1401              :      * and nudge the WALWriter into action if sleeping. Check
    1402              :      * XLogBackgroundFlush() for details why a record might not be flushed
    1403              :      * without it.
    1404              :      */
    1405         1460 :     XLogSetAsyncXactLSN(recptr);
    1406              : 
    1407         1460 :     return recptr;
    1408              : }
    1409              : 
    1410              : /*
    1411              :  * Wholesale logging of AccessExclusiveLocks. Other lock types need not be
    1412              :  * logged, as described in backend/storage/lmgr/README.
                      :  *
                      :  * Inserts one XLOG_STANDBY_LOCK record made of the xl_standby_locks header
                      :  * (up to, but not including, its locks[] member) followed by the nlocks
                      :  * entries of locks[].  The record is flagged XLOG_MARK_UNIMPORTANT.
    1413              :  */
    1414              : static void
    1415       136815 : LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks)
    1416              : {
    1417              :     xl_standby_locks xlrec;
    1418              : 
    1419       136815 :     xlrec.nlocks = nlocks;
    1420              : 
    1421       136815 :     XLogBeginInsert();
    1422       136815 :     XLogRegisterData(&xlrec, offsetof(xl_standby_locks, locks));
    1423       136815 :     XLogRegisterData(locks, nlocks * sizeof(xl_standby_lock));
    1424       136815 :     XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
    1425              : 
    1426       136815 :     (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK);
    1427       136815 : }
    1428              : 
    1429              : /*
    1430              :  * Individual logging of AccessExclusiveLocks for use during LockAcquire()
                      :  *
                      :  * Logs a single lock on (dbOid, relOid) under the XID returned by
                      :  * GetCurrentTransactionId(), then sets
                      :  * XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK in MyXactFlags.
    1431              :  */
    1432              : void
    1433       136652 : LogAccessExclusiveLock(Oid dbOid, Oid relOid)
    1434              : {
    1435              :     xl_standby_lock xlrec;
    1436              : 
    1437       136652 :     xlrec.xid = GetCurrentTransactionId();
    1438              : 
    1439       136652 :     xlrec.dbOid = dbOid;
    1440       136652 :     xlrec.relOid = relOid;
    1441              : 
    1442       136652 :     LogAccessExclusiveLocks(1, &xlrec);
    1443       136652 :     MyXactFlags |= XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK;
    1444       136652 : }
    1445              : 
    1446              : /*
    1447              :  * Prepare to log an AccessExclusiveLock, for use during LockAcquire()
                      :  *
                      :  * The only action is a call to GetCurrentTransactionId() whose result is
                      :  * deliberately discarded; see below for why the call is needed.
    1448              :  */
    1449              : void
    1450       136865 : LogAccessExclusiveLockPrepare(void)
    1451              : {
    1452              :     /*
    1453              :      * Ensure that a TransactionId has been assigned to this transaction, for
    1454              :      * two reasons, both related to lock release on the standby. First, we
    1455              :      * must assign an xid so that RecordTransactionCommit() and
    1456              :      * RecordTransactionAbort() do not optimise away the transaction
    1457              :      * completion record which recovery relies upon to release locks. It's a
    1458              :      * hack, but for a corner case not worth adding code for into the main
    1459              :      * commit path. Second, we must assign an xid before the lock is recorded
    1460              :      * in shared memory, otherwise a concurrently executing
    1461              :      * GetRunningTransactionLocks() might see a lock associated with an
    1462              :      * InvalidTransactionId which we later assert cannot happen.
    1463              :      */
    1464       136865 :     (void) GetCurrentTransactionId();
    1465       136865 : }
    1466              : 
    1467              : /*
    1468              :  * Emit WAL for invalidations. This currently is only used for commits without
    1469              :  * an xid but which contain invalidations.
                      :  *
                      :  * The XLOG_INVALIDATIONS record carries MyDatabaseId, MyDatabaseTableSpace,
                      :  * the relcacheInitFileInval flag, nmsgs, and then the nmsgs entries of
                      :  * msgs[].
    1470              :  */
    1471              : void
    1472        10355 : LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs,
    1473              :                         bool relcacheInitFileInval)
    1474              : {
    1475              :     xl_invalidations xlrec;
    1476              : 
    1477              :     /* prepare record, starting from an all-zero struct */
    1478        10355 :     memset(&xlrec, 0, sizeof(xlrec));
    1479        10355 :     xlrec.dbId = MyDatabaseId;
    1480        10355 :     xlrec.tsId = MyDatabaseTableSpace;
    1481        10355 :     xlrec.relcacheInitFileInval = relcacheInitFileInval;
    1482        10355 :     xlrec.nmsgs = nmsgs;
    1483              : 
    1484              :     /* perform insertion: fixed-size part first, then the message array */
    1485        10355 :     XLogBeginInsert();
    1486        10355 :     XLogRegisterData(&xlrec, MinSizeOfInvalidations);
    1487        10355 :     XLogRegisterData(msgs,
    1488              :                      nmsgs * sizeof(SharedInvalidationMessage));
    1489        10355 :     XLogInsert(RM_STANDBY_ID, XLOG_INVALIDATIONS);
    1490        10355 : }
    1491              : 
    1492              : /*
                      :  * Return the description of a recovery conflict.
                      :  *
                      :  * Every string is passed through _().  A reason that matches none of the
                      :  * cases below yields "unknown reason", the initial value of reasonDesc.
                      :  */
    1493              : static const char *
    1494           10 : get_recovery_conflict_desc(RecoveryConflictReason reason)
    1495              : {
    1496           10 :     const char *reasonDesc = _("unknown reason");
    1497              : 
    1498           10 :     switch (reason)     /* no default case; see the initial value above */
    1499              :     {
    1500            4 :         case RECOVERY_CONFLICT_BUFFERPIN:
    1501            4 :             reasonDesc = _("recovery conflict on buffer pin");
    1502            4 :             break;
    1503            2 :         case RECOVERY_CONFLICT_LOCK:
    1504            2 :             reasonDesc = _("recovery conflict on lock");
    1505            2 :             break;
    1506            2 :         case RECOVERY_CONFLICT_TABLESPACE:
    1507            2 :             reasonDesc = _("recovery conflict on tablespace");
    1508            2 :             break;
    1509            2 :         case RECOVERY_CONFLICT_SNAPSHOT:
    1510            2 :             reasonDesc = _("recovery conflict on snapshot");
    1511            2 :             break;
    1512            0 :         case RECOVERY_CONFLICT_LOGICALSLOT:
    1513            0 :             reasonDesc = _("recovery conflict on replication slot");
    1514            0 :             break;
    1515            0 :         case RECOVERY_CONFLICT_STARTUP_DEADLOCK:
    1516            0 :             reasonDesc = _("recovery conflict on deadlock");
    1517            0 :             break;
    1518            0 :         case RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK:
    1519            0 :             reasonDesc = _("recovery conflict on buffer deadlock");
    1520            0 :             break;
    1521            0 :         case RECOVERY_CONFLICT_DATABASE:
    1522            0 :             reasonDesc = _("recovery conflict on database");
    1523            0 :             break;
    1524              :     }
    1525              : 
    1526           10 :     return reasonDesc;
    1527              : }
        

Generated by: LCOV version 2.0-1