LCOV - code coverage report
Current view: top level - src/backend/access/transam - xlogrecovery.c (source / functions) Hit Total Coverage
Test: PostgreSQL 19devel Lines: 1242 1435 86.6 %
Date: 2026-01-16 06:16:24 Functions: 68 69 98.6 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * xlogrecovery.c
       4             :  *      Functions for WAL recovery, standby mode
       5             :  *
       6             :  * This source file contains functions controlling WAL recovery.
       7             :  * InitWalRecovery() initializes the system for crash or archive recovery,
       8             :  * or standby mode, depending on configuration options and the state of
       9             :  * the control file and possible backup label file.  PerformWalRecovery()
      10             :  * performs the actual WAL replay, calling the rmgr-specific redo routines.
      11             :  * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
      12             :  * and prepares information needed to initialize the WAL for writes.  In
      13             :  * addition to these three main functions, there are a bunch of functions
      14             :  * for interrogating recovery state and controlling the recovery process.
      15             :  *
      16             :  *
      17             :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      18             :  * Portions Copyright (c) 1994, Regents of the University of California
      19             :  *
      20             :  * src/backend/access/transam/xlogrecovery.c
      21             :  *
      22             :  *-------------------------------------------------------------------------
      23             :  */
      24             : 
      25             : #include "postgres.h"
      26             : 
      27             : #include <ctype.h>
      28             : #include <time.h>
      29             : #include <sys/stat.h>
      30             : #include <sys/time.h>
      31             : #include <unistd.h>
      32             : 
      33             : #include "access/timeline.h"
      34             : #include "access/transam.h"
      35             : #include "access/xact.h"
      36             : #include "access/xlog_internal.h"
      37             : #include "access/xlogarchive.h"
      38             : #include "access/xlogprefetcher.h"
      39             : #include "access/xlogreader.h"
      40             : #include "access/xlogrecovery.h"
      41             : #include "access/xlogutils.h"
      42             : #include "access/xlogwait.h"
      43             : #include "backup/basebackup.h"
      44             : #include "catalog/pg_control.h"
      45             : #include "commands/tablespace.h"
      46             : #include "common/file_utils.h"
      47             : #include "miscadmin.h"
      48             : #include "nodes/miscnodes.h"
      49             : #include "pgstat.h"
      50             : #include "postmaster/bgwriter.h"
      51             : #include "postmaster/startup.h"
      52             : #include "replication/slot.h"
      53             : #include "replication/slotsync.h"
      54             : #include "replication/walreceiver.h"
      55             : #include "storage/fd.h"
      56             : #include "storage/ipc.h"
      57             : #include "storage/latch.h"
      58             : #include "storage/pmsignal.h"
      59             : #include "storage/procarray.h"
      60             : #include "storage/spin.h"
      61             : #include "utils/datetime.h"
      62             : #include "utils/fmgrprotos.h"
      63             : #include "utils/guc_hooks.h"
      64             : #include "utils/pgstat_internal.h"
      65             : #include "utils/pg_lsn.h"
      66             : #include "utils/ps_status.h"
      67             : #include "utils/pg_rusage.h"
      68             : 
      69             : /* Unsupported old recovery command file names (relative to $PGDATA) */
      70             : #define RECOVERY_COMMAND_FILE   "recovery.conf"
      71             : #define RECOVERY_COMMAND_DONE   "recovery.done"
      72             : 
      73             : /*
      74             :  * GUC support
      75             :  */
      76             : const struct config_enum_entry recovery_target_action_options[] = {
      77             :     {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
      78             :     {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
      79             :     {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
      80             :     {NULL, 0, false}            /* presumably the end-of-list marker -- confirm */
      81             : };
      82             : 
      83             : /* options formerly taken from recovery.conf for archive recovery */
      84             : char       *recoveryRestoreCommand = NULL;
      85             : char       *recoveryEndCommand = NULL;
      86             : char       *archiveCleanupCommand = NULL;
      87             : RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
      88             : bool        recoveryTargetInclusive = true;
      89             : int         recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
      90             : TransactionId recoveryTargetXid;
      91             : char       *recovery_target_time_string;
      92             : TimestampTz recoveryTargetTime;
      93             : const char *recoveryTargetName;
      94             : XLogRecPtr  recoveryTargetLSN;
      95             : int         recovery_min_apply_delay = 0;
      96             : 
      97             : /* options formerly taken from recovery.conf for XLOG streaming */
      98             : char       *PrimaryConnInfo = NULL;
      99             : char       *PrimarySlotName = NULL;
     100             : bool        wal_receiver_create_temp_slot = false;
     101             : 
     102             : /*
     103             :  * recoveryTargetTimeLineGoal: what the user requested, if any
     104             :  *
     105             :  * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
     106             :  *
     107             :  * recoveryTargetTLI: the currently understood target timeline; may change
     108             :  *
     109             :  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
     110             :  * the timelines of its known parents, newest first (so recoveryTargetTLI is
     111             :  * always the first list member).  Only these TLIs are expected to be seen in
     112             :  * the WAL segments we read, and indeed only these TLIs will be considered as
     113             :  * candidate WAL files to open at all.
     114             :  *
     115             :  * curFileTLI: the TLI appearing in the name of the current input WAL file.
     116             :  * (This is not necessarily the same as the timeline from which we are
     117             :  * replaying WAL, which StartupXLOG calls replayTLI, because we could be
     118             :  * scanning data that was copied from an ancestor timeline when the current
     119             :  * file was created.)  During a sequential scan we do not allow this value
     120             :  * to decrease.
     121             :  */
     122             : RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
     123             : TimeLineID  recoveryTargetTLIRequested = 0;
     124             : TimeLineID  recoveryTargetTLI = 0;
     125             : static List *expectedTLEs;
     126             : static TimeLineID curFileTLI;
     127             : 
     128             : /*
     129             :  * When ArchiveRecoveryRequested is set, archive recovery was requested,
     130             :  * i.e. signal files were present.  When InArchiveRecovery is set, we are
     131             :  * currently recovering using offline XLOG archives.  These variables are only
     132             :  * valid in the startup process.
     133             :  *
     134             :  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
     135             :  * currently performing crash recovery using only XLOG files in pg_wal, but
     136             :  * will switch to using offline XLOG archives as soon as we reach the end of
     137             :  * WAL in pg_wal.
     138             :  */
     139             : bool        ArchiveRecoveryRequested = false;
     140             : bool        InArchiveRecovery = false;
     141             : 
     142             : /*
     143             :  * When StandbyModeRequested is set, standby mode was requested, i.e.
     144             :  * standby.signal file was present.  When StandbyMode is set, we are currently
     145             :  * in standby mode.  These variables are only valid in the startup process.
     146             :  * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
     147             :  */
     148             : static bool StandbyModeRequested = false;
     149             : bool        StandbyMode = false;
     150             : 
     151             : /* was a signal file present at startup? */
     152             : static bool standby_signal_file_found = false;
     153             : static bool recovery_signal_file_found = false;
     154             : 
     155             : /*
     156             :  * CheckPointLoc is the position of the checkpoint record that determines
     157             :  * where to start the replay.  It comes from the backup label file or the
     158             :  * control file.
     159             :  *
     160             :  * RedoStartLSN is the checkpoint's REDO location, also from the backup label
     161             :  * file or the control file.  In standby mode, XLOG streaming usually starts
     162             :  * from the position where an invalid record was found.  But if we fail to
     163             :  * read even the initial checkpoint record, we use the REDO location instead
     164             :  * of the checkpoint location as the start position of XLOG streaming.
     165             :  * Otherwise we would have to jump backwards to the REDO location after
     166             :  * reading the checkpoint record, because the REDO record can precede the
     167             :  * checkpoint record.
     168             :  */
     169             : static XLogRecPtr CheckPointLoc = InvalidXLogRecPtr;
     170             : static TimeLineID CheckPointTLI = 0;
     171             : static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
     172             : static TimeLineID RedoStartTLI = 0;
     173             : 
     174             : /*
     175             :  * Local copy of SharedHotStandbyActive variable. False actually means "not
     176             :  * known, need to check the shared state".
     177             :  */
     178             : static bool LocalHotStandbyActive = false;
     179             : 
     180             : /*
     181             :  * Local copy of SharedPromoteIsTriggered variable. False actually means "not
     182             :  * known, need to check the shared state".
     183             :  */
     184             : static bool LocalPromoteIsTriggered = false;
     185             : 
     186             : /* Has the recovery code requested a walreceiver wakeup? */
     187             : static bool doRequestWalReceiverReply;
     188             : 
     189             : /* XLogReader object used to parse the WAL records */
     190             : static XLogReaderState *xlogreader = NULL;
     191             : 
     192             : /* XLogPrefetcher object used to consume WAL records with read-ahead */
     193             : static XLogPrefetcher *xlogprefetcher = NULL;
     194             : 
     195             : /* Parameters passed down from ReadRecord to the XLogPageRead callback. */
     196             : typedef struct XLogPageReadPrivate
     197             : {
     198             :     int         emode;          /* emode argument given to ReadRecord */
     199             :     bool        fetching_ckpt;  /* are we fetching a checkpoint record? */
     200             :     bool        randAccess;     /* NOTE(review): presumably forwarded to WaitForWALToBecomeAvailable -- confirm */
     201             :     TimeLineID  replayTLI;      /* replayTLI argument given to ReadRecord */
     202             : } XLogPageReadPrivate;
     203             : 
     204             : /* flag to tell XLogPageRead that we have started replaying */
     205             : static bool InRedo = false;
     206             : 
     207             : /*
     208             :  * Codes indicating where we got a WAL file from during recovery, or where
     209             :  * to attempt to get one.
     210             :  */
     211             : typedef enum
     212             : {
     213             :     XLOG_FROM_ANY = 0,          /* request to read WAL from any source */
     214             :     XLOG_FROM_ARCHIVE,          /* restored using restore_command */
     215             :     XLOG_FROM_PG_WAL,           /* existing file in pg_wal */
     216             :     XLOG_FROM_STREAM,           /* streamed from primary */
     217             : } XLogSource;                   /* xlogSourceNames[] lists names in this same order */
     218             : 
     219             : /* human-readable names for XLogSources, for debugging output */
     220             : static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
     221             : 
     222             : /*
     223             :  * readFile is -1 or a kernel FD for the log file segment that's currently
     224             :  * open for reading.  readSegNo identifies the segment.  readOff is the offset
     225             :  * of the page just read, readLen indicates how much of it has been read into
     226             :  * readBuf, and readSource indicates where we got the currently open file from.
     227             :  *
     228             :  * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
     229             :  * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
     230             :  * worthwhile, since the XLOG is not read by general-purpose sessions.
     231             :  */
     232             : static int  readFile = -1;
     233             : static XLogSegNo readSegNo = 0;
     234             : static uint32 readOff = 0;
     235             : static uint32 readLen = 0;
     236             : static XLogSource readSource = XLOG_FROM_ANY;
     237             : 
     238             : /*
     239             :  * Keeps track of which source we're currently reading from. This is
     240             :  * different from readSource in that this is always set, even when we don't
     241             :  * currently have a WAL file open. If lastSourceFailed is set, our last
     242             :  * attempt to read from currentSource failed, and we should try another source
     243             :  * next.
     244             :  *
     245             :  * pendingWalRcvRestart is set when a config change occurs that requires a
     246             :  * walreceiver restart.  This is only valid in XLOG_FROM_STREAM state.
     247             :  */
     248             : static XLogSource currentSource = XLOG_FROM_ANY;
     249             : static bool lastSourceFailed = false;
     250             : static bool pendingWalRcvRestart = false;
     251             : 
     252             : /*
     253             :  * These variables track when we last obtained some WAL data to process,
     254             :  * and where we got it from.  (XLogReceiptSource is initially the same as
     255             :  * readSource, but readSource gets reset to zero when we don't have data
     256             :  * to process right now.  It is also different from currentSource, which
     257             :  * also changes when we try to read from a source and fail, while
     258             :  * XLogReceiptSource tracks where we last successfully read some WAL.)
     259             :  */
     260             : static TimestampTz XLogReceiptTime = 0;
     261             : static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
     262             : 
     263             : /* Local copy of WalRcv->flushedUpto */
     264             : static XLogRecPtr flushedUpto = 0;
     265             : static TimeLineID receiveTLI = 0;
     266             : 
     267             : /*
     268             :  * Copy of minRecoveryPoint and backupEndPoint from the control file.
     269             :  *
     270             :  * In order to reach consistency, we must replay the WAL up to
     271             :  * minRecoveryPoint.  If backupEndRequired is true, we must also reach
     272             :  * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
     273             :  * to backupStartPoint.
     274             :  *
     275             :  * Note: In archive recovery, after consistency has been reached, the
     276             :  * functions in xlog.c will start updating minRecoveryPoint in the control
     277             :  * file.  But this copy of minRecoveryPoint variable reflects the value at the
     278             :  * beginning of recovery, and is *not* updated after consistency is reached.
     279             :  */
     280             : static XLogRecPtr minRecoveryPoint;
     281             : static TimeLineID minRecoveryPointTLI;
     282             : 
     283             : static XLogRecPtr backupStartPoint;
     284             : static XLogRecPtr backupEndPoint;
     285             : static bool backupEndRequired = false;
     286             : 
     287             : /*
     288             :  * Have we reached a consistent database state?  In crash recovery, we have
     289             :  * to replay all the WAL, so reachedConsistency is never set.  During archive
     290             :  * recovery, the database is consistent once minRecoveryPoint is reached.
     291             :  *
     292             :  * Consistent state means that the system is internally consistent, all
     293             :  * the WAL has been replayed up to a certain point, and importantly, there
     294             :  * is no trace of later actions on disk.
     295             :  *
     296             :  * This flag is used only by the startup process and postmaster. When
     297             :  * minRecoveryPoint is reached, the startup process sets it to true and
     298             :  * sends a PMSIGNAL_RECOVERY_CONSISTENT signal to the postmaster,
     299             :  * which then sets its own copy to true upon receiving the signal.
     300             :  */
     301             : bool        reachedConsistency = false;
     302             : 
     303             : /* Buffers of size BLCKSZ dedicated to consistency checks */
     304             : static char *replay_image_masked = NULL;
     305             : static char *primary_image_masked = NULL;
     306             : 
     307             : 
     308             : /*
     309             :  * Shared-memory state for WAL recovery.
     310             :  */
     311             : typedef struct XLogRecoveryCtlData
     312             : {
     313             :     /*
     314             :      * SharedHotStandbyActive indicates if we allow hot standby queries to be
     315             :      * run.  Protected by info_lck.
     316             :      */
     317             :     bool        SharedHotStandbyActive;
     318             : 
     319             :     /*
     320             :      * SharedPromoteIsTriggered indicates if a standby promotion has been
     321             :      * triggered.  Protected by info_lck.
     322             :      */
     323             :     bool        SharedPromoteIsTriggered;
     324             : 
     325             :     /*
     326             :      * recoveryWakeupLatch is used to wake up the startup process to continue
     327             :      * WAL replay, if it is waiting for WAL to arrive or promotion to be
     328             :      * requested.
     329             :      *
     330             :      * Note that the startup process also uses another latch, its procLatch,
     331             :      * to wait for recovery conflict. We could get rid of recoveryWakeupLatch
     332             :      * for signaling the startup process in favor of using its procLatch, which
     333             :      * would comport better with possible generic signal handlers using that
     334             :      * latch. But we should not do that, because the startup process doesn't
     335             :      * expect to be woken up by the walreceiver process or the SIGHUP signal
     336             :      * handler while it's waiting for recovery conflict. The separate latches,
     337             :      * recoveryWakeupLatch and procLatch, should be used for inter-process
     338             :      * communication for WAL replay and recovery conflict, respectively.
     339             :      */
     340             :     Latch       recoveryWakeupLatch;
     341             : 
     342             :     /*
     343             :      * Last record successfully replayed.
     344             :      */
     345             :     XLogRecPtr  lastReplayedReadRecPtr; /* start position */
     346             :     XLogRecPtr  lastReplayedEndRecPtr;  /* end+1 position */
     347             :     TimeLineID  lastReplayedTLI;    /* timeline */
     348             : 
     349             :     /*
     350             :      * When we're currently replaying a record, i.e. in a redo function,
     351             :      * replayEndRecPtr points to the end+1 of the record being replayed,
     352             :      * otherwise it's equal to lastReplayedEndRecPtr.
     353             :      */
     354             :     XLogRecPtr  replayEndRecPtr;
     355             :     TimeLineID  replayEndTLI;   /* presumably the timeline of replayEndRecPtr -- confirm */
     356             :     /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
     357             :     TimestampTz recoveryLastXTime;
     358             : 
     359             :     /*
     360             :      * timestamp of when we started replaying the current chunk of WAL data,
     361             :      * only relevant for replication or archive recovery
     362             :      */
     363             :     TimestampTz currentChunkStartTime;
     364             :     /* Recovery pause state */
     365             :     RecoveryPauseState recoveryPauseState;
     366             :     ConditionVariable recoveryNotPausedCV;
     367             : 
     368             :     slock_t     info_lck;       /* locks shared variables shown above */
     369             : } XLogRecoveryCtlData;
     370             : 
     371             : static XLogRecoveryCtlData *XLogRecoveryCtl = NULL;
     372             : 
     373             : /*
     374             :  * abortedRecPtr is the start pointer of a broken record at end of WAL when
     375             :  * recovery completes; missingContrecPtr is the location of the first
     376             :  * contrecord that went missing.  See CreateOverwriteContrecordRecord for
     377             :  * details.
     378             :  */
     379             : static XLogRecPtr abortedRecPtr;
     380             : static XLogRecPtr missingContrecPtr;
     381             : 
     382             : /*
     383             :  * If recoveryStopsBefore() or recoveryStopsAfter() returns true, it saves
     384             :  * information about the stop point in these variables.
     385             :  */
     386             : static TransactionId recoveryStopXid;
     387             : static TimestampTz recoveryStopTime;
     388             : static XLogRecPtr recoveryStopLSN;
     389             : static char recoveryStopName[MAXFNAMELEN];
     390             : static bool recoveryStopAfter;
     391             : 
     392             : /* prototypes for local functions */
     393             : static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
     394             : 
     395             : static void EnableStandbyMode(void);
     396             : static void readRecoverySignalFile(void);
     397             : static void validateRecoveryParameters(void);
     398             : static bool read_backup_label(XLogRecPtr *checkPointLoc,
     399             :                               TimeLineID *backupLabelTLI,
     400             :                               bool *backupEndRequired, bool *backupFromStandby);
     401             : static bool read_tablespace_map(List **tablespaces);
     402             : 
     403             : static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
     404             : static void CheckRecoveryConsistency(void);
     405             : static void rm_redo_error_callback(void *arg);
     406             : #ifdef WAL_DEBUG
     407             : static void xlog_outrec(StringInfo buf, XLogReaderState *record);
     408             : #endif
     409             : static void xlog_block_info(StringInfo buf, XLogReaderState *record);
     410             : static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
     411             :                                 TimeLineID prevTLI, TimeLineID replayTLI);
     412             : static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
     413             : static void verifyBackupPageConsistency(XLogReaderState *record);
     414             : 
     415             : static bool recoveryStopsBefore(XLogReaderState *record);
     416             : static bool recoveryStopsAfter(XLogReaderState *record);
     417             : static char *getRecoveryStopReason(void);
     418             : static void recoveryPausesHere(bool endOfRecovery);
     419             : static bool recoveryApplyDelay(XLogReaderState *record);
     420             : static void ConfirmRecoveryPaused(void);
     421             : 
     422             : static XLogRecord *ReadRecord(XLogPrefetcher *xlogprefetcher,
     423             :                               int emode, bool fetching_ckpt,
     424             :                               TimeLineID replayTLI);
     425             : 
     426             : static int  XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
     427             :                          int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
     428             : static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,
     429             :                                                       bool randAccess,
     430             :                                                       bool fetching_ckpt,
     431             :                                                       XLogRecPtr tliRecPtr,
     432             :                                                       TimeLineID replayTLI,
     433             :                                                       XLogRecPtr replayLSN,
     434             :                                                       bool nonblocking);
     435             : static int  emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
     436             : static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher,
     437             :                                         XLogRecPtr RecPtr, TimeLineID replayTLI);
     438             : static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
     439             : static int  XLogFileRead(XLogSegNo segno, TimeLineID tli,
     440             :                          XLogSource source, bool notfoundOk);
     441             : static int  XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source);
     442             : 
     443             : static bool CheckForStandbyTrigger(void);
     444             : static void SetPromoteIsTriggered(void);
     445             : static bool HotStandbyActiveInReplay(void);
     446             : 
     447             : static void SetCurrentChunkStartTime(TimestampTz xtime);
     448             : static void SetLatestXTime(TimestampTz xtime);
     449             : 
     450             : /*
     451             :  * Initialization of shared memory for WAL recovery
     452             :  */
     453             : Size
     454        6510 : XLogRecoveryShmemSize(void)
     455             : {
     456             :     Size        size;
     457             : 
     458             :     /* Space for XLogRecoveryCtl: one XLogRecoveryCtlData, nothing else */
     459        6510 :     size = sizeof(XLogRecoveryCtlData);
     460             : 
     461        6510 :     return size;
     462             : }
     463             : 
     464             : void
     465        2272 : XLogRecoveryShmemInit(void)
     466             : {
     467             :     bool        found;          /* output flag from ShmemInitStruct() */
     468             :     /* Set up XLogRecoveryCtl; its fields are initialized only if !found. */
     469        2272 :     XLogRecoveryCtl = (XLogRecoveryCtlData *)
     470        2272 :         ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
     471        2272 :     if (found)
     472           0 :         return;                 /* found is set: skip the initialization below */
     473        2272 :     memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));    /* start with every field zeroed */
     474             : 
     475        2272 :     SpinLockInit(&XLogRecoveryCtl->info_lck);
     476        2272 :     InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
     477        2272 :     ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV);
     478             : }
     479             : 
     480             : /*
     481             :  * Enter standby mode: set the StandbyMode flag and turn off the startup
     482             :  * progress timeout (the comment in the body explains why).
     483             :  */
     484             : static void
     485         224 : EnableStandbyMode(void)
     486             : {
     487         224 :     StandbyMode = true;
     488             : 
     489             :     /*
     490             :      * To avoid server log bloat, we don't report recovery progress in a
     491             :      * standby as it will always be in recovery unless promoted. We disable
     492             :      * startup progress timeout in standby mode to avoid calling
     493             :      * startup_progress_timeout_handler() unnecessarily.
     494             :      */
     495         224 :     disable_startup_progress_timeout();
     496         224 : }
     497             : 
     498             : /*
     499             :  * Prepare the system for WAL recovery, if needed.
     500             :  *
     501             :  * This is called by StartupXLOG() which coordinates the server startup
     502             :  * sequence.  This function analyzes the control file and the backup label
     503             :  * file, if any, and figures out whether we need to perform crash recovery or
     504             :  * archive recovery, and how far we need to replay the WAL to reach a
     505             :  * consistent state.
     506             :  *
     507             :  * This doesn't yet change the on-disk state, except for creating the symlinks
     508             :  * from the tablespace map file, if any, and for fetching WAL files needed to
     509             :  * find the checkpoint record.  On entry, the caller has already read the
     510             :  * control file into memory, and passes it as argument.  This function updates
     511             :  * it to reflect the recovery state, and the caller is expected to write it
     512             :  * back to disk after initializing other subsystems, but before calling
     513             :  * PerformWalRecovery().
     514             :  *
     515             :  * This initializes some global variables, such as ArchiveRecoveryRequested,
     516             :  * StandbyModeRequested and InRecovery.
     517             :  */
     518             : void
     519        1984 : InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
     520             :                 bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
     521             : {
     522             :     XLogPageReadPrivate *private;
     523             :     struct stat st;
     524             :     bool        wasShutdown;
     525             :     XLogRecord *record;
     526             :     DBState     dbstate_at_startup;
     527        1984 :     bool        haveTblspcMap = false;
     528        1984 :     bool        haveBackupLabel = false;
     529             :     CheckPoint  checkPoint;
     530        1984 :     bool        backupFromStandby = false;
     531             : 
     532        1984 :     dbstate_at_startup = ControlFile->state;
     533             : 
     534             :     /*
     535             :      * Initialize on the assumption we want to recover to the latest timeline
     536             :      * that's active according to pg_control.
     537             :      */
     538        1984 :     if (ControlFile->minRecoveryPointTLI >
     539        1984 :         ControlFile->checkPointCopy.ThisTimeLineID)
     540           4 :         recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
     541             :     else
     542        1980 :         recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
     543             : 
     544             :     /*
     545             :      * Check for signal files, and if so set up state for offline recovery
     546             :      */
     547        1984 :     readRecoverySignalFile();
     548        1984 :     validateRecoveryParameters();
     549             : 
     550             :     /*
     551             :      * Take ownership of the wakeup latch if we're going to sleep during
     552             :      * recovery, if required.
     553             :      */
     554        1984 :     if (ArchiveRecoveryRequested)
     555         234 :         OwnLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
     556             : 
     557             :     /*
     558             :      * Set the WAL reading processor now, as it will be needed when reading
     559             :      * the checkpoint record required (backup_label or not).
     560             :      */
     561        1984 :     private = palloc0_object(XLogPageReadPrivate);
     562        1984 :     xlogreader =
     563        1984 :         XLogReaderAllocate(wal_segment_size, NULL,
     564        1984 :                            XL_ROUTINE(.page_read = &XLogPageRead,
     565             :                                       .segment_open = NULL,
     566             :                                       .segment_close = wal_segment_close),
     567             :                            private);
     568        1984 :     if (!xlogreader)
     569           0 :         ereport(ERROR,
     570             :                 (errcode(ERRCODE_OUT_OF_MEMORY),
     571             :                  errmsg("out of memory"),
     572             :                  errdetail("Failed while allocating a WAL reading processor.")));
     573        1984 :     xlogreader->system_identifier = ControlFile->system_identifier;
     574             : 
     575             :     /*
     576             :      * Set the WAL decode buffer size.  This limits how far ahead we can read
     577             :      * in the WAL.
     578             :      */
     579        1984 :     XLogReaderSetDecodeBuffer(xlogreader, NULL, wal_decode_buffer_size);
     580             : 
     581             :     /* Create a WAL prefetcher. */
     582        1984 :     xlogprefetcher = XLogPrefetcherAllocate(xlogreader);
     583             : 
     584             :     /*
     585             :      * Allocate two page buffers dedicated to WAL consistency checks.  We do
     586             :      * it this way, rather than just making static arrays, for two reasons:
     587             :      * (1) no need to waste the storage in most instantiations of the backend;
     588             :      * (2) a static char array isn't guaranteed to have any particular
     589             :      * alignment, whereas palloc() will provide MAXALIGN'd storage.
     590             :      */
     591        1984 :     replay_image_masked = (char *) palloc(BLCKSZ);
     592        1984 :     primary_image_masked = (char *) palloc(BLCKSZ);
     593             : 
     594             :     /*
     595             :      * Read the backup_label file.  We want to run this part of the recovery
     596             :      * process after checking for signal files and after performing validation
     597             :      * of the recovery parameters.
     598             :      */
     599        1984 :     if (read_backup_label(&CheckPointLoc, &CheckPointTLI, &backupEndRequired,
     600             :                           &backupFromStandby))
     601             :     {
     602         158 :         List       *tablespaces = NIL;
     603             : 
     604             :         /*
     605             :          * Archive recovery was requested, and thanks to the backup label
     606             :          * file, we know how far we need to replay to reach consistency. Enter
     607             :          * archive recovery directly.
     608             :          */
     609         158 :         InArchiveRecovery = true;
     610         158 :         if (StandbyModeRequested)
     611         136 :             EnableStandbyMode();
     612             : 
     613             :         /*
     614             :          * Omitting backup_label when creating a new replica, PITR node etc.
     615             :          * unfortunately is a common cause of corruption.  Logging that
     616             :          * backup_label was used makes it a bit easier to exclude that as the
     617             :          * cause of observed corruption.
     618             :          *
     619             :          * Do so before we try to read the checkpoint record (which can fail),
     620             :          * as otherwise it can be hard to understand why a checkpoint other
     621             :          * than ControlFile->checkPoint is used.
     622             :          */
     623         158 :         ereport(LOG,
     624             :                 errmsg("starting backup recovery with redo LSN %X/%08X, checkpoint LSN %X/%08X, on timeline ID %u",
     625             :                        LSN_FORMAT_ARGS(RedoStartLSN),
     626             :                        LSN_FORMAT_ARGS(CheckPointLoc),
     627             :                        CheckPointTLI));
     628             : 
     629             :         /*
     630             :          * When a backup_label file is present, we want to roll forward from
     631             :          * the checkpoint it identifies, rather than using pg_control.
     632             :          */
     633         158 :         record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
     634             :                                       CheckPointTLI);
     635         158 :         if (record != NULL)
     636             :         {
     637         158 :             memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
     638         158 :             wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
     639         158 :             ereport(DEBUG1,
     640             :                     errmsg_internal("checkpoint record is at %X/%08X",
     641             :                                     LSN_FORMAT_ARGS(CheckPointLoc)));
     642         158 :             InRecovery = true;  /* force recovery even if SHUTDOWNED */
     643             : 
     644             :             /*
     645             :              * Make sure that REDO location exists. This may not be the case
     646             :              * if there was a crash during an online backup, which left a
     647             :              * backup_label around that references a WAL segment that's
     648             :              * already been archived.
     649             :              */
     650         158 :             if (checkPoint.redo < CheckPointLoc)
     651             :             {
     652         158 :                 XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
     653         158 :                 if (!ReadRecord(xlogprefetcher, LOG, false,
     654             :                                 checkPoint.ThisTimeLineID))
     655           0 :                     ereport(FATAL,
     656             :                             errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
     657             :                                    LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)),
     658             :                             errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
     659             :                                     "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
     660             :                                     "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
     661             :                                     DataDir, DataDir, DataDir, DataDir));
     662             :             }
     663             :         }
     664             :         else
     665             :         {
     666           0 :             ereport(FATAL,
     667             :                     errmsg("could not locate required checkpoint record at %X/%08X",
     668             :                            LSN_FORMAT_ARGS(CheckPointLoc)),
     669             :                     errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
     670             :                             "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
     671             :                             "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
     672             :                             DataDir, DataDir, DataDir, DataDir));
     673             :             wasShutdown = false;    /* keep compiler quiet */
     674             :         }
     675             : 
     676             :         /* Read the tablespace_map file if present and create symlinks. */
     677         158 :         if (read_tablespace_map(&tablespaces))
     678             :         {
     679             :             ListCell   *lc;
     680             : 
     681           8 :             foreach(lc, tablespaces)
     682             :             {
     683           4 :                 tablespaceinfo *ti = lfirst(lc);
     684             :                 char       *linkloc;
     685             : 
     686           4 :                 linkloc = psprintf("%s/%u", PG_TBLSPC_DIR, ti->oid);
     687             : 
     688             :                 /*
     689             :                  * Remove the existing symlink if any and Create the symlink
     690             :                  * under PGDATA.
     691             :                  */
     692           4 :                 remove_tablespace_symlink(linkloc);
     693             : 
     694           4 :                 if (symlink(ti->path, linkloc) < 0)
     695           0 :                     ereport(ERROR,
     696             :                             (errcode_for_file_access(),
     697             :                              errmsg("could not create symbolic link \"%s\": %m",
     698             :                                     linkloc)));
     699             : 
     700           4 :                 pfree(ti->path);
     701           4 :                 pfree(ti);
     702             :             }
     703             : 
     704             :             /* tell the caller to delete it later */
     705           4 :             haveTblspcMap = true;
     706             :         }
     707             : 
     708             :         /* tell the caller to delete it later */
     709         158 :         haveBackupLabel = true;
     710             :     }
     711             :     else
     712             :     {
     713             :         /* No backup_label file has been found if we are here. */
     714             : 
     715             :         /*
     716             :          * If tablespace_map file is present without backup_label file, there
     717             :          * is no use of such file.  There is no harm in retaining it, but it
     718             :          * is better to get rid of the map file so that we don't have any
     719             :          * redundant file in data directory and it will avoid any sort of
     720             :          * confusion.  It seems prudent though to just rename the file out of
     721             :          * the way rather than delete it completely, also we ignore any error
     722             :          * that occurs in rename operation as even if map file is present
     723             :          * without backup_label file, it is harmless.
     724             :          */
     725        1826 :         if (stat(TABLESPACE_MAP, &st) == 0)
     726             :         {
     727           2 :             unlink(TABLESPACE_MAP_OLD);
     728           2 :             if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
     729           2 :                 ereport(LOG,
     730             :                         (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
     731             :                                 TABLESPACE_MAP, BACKUP_LABEL_FILE),
     732             :                          errdetail("File \"%s\" was renamed to \"%s\".",
     733             :                                    TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
     734             :             else
     735           0 :                 ereport(LOG,
     736             :                         (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
     737             :                                 TABLESPACE_MAP, BACKUP_LABEL_FILE),
     738             :                          errdetail("Could not rename file \"%s\" to \"%s\": %m.",
     739             :                                    TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
     740             :         }
     741             : 
     742             :         /*
     743             :          * It's possible that archive recovery was requested, but we don't
     744             :          * know how far we need to replay the WAL before we reach consistency.
     745             :          * This can happen for example if a base backup is taken from a
     746             :          * running server using an atomic filesystem snapshot, without calling
     747             :          * pg_backup_start/stop. Or if you just kill a running primary server
     748             :          * and put it into archive recovery by creating a recovery signal
     749             :          * file.
     750             :          *
     751             :          * Our strategy in that case is to perform crash recovery first,
     752             :          * replaying all the WAL present in pg_wal, and only enter archive
     753             :          * recovery after that.
     754             :          *
     755             :          * But usually we already know how far we need to replay the WAL (up
     756             :          * to minRecoveryPoint, up to backupEndPoint, or until we see an
     757             :          * end-of-backup record), and we can enter archive recovery directly.
     758             :          */
     759        1826 :         if (ArchiveRecoveryRequested &&
     760          88 :             (XLogRecPtrIsValid(ControlFile->minRecoveryPoint) ||
     761          18 :              ControlFile->backupEndRequired ||
     762          18 :              XLogRecPtrIsValid(ControlFile->backupEndPoint) ||
     763          18 :              ControlFile->state == DB_SHUTDOWNED))
     764             :         {
     765          84 :             InArchiveRecovery = true;
     766          84 :             if (StandbyModeRequested)
     767          84 :                 EnableStandbyMode();
     768             :         }
     769             : 
     770             :         /*
     771             :          * For the same reason as when starting up with backup_label present,
     772             :          * emit a log message when we continue initializing from a base
     773             :          * backup.
     774             :          */
     775        1826 :         if (XLogRecPtrIsValid(ControlFile->backupStartPoint))
     776           0 :             ereport(LOG,
     777             :                     errmsg("restarting backup recovery with redo LSN %X/%08X",
     778             :                            LSN_FORMAT_ARGS(ControlFile->backupStartPoint)));
     779             : 
     780             :         /* Get the last valid checkpoint record. */
     781        1826 :         CheckPointLoc = ControlFile->checkPoint;
     782        1826 :         CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID;
     783        1826 :         RedoStartLSN = ControlFile->checkPointCopy.redo;
     784        1826 :         RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID;
     785        1826 :         record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
     786             :                                       CheckPointTLI);
     787        1826 :         if (record != NULL)
     788             :         {
     789        1826 :             ereport(DEBUG1,
     790             :                     errmsg_internal("checkpoint record is at %X/%08X",
     791             :                                     LSN_FORMAT_ARGS(CheckPointLoc)));
     792             :         }
     793             :         else
     794             :         {
     795             :             /*
     796             :              * We used to attempt to go back to a secondary checkpoint record
     797             :              * here, but only when not in standby mode. We now just fail if we
     798             :              * can't read the last checkpoint because this allows us to
     799             :              * simplify processing around checkpoints.
     800             :              */
     801           0 :             ereport(PANIC,
     802             :                     errmsg("could not locate a valid checkpoint record at %X/%08X",
     803             :                            LSN_FORMAT_ARGS(CheckPointLoc)));
     804             :         }
     805        1826 :         memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
     806        1826 :         wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
     807             : 
     808             :         /* Make sure that REDO location exists. */
     809        1826 :         if (checkPoint.redo < CheckPointLoc)
     810             :         {
     811          88 :             XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
     812          88 :             if (!ReadRecord(xlogprefetcher, LOG, false, checkPoint.ThisTimeLineID))
     813           2 :                 ereport(FATAL,
     814             :                         errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
     815             :                                LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)));
     816             :         }
     817             :     }
     818             : 
     819        1982 :     if (ArchiveRecoveryRequested)
     820             :     {
     821         234 :         if (StandbyModeRequested)
     822         224 :             ereport(LOG,
     823             :                     (errmsg("entering standby mode")));
     824          10 :         else if (recoveryTarget == RECOVERY_TARGET_XID)
     825           0 :             ereport(LOG,
     826             :                     (errmsg("starting point-in-time recovery to XID %u",
     827             :                             recoveryTargetXid)));
     828          10 :         else if (recoveryTarget == RECOVERY_TARGET_TIME)
     829           0 :             ereport(LOG,
     830             :                     (errmsg("starting point-in-time recovery to %s",
     831             :                             timestamptz_to_str(recoveryTargetTime))));
     832          10 :         else if (recoveryTarget == RECOVERY_TARGET_NAME)
     833           6 :             ereport(LOG,
     834             :                     (errmsg("starting point-in-time recovery to \"%s\"",
     835             :                             recoveryTargetName)));
     836           4 :         else if (recoveryTarget == RECOVERY_TARGET_LSN)
     837           0 :             ereport(LOG,
     838             :                     errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%08X\"",
     839             :                            LSN_FORMAT_ARGS(recoveryTargetLSN)));
     840           4 :         else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
     841           0 :             ereport(LOG,
     842             :                     (errmsg("starting point-in-time recovery to earliest consistent point")));
     843             :         else
     844           4 :             ereport(LOG,
     845             :                     (errmsg("starting archive recovery")));
     846             :     }
     847             : 
     848             :     /*
     849             :      * If the location of the checkpoint record is not on the expected
     850             :      * timeline in the history of the requested timeline, we cannot proceed:
     851             :      * the backup is not part of the history of the requested timeline.
     852             :      */
     853             :     Assert(expectedTLEs);       /* was initialized by reading checkpoint
     854             :                                  * record */
     855        1982 :     if (tliOfPointInHistory(CheckPointLoc, expectedTLEs) !=
     856             :         CheckPointTLI)
     857             :     {
     858             :         XLogRecPtr  switchpoint;
     859             : 
     860             :         /*
     861             :          * tliSwitchPoint will throw an error if the checkpoint's timeline is
     862             :          * not in expectedTLEs at all.
     863             :          */
     864           0 :         switchpoint = tliSwitchPoint(CheckPointTLI, expectedTLEs, NULL);
     865           0 :         ereport(FATAL,
     866             :                 (errmsg("requested timeline %u is not a child of this server's history",
     867             :                         recoveryTargetTLI),
     868             :         /* translator: %s is a backup_label file or a pg_control file */
     869             :                  errdetail("Latest checkpoint in file \"%s\" is at %X/%08X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%08X.",
     870             :                            haveBackupLabel ? "backup_label" : "pg_control",
     871             :                            LSN_FORMAT_ARGS(CheckPointLoc),
     872             :                            CheckPointTLI,
     873             :                            LSN_FORMAT_ARGS(switchpoint))));
     874             :     }
     875             : 
     876             :     /*
     877             :      * The min recovery point should be part of the requested timeline's
     878             :      * history, too.
     879             :      */
     880        1982 :     if (XLogRecPtrIsValid(ControlFile->minRecoveryPoint) &&
     881          84 :         tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
     882          84 :         ControlFile->minRecoveryPointTLI)
     883           0 :         ereport(FATAL,
     884             :                 errmsg("requested timeline %u does not contain minimum recovery point %X/%08X on timeline %u",
     885             :                        recoveryTargetTLI,
     886             :                        LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
     887             :                        ControlFile->minRecoveryPointTLI));
     888             : 
     889        1982 :     ereport(DEBUG1,
     890             :             errmsg_internal("redo record is at %X/%08X; shutdown %s",
     891             :                             LSN_FORMAT_ARGS(checkPoint.redo),
     892             :                             wasShutdown ? "true" : "false"));
     893        1982 :     ereport(DEBUG1,
     894             :             (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
     895             :                              U64FromFullTransactionId(checkPoint.nextXid),
     896             :                              checkPoint.nextOid)));
     897        1982 :     ereport(DEBUG1,
     898             :             (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %" PRIu64,
     899             :                              checkPoint.nextMulti, checkPoint.nextMultiOffset)));
     900        1982 :     ereport(DEBUG1,
     901             :             (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
     902             :                              checkPoint.oldestXid, checkPoint.oldestXidDB)));
     903        1982 :     ereport(DEBUG1,
     904             :             (errmsg_internal("oldest MultiXactId: %u, in database %u",
     905             :                              checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
     906        1982 :     ereport(DEBUG1,
     907             :             (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
     908             :                              checkPoint.oldestCommitTsXid,
     909             :                              checkPoint.newestCommitTsXid)));
     910        1982 :     if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid)))
     911           0 :         ereport(PANIC,
     912             :                 (errmsg("invalid next transaction ID")));
     913             : 
     914             :     /* sanity check */
     915        1982 :     if (checkPoint.redo > CheckPointLoc)
     916           0 :         ereport(PANIC,
     917             :                 (errmsg("invalid redo in checkpoint record")));
     918             : 
     919             :     /*
     920             :      * Check whether we need to force recovery from WAL.  If it appears to
     921             :      * have been a clean shutdown and we did not have a recovery signal file,
     922             :      * then assume no recovery needed.
     923             :      */
     924        1982 :     if (checkPoint.redo < CheckPointLoc)
     925             :     {
     926         244 :         if (wasShutdown)
     927           0 :             ereport(PANIC,
     928             :                     (errmsg("invalid redo record in shutdown checkpoint")));
     929         244 :         InRecovery = true;
     930             :     }
     931        1738 :     else if (ControlFile->state != DB_SHUTDOWNED)
     932         190 :         InRecovery = true;
     933        1548 :     else if (ArchiveRecoveryRequested)
     934             :     {
     935             :         /* force recovery due to presence of recovery signal file */
     936          14 :         InRecovery = true;
     937             :     }
     938             : 
     939             :     /*
     940             :      * If recovery is needed, update our in-memory copy of pg_control to show
     941             :      * that we are recovering and to show the selected checkpoint as the place
     942             :      * we are starting from. We also mark pg_control with any minimum recovery
     943             :      * stop point obtained from a backup history file.
     944             :      *
     945             :      * We don't write the changes to disk yet, though. Only do that after
     946             :      * initializing various subsystems.
     947             :      */
     948        1982 :     if (InRecovery)
     949             :     {
     950         448 :         if (InArchiveRecovery)
     951             :         {
     952         242 :             ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
     953             :         }
     954             :         else
     955             :         {
     956         206 :             ereport(LOG,
     957             :                     (errmsg("database system was not properly shut down; "
     958             :                             "automatic recovery in progress")));
     959         206 :             if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
     960           4 :                 ereport(LOG,
     961             :                         (errmsg("crash recovery starts in timeline %u "
     962             :                                 "and has target timeline %u",
     963             :                                 ControlFile->checkPointCopy.ThisTimeLineID,
     964             :                                 recoveryTargetTLI)));
     965         206 :             ControlFile->state = DB_IN_CRASH_RECOVERY;
     966             :         }
     967         448 :         ControlFile->checkPoint = CheckPointLoc;
     968         448 :         ControlFile->checkPointCopy = checkPoint;
     969         448 :         if (InArchiveRecovery)
     970             :         {
     971             :             /* initialize minRecoveryPoint if not set yet */
     972         242 :             if (ControlFile->minRecoveryPoint < checkPoint.redo)
     973             :             {
     974         162 :                 ControlFile->minRecoveryPoint = checkPoint.redo;
     975         162 :                 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
     976             :             }
     977             :         }
     978             : 
     979             :         /*
     980             :          * Set backupStartPoint if we're starting recovery from a base backup.
     981             :          *
     982             :          * Also set backupEndPoint and use minRecoveryPoint as the backup end
     983             :          * location if we're starting recovery from a base backup which was
     984             :          * taken from a standby. In this case, the database system status in
     985             :          * pg_control must indicate that the database was already in recovery.
     986             :          * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
     987             :          * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
     988             :          * before reaching this point; e.g. because restore_command or
     989             :          * primary_conninfo were faulty.
     990             :          *
     991             :          * Any other state indicates that the backup somehow became corrupted
     992             :          * and we can't sensibly continue with recovery.
     993             :          */
     994         448 :         if (haveBackupLabel)
     995             :         {
     996         158 :             ControlFile->backupStartPoint = checkPoint.redo;
     997         158 :             ControlFile->backupEndRequired = backupEndRequired;
     998             : 
     999         158 :             if (backupFromStandby)
    1000             :             {
    1001          10 :                 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
    1002             :                     dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
    1003           0 :                     ereport(FATAL,
    1004             :                             (errmsg("backup_label contains data inconsistent with control file"),
    1005             :                              errhint("This means that the backup is corrupted and you will "
    1006             :                                      "have to use another backup for recovery.")));
    1007          10 :                 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
    1008             :             }
    1009             :         }
    1010             :     }
    1011             : 
    1012             :     /* remember these, so that we know when we have reached consistency */
    1013        1982 :     backupStartPoint = ControlFile->backupStartPoint;
    1014        1982 :     backupEndRequired = ControlFile->backupEndRequired;
    1015        1982 :     backupEndPoint = ControlFile->backupEndPoint;
    1016        1982 :     if (InArchiveRecovery)
    1017             :     {
    1018         242 :         minRecoveryPoint = ControlFile->minRecoveryPoint;
    1019         242 :         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    1020             :     }
    1021             :     else
    1022             :     {
    1023        1740 :         minRecoveryPoint = InvalidXLogRecPtr;
    1024        1740 :         minRecoveryPointTLI = 0;
    1025             :     }
    1026             : 
    1027             :     /*
    1028             :      * Start recovery assuming that the final record isn't lost.
    1029             :      */
    1030        1982 :     abortedRecPtr = InvalidXLogRecPtr;
    1031        1982 :     missingContrecPtr = InvalidXLogRecPtr;
    1032             : 
    1033        1982 :     *wasShutdown_ptr = wasShutdown;
    1034        1982 :     *haveBackupLabel_ptr = haveBackupLabel;
    1035        1982 :     *haveTblspcMap_ptr = haveTblspcMap;
    1036        1982 : }
    1037             : 
    1038             : /*
    1039             :  * See if there are any recovery signal files and if so, set state for
    1040             :  * recovery.  Does nothing in bootstrap processing mode.
    1041             :  *
    1042             :  * See if there is a recovery command file (recovery.conf), and if so
    1043             :  * throw a FATAL error since as of PG12 we no longer recognize that.
    1044             :  */
    1045             : static void
    1046        1984 : readRecoverySignalFile(void)
    1047             : {
    1048             :     struct stat stat_buf;
    1049             : 
    1050        1984 :     if (IsBootstrapProcessingMode())
    1051        1750 :         return;
    1052             : 
    1053             :     /*
    1054             :      * Check for old recovery API file: recovery.conf
    1055             :      */
    1056        1882 :     if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
    1057           0 :         ereport(FATAL,
    1058             :                 (errcode_for_file_access(),
    1059             :                  errmsg("using recovery command file \"%s\" is not supported",
    1060             :                         RECOVERY_COMMAND_FILE)));
    1061             : 
    1062             :     /*
    1063             :      * Remove unused .done file, if present. Any unlink() failure is ignored.
    1064             :      */
    1065        1882 :     unlink(RECOVERY_COMMAND_DONE);
    1066             : 
    1067             :     /*
    1068             :      * Check for recovery signal files and if found, fsync them since they
    1069             :      * represent server state information.  We don't sweat too much about the
    1070             :      * possibility of open or fsync failure: the file still counts as found.
    1071             :      *
    1072             :      * If present, standby signal file takes precedence. If neither is present
    1073             :      * then we won't enter archive recovery.
    1074             :      */
    1075        1882 :     if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
    1076             :     {
    1077             :         int         fd;
    1078             : 
    1079         224 :         fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY,
    1080             :                                S_IRUSR | S_IWUSR);
    1081         224 :         if (fd >= 0)
    1082             :         {
    1083         224 :             (void) pg_fsync(fd);
    1084         224 :             close(fd);
    1085             :         }
    1086         224 :         standby_signal_file_found = true;
    1087             :     }
    1088        1658 :     else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
    1089             :     {
    1090             :         int         fd;
    1091             : 
    1092          10 :         fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY,
    1093             :                                S_IRUSR | S_IWUSR);
    1094          10 :         if (fd >= 0)
    1095             :         {
    1096          10 :             (void) pg_fsync(fd);
    1097          10 :             close(fd);
    1098             :         }
    1099          10 :         recovery_signal_file_found = true;
    1100             :     }
    1101             :     /* Derive the requested recovery mode from the signal file flags. */
    1102        1882 :     StandbyModeRequested = false;
    1103        1882 :     ArchiveRecoveryRequested = false;
    1104        1882 :     if (standby_signal_file_found)
    1105             :     {
    1106         224 :         StandbyModeRequested = true;
    1107         224 :         ArchiveRecoveryRequested = true;
    1108             :     }
    1109        1658 :     else if (recovery_signal_file_found)
    1110             :     {
    1111          10 :         StandbyModeRequested = false;
    1112          10 :         ArchiveRecoveryRequested = true;
    1113             :     }
    1114             :     else
    1115        1648 :         return;
    1116             : 
    1117             :     /*
    1118             :      * We don't support standby mode in standalone backends; that requires
    1119             :      * other processes such as the WAL receiver to be alive.
    1120             :      */
    1121         234 :     if (StandbyModeRequested && !IsUnderPostmaster)
    1122           0 :         ereport(FATAL,
    1123             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
    1124             :                  errmsg("standby mode is not supported by single-user servers")));
    1125             : }
    1126             : /* Check and finalize recovery settings, if archive recovery was requested. */
    1127             : static void
    1128        1984 : validateRecoveryParameters(void)
    1129             : {
    1130        1984 :     if (!ArchiveRecoveryRequested)
    1131        1750 :         return;
    1132             : 
    1133             :     /*
    1134             :      * Check for compulsory parameters (WARNING in standby mode, else FATAL)
    1135             :      */
    1136         234 :     if (StandbyModeRequested)
    1137             :     {
    1138         224 :         if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
    1139          22 :             (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
    1140           4 :             ereport(WARNING,
    1141             :                     (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
    1142             :                      errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
    1143             :     }
    1144             :     else
    1145             :     {
    1146          10 :         if (recoveryRestoreCommand == NULL ||
    1147          10 :             strcmp(recoveryRestoreCommand, "") == 0)
    1148           0 :             ereport(FATAL,
    1149             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1150             :                      errmsg("must specify \"restore_command\" when standby mode is not enabled")));
    1151             :     }
    1152             : 
    1153             :     /*
    1154             :      * Override any inconsistent requests: a pause request cannot be honored
    1155             :      * when hot_standby = off, so shut down instead.  Note that this is a
    1156             :      * change of behaviour in 9.5; prior to this we simply ignored the request.
    1157             :      */
    1158         234 :     if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
    1159         220 :         !EnableHotStandby)
    1160           6 :         recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
    1161             : 
    1162             :     /*
    1163             :      * Final parsing of recovery_target_time string; see also
    1164             :      * check_recovery_target_time().
    1165             :      */
    1166         234 :     if (recoveryTarget == RECOVERY_TARGET_TIME)
    1167             :     {
    1168           0 :         recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
    1169             :                                                                      CStringGetDatum(recovery_target_time_string),
    1170             :                                                                      ObjectIdGetDatum(InvalidOid),
    1171             :                                                                      Int32GetDatum(-1)));
    1172             :     }
    1173             : 
    1174             :     /*
    1175             :      * If user specified recovery_target_timeline, validate it or compute the
    1176             :      * "latest" value.  We can't do this until after we've gotten the restore
    1177             :      * command and set InArchiveRecovery, because we need to fetch timeline
    1178             :      * history files from the archive.
    1179             :      */
    1180         234 :     if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
    1181             :     {
    1182           0 :         TimeLineID  rtli = recoveryTargetTLIRequested;
    1183             : 
    1184             :         /* Timeline 1 does not have a history file, all else should */
    1185           0 :         if (rtli != 1 && !existsTimeLineHistory(rtli))
    1186           0 :             ereport(FATAL,
    1187             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1188             :                      errmsg("recovery target timeline %u does not exist",
    1189             :                             rtli)));
    1190           0 :         recoveryTargetTLI = rtli;
    1191             :     }
    1192         234 :     else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
    1193             :     {
    1194             :         /* We start the "latest" search from pg_control's timeline */
    1195         234 :         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
    1196             :     }
    1197             :     else
    1198             :     {
    1199             :         /*
    1200             :          * else we just use the recoveryTargetTLI as already read from
    1201             :          * ControlFile
    1202             :          */
    1203             :         Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
    1204             :     }
    1205             : }
    1206             : 
    1207             : /*
    1208             :  * read_backup_label: check to see if a backup_label file is present
    1209             :  *
    1210             :  * If we see a backup_label during recovery, we assume that we are recovering
    1211             :  * from a backup dump file, and we therefore roll forward from the checkpoint
    1212             :  * identified by the label file, NOT what pg_control says.  This avoids the
    1213             :  * problem that pg_control might have been archived one or more checkpoints
    1214             :  * later than the start of the dump, and so if we rely on it as the start
    1215             :  * point, we will fail to restore a consistent database state.
    1216             :  *
    1217             :  * Returns true if a backup_label was found (and fills the checkpoint
    1218             :  * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
    1219             :  * returns false if not. If this backup_label came from a streamed backup,
    1220             :  * *backupEndRequired is set to true. If this backup_label was created during
    1221             :  * recovery, *backupFromStandby is set to true.
    1222             :  *
    1223             :  * Also sets the global variables RedoStartLSN and RedoStartTLI from the
    1224             :  * label's START WAL LOCATION line.  Invalid mandatory lines raise FATAL.
    1225             :  */
    1226             : static bool
    1227        1984 : read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
    1228             :                   bool *backupEndRequired, bool *backupFromStandby)
    1229             : {
    1230             :     char        startxlogfilename[MAXFNAMELEN];
    1231             :     TimeLineID  tli_from_walseg,
    1232             :                 tli_from_file;
    1233             :     FILE       *lfp;
    1234             :     char        ch;
    1235             :     char        backuptype[20];
    1236             :     char        backupfrom[20];
    1237             :     char        backuplabel[MAXPGPATH];
    1238             :     char        backuptime[128];
    1239             :     uint32      hi,
    1240             :                 lo;
    1241             : 
    1242             :     /* suppress possible uninitialized-variable warnings */
    1243        1984 :     *checkPointLoc = InvalidXLogRecPtr;
    1244        1984 :     *backupLabelTLI = 0;
    1245        1984 :     *backupEndRequired = false;
    1246        1984 :     *backupFromStandby = false;
    1247             : 
    1248             :     /*
    1249             :      * See if label file is present
    1250             :      */
    1251        1984 :     lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
    1252        1984 :     if (!lfp)
    1253             :     {
    1254        1826 :         if (errno != ENOENT)
    1255           0 :             ereport(FATAL,
    1256             :                     (errcode_for_file_access(),
    1257             :                      errmsg("could not read file \"%s\": %m",
    1258             :                             BACKUP_LABEL_FILE)));
    1259        1826 :         return false;           /* it's not there, all is fine */
    1260             :     }
    1261             : 
    1262             :     /*
    1263             :      * Read and parse the START WAL LOCATION and CHECKPOINT lines (crude, but
    1264             :      * we expect no variability in the file format).  The trailing %c must
    1265             :      * match the newline, so anything else after the data is rejected.
    1266             :      */
    1267         158 :     if (fscanf(lfp, "START WAL LOCATION: %X/%08X (file %08X%16s)%c",
    1268         158 :                &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
    1269           0 :         ereport(FATAL,
    1270             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    1271             :                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
    1272         158 :     RedoStartLSN = ((uint64) hi) << 32 | lo;
    1273         158 :     RedoStartTLI = tli_from_walseg;
    1274         158 :     if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%08X%c",
    1275         158 :                &hi, &lo, &ch) != 3 || ch != '\n')
    1276           0 :         ereport(FATAL,
    1277             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    1278             :                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
    1279         158 :     *checkPointLoc = ((uint64) hi) << 32 | lo;
    1280         158 :     *backupLabelTLI = tli_from_walseg;
    1281             : 
    1282             :     /*
    1283             :      * BACKUP METHOD lets us know if this was a typical backup ("streamed",
    1284             :      * which could mean either pg_basebackup or the pg_backup_start/stop
    1285             :      * method was used) or if this label came from somewhere else (the only
    1286             :      * other option today being from pg_rewind).  If this was a streamed
    1287             :      * backup then we know that we need to play through until we get to the
    1288             :      * end of the WAL which was generated during the backup (at which point we
    1289             :      * will have reached consistency and backupEndRequired will be reset to be
    1290             :      * false).
    1291             :      */
    1292         158 :     if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
    1293             :     {
    1294         158 :         if (strcmp(backuptype, "streamed") == 0)
    1295         156 :             *backupEndRequired = true;
    1296             :     }
    1297             : 
    1298             :     /*
    1299             :      * BACKUP FROM lets us know if this was from a primary or a standby.  If
    1300             :      * it was from a standby, we'll double-check that the control file state
    1301             :      * matches that of a standby.
    1302             :      */
    1303         158 :     if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
    1304             :     {
    1305         158 :         if (strcmp(backupfrom, "standby") == 0)
    1306          10 :             *backupFromStandby = true;
    1307             :     }
    1308             : 
    1309             :     /*
    1310             :      * Parse START TIME and LABEL. Those are not mandatory fields for recovery
    1311             :      * but checking for their presence is useful for debugging and the next
    1312             :      * sanity checks. Note that the result buffers have a fixed size, so the
    1313             :      * field widths below truncate longer strings; if the backup_label file
    1314             :      * was generated with longer values, the following lines are then parsed
    1315             :      * incorrectly. That's fine as only minor consistency checks are done
    1316             :      * afterwards.
    1317             :      */
    1318         158 :     if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
    1319         158 :         ereport(DEBUG1,
    1320             :                 (errmsg_internal("backup time %s in file \"%s\"",
    1321             :                                  backuptime, BACKUP_LABEL_FILE)));
    1322             : 
    1323         158 :     if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
    1324         156 :         ereport(DEBUG1,
    1325             :                 (errmsg_internal("backup label %s in file \"%s\"",
    1326             :                                  backuplabel, BACKUP_LABEL_FILE)));
    1327             : 
    1328             :     /*
    1329             :      * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
    1330             :      * it as a sanity check if present.
    1331             :      */
    1332         158 :     if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
    1333             :     {
    1334         156 :         if (tli_from_walseg != tli_from_file)
    1335           0 :             ereport(FATAL,
    1336             :                     (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    1337             :                      errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
    1338             :                      errdetail("Timeline ID parsed is %u, but expected %u.",
    1339             :                                tli_from_file, tli_from_walseg)));
    1340             : 
    1341         156 :         ereport(DEBUG1,
    1342             :                 (errmsg_internal("backup timeline %u in file \"%s\"",
    1343             :                                  tli_from_file, BACKUP_LABEL_FILE)));
    1344             :     }
    1345             :     /* An INCREMENTAL FROM LSN line marks an incremental backup; refuse to start. */
    1346         158 :     if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%08X\n", &hi, &lo) > 0)
    1347           0 :         ereport(FATAL,
    1348             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    1349             :                  errmsg("this is an incremental backup, not a data directory"),
    1350             :                  errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
    1351             : 
    1352         158 :     if (ferror(lfp) || FreeFile(lfp))
    1353           0 :         ereport(FATAL,
    1354             :                 (errcode_for_file_access(),
    1355             :                  errmsg("could not read file \"%s\": %m",
    1356             :                         BACKUP_LABEL_FILE)));
    1357             : 
    1358         158 :     return true;
    1359             : }
    1360             : 
    1361             : /*
    1362             :  * read_tablespace_map: check to see if a tablespace_map file is present
    1363             :  *
    1364             :  * If we see a tablespace_map file during recovery, we assume that we are
    1365             :  * recovering from a backup dump file, and we therefore need to create symlinks
    1366             :  * as per the information present in tablespace_map file.
    1367             :  *
    1368             :  * Returns true if a tablespace_map file was found (and appends to *tablespaces
    1369             :  * a tablespaceinfo struct for each tablespace listed in the file);
    1370             :  * returns false if not.
    1371             :  */
    1372             : static bool
    1373         158 : read_tablespace_map(List **tablespaces)
    1374             : {
    1375             :     tablespaceinfo *ti;
    1376             :     FILE       *lfp;
    1377             :     char        str[MAXPGPATH];
    1378             :     int         ch,
    1379             :                 i,
    1380             :                 n;
    1381             :     bool        was_backslash;
    1382             : 
    1383             :     /*
    1384             :      * See if tablespace_map file is present
    1385             :      */
    1386         158 :     lfp = AllocateFile(TABLESPACE_MAP, "r");
    1387         158 :     if (!lfp)
    1388             :     {
    1389         154 :         if (errno != ENOENT)
    1390           0 :             ereport(FATAL,
    1391             :                     (errcode_for_file_access(),
    1392             :                      errmsg("could not read file \"%s\": %m",
    1393             :                             TABLESPACE_MAP)));
    1394         154 :         return false;           /* it's not there, all is fine */
    1395             :     }
    1396             : 
    1397             :     /*
    1398             :      * Read and parse the link name and path lines from tablespace_map file
    1399             :      * (crude, but we expect no variability in the file format).  De-escape
    1400             :      * any backslashes that were inserted; overlong lines are truncated.
    1401             :      */
    1402           4 :     i = 0;
    1403           4 :     was_backslash = false;
    1404         154 :     while ((ch = fgetc(lfp)) != EOF)
    1405             :     {
    1406         150 :         if (!was_backslash && (ch == '\n' || ch == '\r'))
    1407           4 :         {
    1408             :             char       *endp;
    1409             : 
    1410           4 :             if (i == 0)
    1411           0 :                 continue;       /* \r immediately followed by \n */
    1412             : 
    1413             :             /*
    1414             :              * The de-escaped line should contain an OID followed by exactly
    1415             :              * one space followed by a path.  The path might start with
    1416             :              * spaces, so don't be too liberal about parsing.
    1417             :              */
    1418           4 :             str[i] = '\0';
    1419           4 :             n = 0;
    1420          24 :             while (str[n] && str[n] != ' ')
    1421          20 :                 n++;
    1422           4 :             if (n < 1 || n >= i - 1)
    1423           0 :                 ereport(FATAL,
    1424             :                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    1425             :                          errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
    1426           4 :             str[n++] = '\0';
    1427             : 
    1428           4 :             ti = palloc0_object(tablespaceinfo);
    1429           4 :             errno = 0;
    1430           4 :             ti->oid = strtoul(str, &endp, 10);
    1431           4 :             if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
    1432           0 :                 ereport(FATAL,
    1433             :                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    1434             :                          errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
    1435           4 :             ti->path = pstrdup(str + n);
    1436           4 :             *tablespaces = lappend(*tablespaces, ti);
    1437             : 
    1438           4 :             i = 0;
    1439           4 :             continue;
    1440             :         }
    1441         146 :         else if (!was_backslash && ch == '\\')
    1442           0 :             was_backslash = true;
    1443             :         else
    1444             :         {
    1445         146 :             if (i < sizeof(str) - 1)
    1446         146 :                 str[i++] = ch;
    1447         146 :             was_backslash = false;
    1448             :         }
    1449             :     }
    1450             : 
    1451           4 :     if (i != 0 || was_backslash)    /* last line not terminated? */
    1452           0 :         ereport(FATAL,
    1453             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    1454             :                  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
    1455             : 
    1456           4 :     if (ferror(lfp) || FreeFile(lfp))
    1457           0 :         ereport(FATAL,
    1458             :                 (errcode_for_file_access(),
    1459             :                  errmsg("could not read file \"%s\": %m",
    1460             :                         TABLESPACE_MAP)));
    1461             : 
    1462           4 :     return true;
    1463             : }
    1464             : 
    1465             : /*
    1466             :  * Finish WAL recovery.
    1467             :  *
    1468             :  * This does not close the 'xlogreader' yet, because in some cases the caller
    1469             :  * still wants to re-read the last checkpoint record by calling
    1470             :  * ReadCheckpointRecord().
    1471             :  *
    1472             :  * Returns the position of the last valid or applied record, after which new
    1473             :  * WAL should be appended, information about why recovery was ended, and some
    1474             :  * other things. See the EndOfWalRecoveryInfo struct for details.
    1475             :  */
    1476             : EndOfWalRecoveryInfo *
    1477        1854 : FinishWalRecovery(void)
    1478             : {
    1479        1854 :     EndOfWalRecoveryInfo *result = palloc_object(EndOfWalRecoveryInfo);
    1480             :     XLogRecPtr  lastRec;
    1481             :     TimeLineID  lastRecTLI;
    1482             :     XLogRecPtr  endOfLog;
    1483             : 
    1484             :     /*
    1485             :      * Kill WAL receiver, if it's still running, before we continue to write
    1486             :      * the startup checkpoint and aborted-contrecord records. It will trump
    1487             :      * over these records and subsequent ones if it's still alive when we
    1488             :      * start writing WAL.
    1489             :      */
    1490        1854 :     XLogShutdownWalRcv();
    1491             : 
    1492             :     /*
    1493             :      * Shut down the slot sync worker to drop any temporary slots acquired by
    1494             :      * it and to prevent it from continuing to try to fetch the failover slots.
    1495             :      *
    1496             :      * We do not update the 'synced' column in 'pg_replication_slots' system
    1497             :      * view from true to false here, as any failed update could leave 'synced'
    1498             :      * column false for some slots. This could cause issues during slot sync
    1499             :      * after restarting the server as a standby. While updating the 'synced'
    1500             :      * column after switching to the new timeline is an option, it does not
    1501             :      * simplify the handling for the 'synced' column. Therefore, we retain the
    1502             :      * 'synced' column as true after promotion as it may provide useful
    1503             :      * information about the slot origin.
    1504             :      */
    1505        1854 :     ShutDownSlotSync();
    1506             : 
    1507             :     /*
    1508             :      * We are now done reading the xlog from stream. Turn off streaming
    1509             :      * recovery to force fetching the files (which would be required at end of
    1510             :      * recovery, e.g., timeline history file) from archive or pg_wal.
    1511             :      *
    1512             :      * Note that standby mode must be turned off after killing WAL receiver,
    1513             :      * i.e., calling XLogShutdownWalRcv().
    1514             :      */
    1515             :     Assert(!WalRcvStreaming());
    1516        1854 :     StandbyMode = false;
    1517             : 
    1518             :     /*
    1519             :      * Determine where to start writing WAL next.
    1520             :      *
    1521             :      * Re-fetch the last valid or last applied record, so we can identify the
    1522             :      * exact endpoint of what we consider the valid portion of WAL.  There may
    1523             :      * be an incomplete continuation record after that, in which case
    1524             :      * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
    1525             :      * write a special OVERWRITE_CONTRECORD message to mark that the rest of
    1526             :      * it is intentionally missing.  See CreateOverwriteContrecordRecord().
    1527             :      *
    1528             :      * An important side-effect of this is to load the last page into
    1529             :      * xlogreader. The caller uses it to initialize the WAL for writing.
    1530             :      */
    1531        1854 :     if (!InRecovery)
    1532             :     {
    1533        1532 :         lastRec = CheckPointLoc;
    1534        1532 :         lastRecTLI = CheckPointTLI;
    1535             :     }
    1536             :     else
    1537             :     {
    1538         322 :         lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr;
    1539         322 :         lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
    1540             :     }
    1541        1854 :     XLogPrefetcherBeginRead(xlogprefetcher, lastRec);
    1542        1854 :     (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
    1543        1854 :     endOfLog = xlogreader->EndRecPtr;
    1544             : 
    1545             :     /*
    1546             :      * Remember the TLI in the filename of the XLOG segment containing the
    1547             :      * end-of-log.  It could be different from the timeline that endOfLog
    1548             :      * nominally belongs to, if there was a timeline switch in that segment,
    1549             :      * and we were reading the old WAL from a segment belonging to a higher
    1550             :      * timeline.
    1551             :      */
    1552        1854 :     result->endOfLogTLI = xlogreader->seg.ws_tli;
    1553             : 
    1554        1854 :     if (ArchiveRecoveryRequested)
    1555             :     {
    1556             :         /*
    1557             :          * We are no longer in archive recovery state.
    1558             :          *
    1559             :          * We are now done reading the old WAL.  Turn off archive fetching if
    1560             :          * it was active.
    1561             :          */
    1562             :         Assert(InArchiveRecovery);
    1563         108 :         InArchiveRecovery = false;
    1564             : 
    1565             :         /*
    1566             :          * If the ending log segment is still open, close it (to avoid
    1567             :          * problems on Windows with trying to rename or delete an open file).
    1568             :          */
    1569         108 :         if (readFile >= 0)
    1570             :         {
    1571         108 :             close(readFile);
    1572         108 :             readFile = -1;
    1573             :         }
    1574             :     }
    1575             : 
    1576             :     /*
    1577             :      * Copy the last partial block to the caller, for initializing the WAL
    1578             :      * buffer for appending new WAL.
    1579             :      */
    1580        1854 :     if (endOfLog % XLOG_BLCKSZ != 0)
    1581             :     {
    1582             :         char       *page;
    1583             :         int         len;
    1584             :         XLogRecPtr  pageBeginPtr;
    1585             : 
    1586        1812 :         pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
    1587             :         Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
    1588             : 
    1589             :         /* Copy the valid part of the last block */
    1590        1812 :         len = endOfLog % XLOG_BLCKSZ;
    1591        1812 :         page = palloc(len);
    1592        1812 :         memcpy(page, xlogreader->readBuf, len);
    1593             : 
    1594        1812 :         result->lastPageBeginPtr = pageBeginPtr;
    1595        1812 :         result->lastPage = page;
    1596             :     }
    1597             :     else
    1598             :     {
    1599             :         /* There is no partial block to copy. */
    1600          42 :         result->lastPageBeginPtr = endOfLog;
    1601          42 :         result->lastPage = NULL;
    1602             :     }
    1603             : 
    1604             :     /*
    1605             :      * Create a comment for the history file to explain why and where timeline
    1606             :      * changed.
    1607             :      */
    1608        1854 :     result->recoveryStopReason = getRecoveryStopReason();
    1609             : 
    1610        1854 :     result->lastRec = lastRec;
    1611        1854 :     result->lastRecTLI = lastRecTLI;
    1612        1854 :     result->endOfLog = endOfLog;
    1613             : 
    1614        1854 :     result->abortedRecPtr = abortedRecPtr;
    1615        1854 :     result->missingContrecPtr = missingContrecPtr;
    1616             : 
    1617        1854 :     result->standby_signal_file_found = standby_signal_file_found;
    1618        1854 :     result->recovery_signal_file_found = recovery_signal_file_found;
    1619             : 
    1620        1854 :     return result;
    1621             : }
    1622             : 
    1623             : /*
    1624             :  * Clean up the WAL reader and leftovers from restoring WAL from archive
    1625             :  */
    1626             : void
    1627        1854 : ShutdownWalRecovery(void)
    1628             : {
    1629             :     char        recoveryPath[MAXPGPATH];
    1630             : 
    1631             :     /* Final update of pg_stat_recovery_prefetch. */
    1632        1854 :     XLogPrefetcherComputeStats(xlogprefetcher);
    1633             : 
    1634             :     /* Shut down xlogreader */
    1635        1854 :     if (readFile >= 0)
    1636             :     {
    1637        1746 :         close(readFile);
    1638        1746 :         readFile = -1;
    1639             :     }
    1640        1854 :     pfree(xlogreader->private_data);
    1641        1854 :     XLogReaderFree(xlogreader);
    1642        1854 :     XLogPrefetcherFree(xlogprefetcher);
    1643             : 
    1644        1854 :     if (ArchiveRecoveryRequested)
    1645             :     {
    1646             :         /*
    1647             :          * Since there might be a partial WAL segment named RECOVERYXLOG, get
    1648             :          * rid of it.
    1649             :          */
    1650         108 :         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
    1651         108 :         unlink(recoveryPath);   /* ignore any error */
    1652             : 
    1653             :         /* Get rid of any remaining recovered timeline-history file, too */
    1654         108 :         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
    1655         108 :         unlink(recoveryPath);   /* ignore any error */
    1656             :     }
    1657             : 
    1658             :     /*
    1659             :      * We don't need the latch anymore. It's not strictly necessary to disown
    1660             :      * it, but let's do it for the sake of tidiness.
    1661             :      */
    1662        1854 :     if (ArchiveRecoveryRequested)
    1663         108 :         DisownLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
    1664        1854 : }
    1665             : 
    1666             : /*
    1667             :  * Perform WAL recovery.
    1668             :  *
    1669             :  * If the system was shut down cleanly, this is never called.
    1670             :  */
    1671             : void
    1672         446 : PerformWalRecovery(void)
    1673             : {
    1674             :     XLogRecord *record;
    1675         446 :     bool        reachedRecoveryTarget = false;
    1676             :     TimeLineID  replayTLI;
    1677             : 
    1678             :     /*
    1679             :      * Initialize shared variables for tracking progress of WAL replay, as if
    1680             :      * we had just replayed the record before the REDO location (or the
    1681             :      * checkpoint record itself, if it's a shutdown checkpoint).
    1682             :      */
    1683         446 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    1684         446 :     if (RedoStartLSN < CheckPointLoc)
    1685             :     {
    1686         242 :         XLogRecoveryCtl->lastReplayedReadRecPtr = InvalidXLogRecPtr;
    1687         242 :         XLogRecoveryCtl->lastReplayedEndRecPtr = RedoStartLSN;
    1688         242 :         XLogRecoveryCtl->lastReplayedTLI = RedoStartTLI;
    1689             :     }
    1690             :     else
    1691             :     {
    1692         204 :         XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
    1693         204 :         XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
    1694         204 :         XLogRecoveryCtl->lastReplayedTLI = CheckPointTLI;
    1695             :     }
    1696         446 :     XLogRecoveryCtl->replayEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
    1697         446 :     XLogRecoveryCtl->replayEndTLI = XLogRecoveryCtl->lastReplayedTLI;
    1698         446 :     XLogRecoveryCtl->recoveryLastXTime = 0;
    1699         446 :     XLogRecoveryCtl->currentChunkStartTime = 0;
    1700         446 :     XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
    1701         446 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    1702             : 
    1703             :     /* Also ensure XLogReceiptTime has a sane value */
    1704         446 :     XLogReceiptTime = GetCurrentTimestamp();
    1705             : 
    1706             :     /*
    1707             :      * Let postmaster know we've started redo now, so that it can launch the
    1708             :      * archiver if necessary.
    1709             :      */
    1710         446 :     if (IsUnderPostmaster)
    1711         428 :         SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
    1712             : 
    1713             :     /*
    1714             :      * Allow read-only connections immediately if we're consistent already.
    1715             :      */
    1716         446 :     CheckRecoveryConsistency();
    1717             : 
    1718             :     /*
    1719             :      * Find the first record that logically follows the checkpoint --- it
    1720             :      * might physically precede it, though.
    1721             :      */
    1722         446 :     if (RedoStartLSN < CheckPointLoc)
    1723             :     {
    1724             :         /* back up to find the record */
    1725         242 :         replayTLI = RedoStartTLI;
    1726         242 :         XLogPrefetcherBeginRead(xlogprefetcher, RedoStartLSN);
    1727         242 :         record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
    1728             : 
    1729             :         /*
    1730             :          * If a checkpoint record's redo pointer points back to an earlier
    1731             :          * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
    1732             :          * record.
    1733             :          */
    1734         242 :         if (record->xl_rmid != RM_XLOG_ID ||
    1735         242 :             (record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO)
    1736           0 :             ereport(FATAL,
    1737             :                     errmsg("unexpected record type found at redo point %X/%08X",
    1738             :                            LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)));
    1739             :     }
    1740             :     else
    1741             :     {
    1742             :         /* just have to read next record after CheckPoint */
    1743             :         Assert(xlogreader->ReadRecPtr == CheckPointLoc);
    1744         204 :         replayTLI = CheckPointTLI;
    1745         204 :         record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
    1746             :     }
    1747             : 
    1748         446 :     if (record != NULL)
    1749             :     {
    1750             :         TimestampTz xtime;
    1751             :         PGRUsage    ru0;
    1752             : 
    1753         428 :         pg_rusage_init(&ru0);
    1754             : 
    1755         428 :         InRedo = true;
    1756             : 
    1757         428 :         RmgrStartup();
    1758             : 
    1759         428 :         ereport(LOG,
    1760             :                 errmsg("redo starts at %X/%08X",
    1761             :                        LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)));
    1762             : 
    1763             :         /* Prepare to report progress of the redo phase. */
    1764         428 :         if (!StandbyMode)
    1765         216 :             begin_startup_progress_phase();
    1766             : 
    1767             :         /*
    1768             :          * main redo apply loop
    1769             :          */
    1770             :         do
    1771             :         {
    1772     5537712 :             if (!StandbyMode)
    1773      527900 :                 ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%08X",
    1774             :                                          LSN_FORMAT_ARGS(xlogreader->ReadRecPtr));
    1775             : 
    1776             : #ifdef WAL_DEBUG
    1777             :             if (XLOG_DEBUG)
    1778             :             {
    1779             :                 StringInfoData buf;
    1780             : 
    1781             :                 initStringInfo(&buf);
    1782             :                 appendStringInfo(&buf, "REDO @ %X/%08X; LSN %X/%08X: ",
    1783             :                                  LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
    1784             :                                  LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
    1785             :                 xlog_outrec(&buf, xlogreader);
    1786             :                 appendStringInfoString(&buf, " - ");
    1787             :                 xlog_outdesc(&buf, xlogreader);
    1788             :                 elog(LOG, "%s", buf.data);
    1789             :                 pfree(buf.data);
    1790             :             }
    1791             : #endif
    1792             : 
    1793             :             /* Handle interrupt signals of startup process */
    1794     5537712 :             ProcessStartupProcInterrupts();
    1795             : 
    1796             :             /*
    1797             :              * Pause WAL replay, if requested by a hot-standby session via
    1798             :              * SetRecoveryPause().
    1799             :              *
    1800             :              * Note that we intentionally don't take the info_lck spinlock
    1801             :              * here.  We might therefore read a slightly stale value of the
    1802             :              * recoveryPause flag, but it can't be very stale (no worse than
    1803             :              * the last spinlock we did acquire).  Since a pause request is a
    1804             :              * pretty asynchronous thing anyway, possibly responding to it one
    1805             :              * WAL record later than we otherwise would is a minor issue, so
    1806             :              * it doesn't seem worth adding another spinlock cycle to prevent
    1807             :              * that.
    1808             :              */
    1809     5537712 :             if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
    1810             :                 RECOVERY_NOT_PAUSED)
    1811           0 :                 recoveryPausesHere(false);
    1812             : 
    1813             :             /*
    1814             :              * Have we reached our recovery target?
    1815             :              */
    1816     5537712 :             if (recoveryStopsBefore(xlogreader))
    1817             :             {
    1818           4 :                 reachedRecoveryTarget = true;
    1819           4 :                 break;
    1820             :             }
    1821             : 
    1822             :             /*
    1823             :              * If we've been asked to lag the primary, wait on latch until
    1824             :              * enough time has passed.
    1825             :              */
    1826     5537708 :             if (recoveryApplyDelay(xlogreader))
    1827             :             {
    1828             :                 /*
    1829             :                  * We test for paused recovery again here. If user sets
    1830             :                  * delayed apply, it may be because they expect to pause
    1831             :                  * recovery in case of problems, so we must test again here
    1832             :                  * otherwise pausing during the delay-wait wouldn't work.
    1833             :                  */
    1834          56 :                 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
    1835             :                     RECOVERY_NOT_PAUSED)
    1836           0 :                     recoveryPausesHere(false);
    1837             :             }
    1838             : 
    1839             :             /*
    1840             :              * Apply the record
    1841             :              */
    1842     5537708 :             ApplyWalRecord(xlogreader, record, &replayTLI);
    1843             : 
    1844             :             /* Exit loop if we reached inclusive recovery target */
    1845     5537704 :             if (recoveryStopsAfter(xlogreader))
    1846             :             {
    1847          10 :                 reachedRecoveryTarget = true;
    1848          10 :                 break;
    1849             :             }
    1850             : 
    1851             :             /*
    1852             :              * If we replayed an LSN that someone was waiting for then walk
    1853             :              * over the shared memory array and set latches to notify the
    1854             :              * waiters.
    1855             :              */
    1856    11075388 :             if (waitLSNState &&
    1857     5537694 :                 (XLogRecoveryCtl->lastReplayedEndRecPtr >=
    1858     5537694 :                  pg_atomic_read_u64(&waitLSNState->minWaitedLSN[WAIT_LSN_TYPE_STANDBY_REPLAY])))
    1859          16 :                 WaitLSNWakeup(WAIT_LSN_TYPE_STANDBY_REPLAY, XLogRecoveryCtl->lastReplayedEndRecPtr);
    1860             : 
    1861             :             /* Else, try to fetch the next WAL record */
    1862     5537694 :             record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
    1863     5537576 :         } while (record != NULL);
    1864             : 
    1865             :         /*
    1866             :          * end of main redo apply loop
    1867             :          */
    1868             : 
    1869         306 :         if (reachedRecoveryTarget)
    1870             :         {
    1871          14 :             if (!reachedConsistency)
    1872           0 :                 ereport(FATAL,
    1873             :                         (errmsg("requested recovery stop point is before consistent recovery point")));
    1874             : 
    1875             :             /*
    1876             :              * This is the last point where we can restart recovery with a new
    1877             :              * recovery target, if we shut down and begin again. After this,
    1878             :              * Resource Managers may choose to do permanent corrective actions
    1879             :              * at end of recovery.
    1880             :              */
    1881          14 :             switch (recoveryTargetAction)
    1882             :             {
    1883           0 :                 case RECOVERY_TARGET_ACTION_SHUTDOWN:
    1884             : 
    1885             :                     /*
    1886             :                      * exit with special return code to request shutdown of
    1887             :                      * postmaster.  Log messages issued from postmaster.
    1888             :                      */
    1889           0 :                     proc_exit(3);
    1890             : 
    1891           2 :                 case RECOVERY_TARGET_ACTION_PAUSE:
    1892           2 :                     SetRecoveryPause(true);
    1893           2 :                     recoveryPausesHere(true);
    1894             : 
    1895             :                     /* drop into promote */
    1896             : 
    1897          14 :                 case RECOVERY_TARGET_ACTION_PROMOTE:
    1898          14 :                     break;
    1899             :             }
    1900             :         }
    1901             : 
    1902         306 :         RmgrCleanup();
    1903             : 
    1904         306 :         ereport(LOG,
    1905             :                 errmsg("redo done at %X/%08X system usage: %s",
    1906             :                        LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
    1907             :                        pg_rusage_show(&ru0)));
    1908         306 :         xtime = GetLatestXTime();
    1909         306 :         if (xtime)
    1910          74 :             ereport(LOG,
    1911             :                     (errmsg("last completed transaction was at log time %s",
    1912             :                             timestamptz_to_str(xtime))));
    1913             : 
    1914         306 :         InRedo = false;
    1915             :     }
    1916             :     else
    1917             :     {
    1918             :         /* there are no WAL records following the checkpoint */
    1919          18 :         ereport(LOG,
    1920             :                 (errmsg("redo is not required")));
    1921             :     }
    1922             : 
    1923             :     /*
    1924             :      * This check is intentionally after the above log messages that indicate
    1925             :      * how far recovery went.
    1926             :      */
    1927         324 :     if (ArchiveRecoveryRequested &&
    1928         110 :         recoveryTarget != RECOVERY_TARGET_UNSET &&
    1929          16 :         !reachedRecoveryTarget)
    1930           2 :         ereport(FATAL,
    1931             :                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
    1932             :                  errmsg("recovery ended before configured recovery target was reached")));
    1933         322 : }
    1934             : 
    1935             : /*
    1936             :  * Subroutine of PerformWalRecovery, to apply one WAL record.
    1937             :  */
    1938             : static void
    1939     5537708 : ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
    1940             : {
    1941             :     ErrorContextCallback errcallback;
    1942     5537708 :     bool        switchedTLI = false;
    1943             : 
    1944             :     /* Setup error traceback support for ereport() */
    1945     5537708 :     errcallback.callback = rm_redo_error_callback;
    1946     5537708 :     errcallback.arg = xlogreader;
    1947     5537708 :     errcallback.previous = error_context_stack;
    1948     5537708 :     error_context_stack = &errcallback;
    1949             : 
    1950             :     /*
    1951             :      * TransamVariables->nextXid must be beyond record's xid.
    1952             :      */
    1953     5537708 :     AdvanceNextFullTransactionIdPastXid(record->xl_xid);
    1954             : 
    1955             :     /*
    1956             :      * Before replaying this record, check if this record causes the current
    1957             :      * timeline to change. The record is already considered to be part of the
    1958             :      * new timeline, so we update replayTLI before replaying it. That's
    1959             :      * important so that replayEndTLI, which is recorded as the minimum
    1960             :      * recovery point's TLI if recovery stops after this record, is set
    1961             :      * correctly.
    1962             :      */
    1963     5537708 :     if (record->xl_rmid == RM_XLOG_ID)
    1964             :     {
    1965       87538 :         TimeLineID  newReplayTLI = *replayTLI;
    1966       87538 :         TimeLineID  prevReplayTLI = *replayTLI;
    1967       87538 :         uint8       info = record->xl_info & ~XLR_INFO_MASK;
    1968             : 
    1969       87538 :         if (info == XLOG_CHECKPOINT_SHUTDOWN)
    1970             :         {
    1971             :             CheckPoint  checkPoint;
    1972             : 
    1973          78 :             memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
    1974          78 :             newReplayTLI = checkPoint.ThisTimeLineID;
    1975          78 :             prevReplayTLI = checkPoint.PrevTimeLineID;
    1976             :         }
    1977       87460 :         else if (info == XLOG_END_OF_RECOVERY)
    1978             :         {
    1979             :             xl_end_of_recovery xlrec;
    1980             : 
    1981          22 :             memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
    1982          22 :             newReplayTLI = xlrec.ThisTimeLineID;
    1983          22 :             prevReplayTLI = xlrec.PrevTimeLineID;
    1984             :         }
    1985             : 
    1986       87538 :         if (newReplayTLI != *replayTLI)
    1987             :         {
    1988             :             /* Check that it's OK to switch to this TLI */
    1989          24 :             checkTimeLineSwitch(xlogreader->EndRecPtr,
    1990             :                                 newReplayTLI, prevReplayTLI, *replayTLI);
    1991             : 
    1992             :             /* Following WAL records should be run with new TLI */
    1993          24 :             *replayTLI = newReplayTLI;
    1994          24 :             switchedTLI = true;
    1995             :         }
    1996             :     }
    1997             : 
    1998             :     /*
    1999             :      * Update shared replayEndRecPtr before replaying this record, so that
    2000             :      * XLogFlush will update minRecoveryPoint correctly.
    2001             :      */
    2002     5537708 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    2003     5537708 :     XLogRecoveryCtl->replayEndRecPtr = xlogreader->EndRecPtr;
    2004     5537708 :     XLogRecoveryCtl->replayEndTLI = *replayTLI;
    2005     5537708 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    2006             : 
    2007             :     /*
    2008             :      * If we are attempting to enter Hot Standby mode, process XIDs we see
    2009             :      */
    2010     5537708 :     if (standbyState >= STANDBY_INITIALIZED &&
    2011     5049598 :         TransactionIdIsValid(record->xl_xid))
    2012     4947146 :         RecordKnownAssignedTransactionIds(record->xl_xid);
    2013             : 
    2014             :     /*
    2015             :      * Some XLOG record types that are related to recovery are processed
    2016             :      * directly here, rather than in xlog_redo()
    2017             :      */
    2018     5537708 :     if (record->xl_rmid == RM_XLOG_ID)
    2019       87538 :         xlogrecovery_redo(xlogreader, *replayTLI);
    2020             : 
    2021             :     /* Now apply the WAL record itself */
    2022     5537708 :     GetRmgr(record->xl_rmid).rm_redo(xlogreader);
    2023             : 
    2024             :     /*
    2025             :      * After redo, check whether the backup pages associated with the WAL
    2026             :      * record are consistent with the existing pages. This check is done only
    2027             :      * if consistency check is enabled for this record.
    2028             :      */
    2029     5537704 :     if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
    2030     4408750 :         verifyBackupPageConsistency(xlogreader);
    2031             : 
    2032             :     /* Pop the error context stack */
    2033     5537704 :     error_context_stack = errcallback.previous;
    2034             : 
    2035             :     /*
    2036             :      * Update lastReplayedEndRecPtr after this record has been successfully
    2037             :      * replayed.
    2038             :      */
    2039     5537704 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    2040     5537704 :     XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
    2041     5537704 :     XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
    2042     5537704 :     XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
    2043     5537704 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    2044             : 
    2045             :     /* ------
    2046             :      * Wakeup walsenders:
    2047             :      *
    2048             :      * On the standby, the WAL is flushed first (which will only wake up
    2049             :      * physical walsenders) and then applied, which will only wake up logical
    2050             :      * walsenders.
    2051             :      *
    2052             :      * Indeed, logical walsenders on standby can't decode and send data until
    2053             :      * it's been applied.
    2054             :      *
    2055             :      * Physical walsenders don't need to be woken up during replay unless
    2056             :      * cascading replication is allowed and a timeline change occurred (so that
    2057             :      * they can notice that they are on a new timeline).
    2058             :      *
    2059             :      * That's why the wake up conditions are for:
    2060             :      *
    2061             :      *  - physical walsenders in case of a new timeline and cascade
    2062             :      *    replication is allowed
    2063             :      *  - logical walsenders in case cascade replication is allowed (could not
    2064             :      *    be created otherwise)
    2065             :      * ------
    2066             :      */
    2067     5537704 :     if (AllowCascadeReplication())
    2068     5158888 :         WalSndWakeup(switchedTLI, true);
    2069             : 
    2070             :     /*
    2071             :      * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
    2072             :      * receiver so that it notices the updated lastReplayedEndRecPtr and sends
    2073             :      * a reply to the primary.
    2074             :      */
    2075     5537704 :     if (doRequestWalReceiverReply)
    2076             :     {
    2077           4 :         doRequestWalReceiverReply = false;
    2078           4 :         WalRcvForceReply();
    2079             :     }
    2080             : 
    2081             :     /* Allow read-only connections if we're consistent now */
    2082     5537704 :     CheckRecoveryConsistency();
    2083             : 
    2084             :     /* Is this a timeline switch? */
    2085     5537704 :     if (switchedTLI)
    2086             :     {
    2087             :         /*
    2088             :          * Before we continue on the new timeline, clean up any (possibly
    2089             :          * bogus) future WAL segments on the old timeline.
    2090             :          */
    2091          24 :         RemoveNonParentXlogFiles(xlogreader->EndRecPtr, *replayTLI);
    2092             : 
    2093             :         /* Reset the prefetcher. */
    2094          24 :         XLogPrefetchReconfigure();
    2095             :     }
    2096     5537704 : }
    2097             : 
    2098             : /*
    2099             :  * Some XLOG RM record types that are directly related to WAL recovery are
    2100             :  * handled here rather than in xlog_redo().
                     :  *
                     :  * 'record' must be an RM_XLOG_ID record (asserted below).  Only two record
                     :  * types are acted upon; any other info value is silently ignored:
                     :  *
                     :  *  - XLOG_OVERWRITE_CONTRECORD: verify that the overwritten LSN stored in
                     :  *    the record payload matches record->overwrittenRecPtr (FATAL on
                     :  *    mismatch), then reset abortedRecPtr, missingContrecPtr and
                     :  *    record->overwrittenRecPtr to InvalidXLogRecPtr.
                     :  *  - XLOG_BACKUP_END: if the backup start point stored in the payload
                     :  *    equals backupStartPoint, set backupEndPoint to the record's end LSN;
                     :  *    otherwise only a DEBUG1 message is emitted.
                     :  *
                     :  * NOTE(review): 'replayTLI' is not referenced in the body of this function.
    2101             :  */
    2102             : static void
    2103       87538 : xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
    2104             : {
    2105       87538 :     uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    2106       87538 :     XLogRecPtr  lsn = record->EndRecPtr;
    2107             : 
    2108             :     Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
    2109             : 
    2110       87538 :     if (info == XLOG_OVERWRITE_CONTRECORD)
    2111             :     {
    2112             :         /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
    2113             :         xl_overwrite_contrecord xlrec;
    2114             : 
    2115           2 :         memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
    2116           2 :         if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
    2117           0 :             elog(FATAL, "mismatching overwritten LSN %X/%08X -> %X/%08X",
    2118             :                  LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
    2119             :                  LSN_FORMAT_ARGS(record->overwrittenRecPtr));
    2120             : 
    2121             :         /* We have safely skipped the aborted record */
    2122           2 :         abortedRecPtr = InvalidXLogRecPtr;
    2123           2 :         missingContrecPtr = InvalidXLogRecPtr;
    2124             : 
    2125           2 :         ereport(LOG,
    2126             :                 errmsg("successfully skipped missing contrecord at %X/%08X, overwritten at %s",
    2127             :                        LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
    2128             :                        timestamptz_to_str(xlrec.overwrite_time)));
    2129             : 
    2130             :         /* Verifying the record should only happen once */
    2131           2 :         record->overwrittenRecPtr = InvalidXLogRecPtr;
    2132             :     }
    2133       87536 :     else if (info == XLOG_BACKUP_END)
    2134             :     {
    2135             :         XLogRecPtr  startpoint;
    2136             : 
    2137         188 :         memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
    2138             : 
    2139         188 :         if (backupStartPoint == startpoint)
    2140             :         {
    2141             :             /*
    2142             :              * We have reached the end of base backup, the point where
    2143             :              * pg_backup_stop() was done.  The data on disk is now consistent
    2144             :              * (assuming we have also reached minRecoveryPoint).  Set
    2145             :              * backupEndPoint to the current LSN, so that the next call to
    2146             :              * CheckRecoveryConsistency() will notice it and do the
    2147             :              * end-of-backup processing.
    2148             :              */
    2149         154 :             elog(DEBUG1, "end of backup record reached");
    2150             : 
    2151         154 :             backupEndPoint = lsn;
    2152             :         }
    2153             :         else
    2154          34 :             elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%08X, waiting for %X/%08X",
    2155             :                  LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint));
    2156             :     }
    2157       87538 : }
    2158             : 
    2159             : /*
    2160             :  * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
    2161             :  * directories.
    2162             :  *
    2163             :  * Replay of database creation XLOG records for databases that were later
    2164             :  * dropped can create fake directories in pg_tblspc.  By the time consistency
    2165             :  * is reached these directories should have been removed; here we verify
    2166             :  * that this did indeed happen.  This is to be called at the point where
    2167             :  * consistent state is reached.
    2168             :  *
    2169             :  * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
    2170             :  * useful for testing purposes, and also allows for an escape hatch in case
    2171             :  * things go south.
                     :  *
                     :  * Only entries whose names consist entirely of decimal digits are
                     :  * examined; every other entry in the directory is skipped.  An examined
                     :  * entry is reported unless get_dirent_type() classifies it as
                     :  * PGFILETYPE_LNK.
                     :  *
                     :  * NOTE(review): the handle obtained from AllocateDir() is never passed to
                     :  * FreeDir() within this function -- confirm whether leaving it open is
                     :  * intentional.
    2172             :  */
    2173             : static void
    2174         244 : CheckTablespaceDirectory(void)
    2175             : {
    2176             :     DIR        *dir;
    2177             :     struct dirent *de;
    2178             : 
    2179         244 :     dir = AllocateDir(PG_TBLSPC_DIR);
    2180         746 :     while ((de = ReadDir(dir, PG_TBLSPC_DIR)) != NULL)
    2181             :     {
    2182             :         char        path[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
    2183             : 
    2184             :         /* Skip entries of non-oid names */
    2185         502 :         if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
    2186         488 :             continue;
    2187             : 
    2188          14 :         snprintf(path, sizeof(path), "%s/%s", PG_TBLSPC_DIR, de->d_name);
    2189             : 
    2190          14 :         if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
    2191           8 :             ereport(allow_in_place_tablespaces ? WARNING : PANIC,
    2192             :                     (errcode(ERRCODE_DATA_CORRUPTED),
    2193             :                      errmsg("unexpected directory entry \"%s\" found in %s",
    2194             :                             de->d_name, PG_TBLSPC_DIR),
    2195             :                      errdetail("All directory entries in %s/ should be symbolic links.",
    2196             :                                PG_TBLSPC_DIR),
    2197             :                      errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
    2198             :     }
    2199         244 : }
    2200             : 
    2201             : /*
    2202             :  * Checks if recovery has reached a consistent state. When consistency is
    2203             :  * reached and we have a valid starting standby snapshot, tell postmaster
    2204             :  * that it can start accepting read-only connections.
                     :  *
                     :  * Returns immediately when minRecoveryPoint is not valid.  Otherwise three
                     :  * independent checks are made against the last replayed end-of-record
                     :  * position, in this order:
                     :  *
                     :  *  1. End of backup: once backupEndPoint is valid and has been replayed,
                     :  *     call ReachedEndOfBackup() and clear backupStartPoint, backupEndPoint
                     :  *     and backupEndRequired.
                     :  *  2. Consistency: if not yet consistent, no backup end is still required,
                     :  *     and minRecoveryPoint has been replayed, run XLogCheckInvalidPages()
                     :  *     and CheckTablespaceDirectory(), set reachedConsistency, and send
                     :  *     PMSIGNAL_RECOVERY_CONSISTENT.
                     :  *  3. Hot standby: if standbyState is STANDBY_SNAPSHOT_READY, consistency
                     :  *     has been reached, hot standby is not yet marked active locally, and
                     :  *     IsUnderPostmaster is set, set SharedHotStandbyActive (under
                     :  *     info_lck) and LocalHotStandbyActive, and send
                     :  *     PMSIGNAL_BEGIN_HOT_STANDBY.
                     :  *
                     :  * Step 1 can clear backupEndRequired, which allows step 2 to succeed
                     :  * within the same call.
    2205             :  */
    2206             : static void
    2207     5538154 : CheckRecoveryConsistency(void)
    2208             : {
    2209             :     XLogRecPtr  lastReplayedEndRecPtr;
    2210             :     TimeLineID  lastReplayedTLI;
    2211             : 
    2212             :     /*
    2213             :      * During crash recovery, we don't reach a consistent state until we've
    2214             :      * replayed all the WAL.
    2215             :      */
    2216     5538154 :     if (!XLogRecPtrIsValid(minRecoveryPoint))
    2217      517684 :         return;
    2218             : 
    2219             :     Assert(InArchiveRecovery);
    2220             : 
    2221             :     /*
    2222             :      * Assume that we are called in the startup process, and hence don't need
    2223             :      * a lock to read lastReplayedEndRecPtr and lastReplayedTLI.
    2224             :      */
    2225     5020470 :     lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
    2226     5020470 :     lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
    2227             : 
    2228             :     /*
    2229             :      * Have we reached the point where our base backup was completed?
    2230             :      */
    2231     5020470 :     if (XLogRecPtrIsValid(backupEndPoint) &&
    2232         226 :         backupEndPoint <= lastReplayedEndRecPtr)
    2233             :     {
                     :         /* Saved for the LOG message below, as both are reset first. */
    2234         158 :         XLogRecPtr  saveBackupStartPoint = backupStartPoint;
    2235         158 :         XLogRecPtr  saveBackupEndPoint = backupEndPoint;
    2236             : 
    2237         158 :         elog(DEBUG1, "end of backup reached");
    2238             : 
    2239             :         /*
    2240             :          * We have reached the end of base backup, as indicated by pg_control.
    2241             :          * Update the control file accordingly.
    2242             :          */
    2243         158 :         ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
    2244         158 :         backupStartPoint = InvalidXLogRecPtr;
    2245         158 :         backupEndPoint = InvalidXLogRecPtr;
    2246         158 :         backupEndRequired = false;
    2247             : 
    2248         158 :         ereport(LOG,
    2249             :                 errmsg("completed backup recovery with redo LSN %X/%08X and end LSN %X/%08X",
    2250             :                        LSN_FORMAT_ARGS(saveBackupStartPoint),
    2251             :                        LSN_FORMAT_ARGS(saveBackupEndPoint)));
    2252             :     }
    2253             : 
    2254             :     /*
    2255             :      * Have we passed our safe starting point? Note that minRecoveryPoint is
    2256             :      * known to be incorrectly set if recovering from a backup, until the
    2257             :      * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
    2258             :      * All we know prior to that is that we're not consistent yet.
    2259             :      */
    2260     5020470 :     if (!reachedConsistency && !backupEndRequired &&
    2261       15358 :         minRecoveryPoint <= lastReplayedEndRecPtr)
    2262             :     {
    2263             :         /*
    2264             :          * Check to see if the XLOG sequence contained any unresolved
    2265             :          * references to uninitialized pages.
    2266             :          */
    2267         244 :         XLogCheckInvalidPages();
    2268             : 
    2269             :         /*
    2270             :          * Check that pg_tblspc doesn't contain any real directories. Replay
    2271             :          * of Database/CREATE_* records may have created fictitious tablespace
    2272             :          * directories that should have been removed by the time consistency
    2273             :          * was reached.
    2274             :          */
    2275         244 :         CheckTablespaceDirectory();
    2276             : 
    2277         244 :         reachedConsistency = true;
    2278         244 :         SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
    2279         244 :         ereport(LOG,
    2280             :                 errmsg("consistent recovery state reached at %X/%08X",
    2281             :                        LSN_FORMAT_ARGS(lastReplayedEndRecPtr)));
    2282             :     }
    2283             : 
    2284             :     /*
    2285             :      * Have we got a valid starting snapshot that will allow queries to be
    2286             :      * run? If so, we can tell postmaster that the database is consistent now,
    2287             :      * enabling connections.
    2288             :      */
    2289     5020470 :     if (standbyState == STANDBY_SNAPSHOT_READY &&
    2290     5019992 :         !LocalHotStandbyActive &&
    2291         228 :         reachedConsistency &&
    2292             :         IsUnderPostmaster)
    2293             :     {
    2294         228 :         SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    2295         228 :         XLogRecoveryCtl->SharedHotStandbyActive = true;
    2296         228 :         SpinLockRelease(&XLogRecoveryCtl->info_lck);
    2297             : 
    2298         228 :         LocalHotStandbyActive = true;
    2299             : 
    2300         228 :         SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
    2301             :     }
    2302             : }
    2303             : 
    2304             : /*
    2305             :  * Error context callback for errors occurring during rm_redo().
                     :  *
                     :  * 'arg' is interpreted as a pointer to an XLogReaderState.  The context
                     :  * line reports the record's ReadRecPtr together with the description built
                     :  * by xlog_outdesc() and xlog_block_info(); the temporary string buffer is
                     :  * freed before returning.
    2306             :  */
    2307             : static void
    2308         296 : rm_redo_error_callback(void *arg)
    2309             : {
    2310         296 :     XLogReaderState *record = (XLogReaderState *) arg;
    2311             :     StringInfoData buf;
    2312             : 
    2313         296 :     initStringInfo(&buf);
    2314         296 :     xlog_outdesc(&buf, record);
    2315         296 :     xlog_block_info(&buf, record);
    2316             : 
    2317             :     /* translator: %s is a WAL record description */
    2318         296 :     errcontext("WAL redo at %X/%08X for %s",
    2319         296 :                LSN_FORMAT_ARGS(record->ReadRecPtr),
    2320             :                buf.data);
    2321             : 
    2322         296 :     pfree(buf.data);
    2323         296 : }
    2324             : 
    2325             : /*
    2326             :  * Appends to 'buf' a string describing an XLogRecord: the resource manager
    2327             :  * name, a slash, the record's identity, a colon and a space, followed by
                     :  * whatever further description the rmgr's rm_desc callback appends.
                     :  *
                     :  * If rm_identify returns NULL for the record's info value, the identity is
                     :  * rendered as "UNKNOWN (<hex>)", where <hex> is the info value with the
                     :  * XLR_INFO_MASK bits cleared.
    2328             :  */
    2329             : void
    2330         296 : xlog_outdesc(StringInfo buf, XLogReaderState *record)
    2331             : {
    2332         296 :     RmgrData    rmgr = GetRmgr(XLogRecGetRmid(record));
    2333         296 :     uint8       info = XLogRecGetInfo(record);
    2334             :     const char *id;
    2335             : 
    2336         296 :     appendStringInfoString(buf, rmgr.rm_name);
    2337         296 :     appendStringInfoChar(buf, '/');
    2338             : 
    2339         296 :     id = rmgr.rm_identify(info);
    2340         296 :     if (id == NULL)
    2341           0 :         appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
    2342             :     else
    2343         296 :         appendStringInfo(buf, "%s: ", id);
    2344             : 
    2345         296 :     rmgr.rm_desc(buf, record);
    2346         296 : }
    2347             : 
    2348             : #ifdef WAL_DEBUG
    2349             : /*
                     :  * Appends to 'buf' the values of XLogRecGetPrev(), XLogRecGetXid() and
                     :  * XLogRecGetDataLen() for the record, formatted as
                     :  * "prev X/X; xid N; len N", followed by the output of xlog_block_info().
                     :  *
                     :  * Only compiled when WAL_DEBUG is defined.
                     :  */
    2350             : static void
    2351             : xlog_outrec(StringInfo buf, XLogReaderState *record)
    2352             : {
    2353             :     appendStringInfo(buf, "prev %X/%08X; xid %u",
    2354             :                      LSN_FORMAT_ARGS(XLogRecGetPrev(record)),
    2355             :                      XLogRecGetXid(record));
    2356             : 
    2357             :     appendStringInfo(buf, "; len %u",
    2358             :                      XLogRecGetDataLen(record));
    2359             : 
    2360             :     xlog_block_info(buf, record);
    2361             : }
    2362             : #endif                          /* WAL_DEBUG */
    2363             : 
    2364             : /*
    2365             :  * Appends to 'buf' information about the blocks referenced by an
    2366             :  * XLogRecord.
                     :  *
                     :  * Block ids 0 .. XLogRecMaxBlockId(record) are visited; ids for which
                     :  * XLogRecGetBlockTagExtended() returns false are skipped.  Each remaining
                     :  * block adds "; blkref #N: rel <spcOid>/<dbOid>/<relNumber>, blk <blk>",
                     :  * with ", fork <forknum>" inserted before the block number when the fork
                     :  * is not MAIN_FORKNUM, and a trailing " FPW" when XLogRecHasBlockImage()
                     :  * is true for that block.
    2367             :  */
    2368             : static void
    2369         296 : xlog_block_info(StringInfo buf, XLogReaderState *record)
    2370             : {
    2371             :     int         block_id;
    2372             : 
    2373             :     /* decode block references */
    2374         388 :     for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
    2375             :     {
    2376             :         RelFileLocator rlocator;
    2377             :         ForkNumber  forknum;
    2378             :         BlockNumber blk;
    2379             : 
    2380          92 :         if (!XLogRecGetBlockTagExtended(record, block_id,
    2381             :                                         &rlocator, &forknum, &blk, NULL))
    2382           0 :             continue;
    2383             : 
    2384          92 :         if (forknum != MAIN_FORKNUM)
    2385           6 :             appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
    2386             :                              block_id,
    2387             :                              rlocator.spcOid, rlocator.dbOid,
    2388             :                              rlocator.relNumber,
    2389             :                              forknum,
    2390             :                              blk);
    2391             :         else
    2392          86 :             appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
    2393             :                              block_id,
    2394             :                              rlocator.spcOid, rlocator.dbOid,
    2395             :                              rlocator.relNumber,
    2396             :                              blk);
    2397          92 :         if (XLogRecHasBlockImage(record, block_id))
    2398          56 :             appendStringInfoString(buf, " FPW");
    2399             :     }
    2400         296 : }
    2401             : 
    2402             : 
    2403             : /*
    2404             :  * Check that it's OK to switch to new timeline during recovery.
    2405             :  *
    2406             :  * 'lsn' is the address of the shutdown checkpoint record we're about to
    2407             :  * replay. (Currently, timeline can only change at a shutdown checkpoint).
                     :  *
                     :  * 'newTLI' and 'prevTLI' are the timeline IDs to validate (the error
                     :  * messages describe them as coming from the checkpoint record), and
                     :  * 'replayTLI' is the timeline they are compared against.
                     :  *
                     :  * Three conditions are checked, each raising PANIC on failure:
                     :  *  1. prevTLI must equal replayTLI;
                     :  *  2. newTLI must not be lower than replayTLI, and tliInHistory() must
                     :  *     accept it against expectedTLEs;
                     :  *  3. when minRecoveryPoint is valid and 'lsn' precedes it, newTLI must not
                     :  *     be greater than minRecoveryPointTLI.
                     :  *
                     :  * Returns normally, without modifying any state, if all checks pass.
    2408             :  */
    2409             : static void
    2410          24 : checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI,
    2411             :                     TimeLineID replayTLI)
    2412             : {
    2413             :     /* Check that the record agrees on what the current (old) timeline is */
    2414          24 :     if (prevTLI != replayTLI)
    2415           0 :         ereport(PANIC,
    2416             :                 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
    2417             :                         prevTLI, replayTLI)));
    2418             : 
    2419             :     /*
    2420             :      * The new timeline better be in the list of timelines we expect to see,
    2421             :      * according to the timeline history. It should also not decrease.
    2422             :      */
    2423          24 :     if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
    2424           0 :         ereport(PANIC,
    2425             :                 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
    2426             :                         newTLI, replayTLI)));
    2427             : 
    2428             :     /*
    2429             :      * If we have not yet reached min recovery point, and we're about to
    2430             :      * switch to a timeline greater than the timeline of the min recovery
    2431             :      * point: trouble. After switching to the new timeline, we could not
    2432             :      * possibly visit the min recovery point on the correct timeline anymore.
    2433             :      * This can happen if there is a newer timeline in the archive that
    2434             :      * branched before the timeline the min recovery point is on, and you
    2435             :      * attempt to do PITR to the new timeline.
    2436             :      */
    2437          24 :     if (XLogRecPtrIsValid(minRecoveryPoint) &&
    2438          20 :         lsn < minRecoveryPoint &&
    2439           2 :         newTLI > minRecoveryPointTLI)
    2440           0 :         ereport(PANIC,
    2441             :                 errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%08X on timeline %u",
    2442             :                        newTLI,
    2443             :                        LSN_FORMAT_ARGS(minRecoveryPoint),
    2444             :                        minRecoveryPointTLI));
    2445             : 
    2446             :     /* Looks good */
    2447          24 : }
    2448             : 
    2449             : 
    2450             : /*
    2451             :  * Extract timestamp from WAL record.
    2452             :  *
    2453             :  * If the record contains a timestamp, returns true, and saves the timestamp
    2454             :  * in *recordXtime. If the record type has no timestamp, returns false.
    2455             :  * Currently, only transaction commit/abort records and restore points contain
    2456             :  * timestamps.
                     :  *
                     :  * Recognized records and the field the timestamp is read from:
                     :  *  - RM_XLOG_ID / XLOG_RESTORE_POINT: xl_restore_point.rp_time
                     :  *  - RM_XACT_ID / XLOG_XACT_COMMIT, XLOG_XACT_COMMIT_PREPARED:
                     :  *    xl_xact_commit.xact_time
                     :  *  - RM_XACT_ID / XLOG_XACT_ABORT, XLOG_XACT_ABORT_PREPARED:
                     :  *    xl_xact_abort.xact_time
                     :  *
                     :  * *recordXtime is not written when false is returned.
    2457             :  */
    2458             : static bool
    2459       88890 : getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
    2460             : {
    2461       88890 :     uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    2462       88890 :     uint8       xact_info = info & XLOG_XACT_OPMASK;
    2463       88890 :     uint8       rmid = XLogRecGetRmid(record);
    2464             : 
    2465       88890 :     if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
    2466             :     {
    2467           4 :         *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
    2468           4 :         return true;
    2469             :     }
    2470       88886 :     if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
    2471             :                                xact_info == XLOG_XACT_COMMIT_PREPARED))
    2472             :     {
    2473       81442 :         *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
    2474       81442 :         return true;
    2475             :     }
    2476        7444 :     if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
    2477             :                                xact_info == XLOG_XACT_ABORT_PREPARED))
    2478             :     {
    2479        7444 :         *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
    2480        7444 :         return true;
    2481             :     }
    2482           0 :     return false;
    2483             : }
    2484             : 
    2485             : /*
    2486             :  * Checks whether the current buffer page and backup page stored in the
    2487             :  * WAL record are consistent or not. Before comparing the two pages, a
    2488             :  * masking can be applied to the pages to ignore certain areas like hint bits,
    2489             :  * unused space between pd_lower and pd_upper among other things. This
    2490             :  * function should be called once WAL replay has been completed for a
    2491             :  * given record.
                     :  *
                     :  * Does nothing for records without block references.  Otherwise each block
                     :  * id up to XLogRecMaxBlockId(record) is examined, and a block is skipped
                     :  * when it has no block reference, when its image was already applied by
                     :  * the record, when no valid buffer can be obtained for it, or when the
                     :  * page LSN is already ahead of record->EndRecPtr.
                     :  *
                     :  * The comparison uses the replay_image_masked and primary_image_masked
                     :  * scratch buffers.  A failure to restore the block image raises ERROR; a
                     :  * mismatch between the two (masked) images raises FATAL.
    2492             :  */
    2493             : static void
    2494     4408750 : verifyBackupPageConsistency(XLogReaderState *record)
    2495             : {
    2496     4408750 :     RmgrData    rmgr = GetRmgr(XLogRecGetRmid(record));
    2497             :     RelFileLocator rlocator;
    2498             :     ForkNumber  forknum;
    2499             :     BlockNumber blkno;
    2500             :     int         block_id;
    2501             : 
    2502             :     /* Records with no backup blocks have no need for consistency checks. */
    2503     4408750 :     if (!XLogRecHasAnyBlockRefs(record))
    2504         106 :         return;
    2505             : 
    2506             :     Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
    2507             : 
    2508     9153514 :     for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
    2509             :     {
    2510             :         Buffer      buf;
    2511             :         Page        page;
    2512             : 
    2513     4744870 :         if (!XLogRecGetBlockTagExtended(record, block_id,
    2514             :                                         &rlocator, &forknum, &blkno, NULL))
    2515             :         {
    2516             :             /*
    2517             :              * WAL record doesn't contain a block reference with the given id.
    2518             :              * Do nothing.
    2519             :              */
    2520        4022 :             continue;
    2521             :         }
    2522             : 
    2523             :         Assert(XLogRecHasBlockImage(record, block_id));
    2524             : 
    2525     4740848 :         if (XLogRecBlockImageApply(record, block_id))
    2526             :         {
    2527             :             /*
    2528             :              * WAL record has already applied the page, so bypass the
    2529             :              * consistency check as that would result in comparing the full
    2530             :              * page stored in the record with itself.
    2531             :              */
    2532       51084 :             continue;
    2533             :         }
    2534             : 
    2535             :         /*
    2536             :          * Read the contents from the current buffer and store it in a
    2537             :          * temporary page.
    2538             :          */
    2539     4689764 :         buf = XLogReadBufferExtended(rlocator, forknum, blkno,
    2540             :                                      RBM_NORMAL_NO_LOG,
    2541             :                                      InvalidBuffer);
    2542     4689764 :         if (!BufferIsValid(buf))
    2543           0 :             continue;
    2544             : 
    2545     4689764 :         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
    2546     4689764 :         page = BufferGetPage(buf);
    2547             : 
    2548             :         /*
    2549             :          * Take a copy of the local page where WAL has been applied to have a
    2550             :          * comparison base before masking it...
    2551             :          */
    2552     4689764 :         memcpy(replay_image_masked, page, BLCKSZ);
    2553             : 
    2554             :         /* No need for this page anymore now that a copy is in. */
    2555     4689764 :         UnlockReleaseBuffer(buf);
    2556             : 
    2557             :         /*
    2558             :          * If the block LSN is already ahead of this WAL record, we can't
    2559             :          * expect contents to match.  This can happen if recovery is
    2560             :          * restarted.
    2561             :          */
    2562     4689764 :         if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
    2563           0 :             continue;
    2564             : 
    2565             :         /*
    2566             :          * Read the contents from the backup copy, stored in WAL record and
    2567             :          * store it in a temporary page. There is no need to allocate a new
    2568             :          * page here, a local buffer is fine to hold its contents and a mask
    2569             :          * can be directly applied on it.
    2570             :          */
    2571     4689764 :         if (!RestoreBlockImage(record, block_id, primary_image_masked))
    2572           0 :             ereport(ERROR,
    2573             :                     (errcode(ERRCODE_INTERNAL_ERROR),
    2574             :                      errmsg_internal("%s", record->errormsg_buf)));
    2575             : 
    2576             :         /*
    2577             :          * If masking function is defined, mask both the primary and replay
    2578             :          * images
    2579             :          */
    2580     4689764 :         if (rmgr.rm_mask != NULL)
    2581             :         {
    2582     4689764 :             rmgr.rm_mask(replay_image_masked, blkno);
    2583     4689764 :             rmgr.rm_mask(primary_image_masked, blkno);
    2584             :         }
    2585             : 
    2586             :         /* Time to compare the primary and replay images. */
    2587     4689764 :         if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
    2588             :         {
    2589           0 :             elog(FATAL,
    2590             :                  "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
    2591             :                  rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
    2592             :                  forknum, blkno);
    2593             :         }
    2594             :     }
    2595             : }
    2596             : 
    2597             : /*
    2598             :  * For point-in-time recovery, this function decides whether we want to
    2599             :  * stop applying the XLOG before the current record.
    2600             :  *
    2601             :  * Returns true if we are stopping, false otherwise. If stopping, some
    2602             :  * information is saved in recoveryStopXid et al for use in annotating the
    2603             :  * new timeline's history file.
    2604             :  */
    2605             : static bool
    2606     5537712 : recoveryStopsBefore(XLogReaderState *record)
    2607             : {
    2608     5537712 :     bool        stopsHere = false;
    2609             :     uint8       xact_info;
    2610             :     bool        isCommit;
    2611     5537712 :     TimestampTz recordXtime = 0;
    2612             :     TransactionId recordXid;
    2613             : 
    2614             :     /*
    2615             :      * Ignore recovery target settings when not in archive recovery (meaning
    2616             :      * we are in crash recovery).
    2617             :      */
    2618     5537712 :     if (!ArchiveRecoveryRequested)
    2619      488082 :         return false;
    2620             : 
    2621             :     /* Check if we should stop as soon as reaching consistency */
    2622     5049630 :     if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
    2623             :     {
    2624           0 :         ereport(LOG,
    2625             :                 (errmsg("recovery stopping after reaching consistency")));
    2626             : 
    2627           0 :         recoveryStopAfter = false;
    2628           0 :         recoveryStopXid = InvalidTransactionId;
    2629           0 :         recoveryStopLSN = InvalidXLogRecPtr;
    2630           0 :         recoveryStopTime = 0;
    2631           0 :         recoveryStopName[0] = '\0';
    2632           0 :         return true;
    2633             :     }
    2634             : 
    2635             :     /* Check if target LSN has been reached */
    2636     5049630 :     if (recoveryTarget == RECOVERY_TARGET_LSN &&
    2637       17042 :         !recoveryTargetInclusive &&
    2638         982 :         record->ReadRecPtr >= recoveryTargetLSN)
    2639             :     {
    2640           4 :         recoveryStopAfter = false;
    2641           4 :         recoveryStopXid = InvalidTransactionId;
    2642           4 :         recoveryStopLSN = record->ReadRecPtr;
    2643           4 :         recoveryStopTime = 0;
    2644           4 :         recoveryStopName[0] = '\0';
    2645           4 :         ereport(LOG,
    2646             :                 errmsg("recovery stopping before WAL location (LSN) \"%X/%08X\"",
    2647             :                        LSN_FORMAT_ARGS(recoveryStopLSN)));
    2648           4 :         return true;
    2649             :     }
    2650             : 
    2651             :     /* Otherwise we only consider stopping before COMMIT or ABORT records. */
    2652     5049626 :     if (XLogRecGetRmid(record) != RM_XACT_ID)
    2653     5004594 :         return false;
    2654             : 
    2655       45032 :     xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
    2656             : 
    2657       45032 :     if (xact_info == XLOG_XACT_COMMIT)
    2658             :     {
    2659       40634 :         isCommit = true;
    2660       40634 :         recordXid = XLogRecGetXid(record);
    2661             :     }
    2662        4398 :     else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
    2663             :     {
    2664          60 :         xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
    2665             :         xl_xact_parsed_commit parsed;
    2666             : 
    2667          60 :         isCommit = true;
    2668          60 :         ParseCommitRecord(XLogRecGetInfo(record),
    2669             :                           xlrec,
    2670             :                           &parsed);
    2671          60 :         recordXid = parsed.twophase_xid;
    2672             :     }
    2673        4338 :     else if (xact_info == XLOG_XACT_ABORT)
    2674             :     {
    2675        3696 :         isCommit = false;
    2676        3696 :         recordXid = XLogRecGetXid(record);
    2677             :     }
    2678         642 :     else if (xact_info == XLOG_XACT_ABORT_PREPARED)
    2679             :     {
    2680          26 :         xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
    2681             :         xl_xact_parsed_abort parsed;
    2682             : 
    2683          26 :         isCommit = false;
    2684          26 :         ParseAbortRecord(XLogRecGetInfo(record),
    2685             :                          xlrec,
    2686             :                          &parsed);
    2687          26 :         recordXid = parsed.twophase_xid;
    2688             :     }
    2689             :     else
    2690         616 :         return false;
    2691             : 
    2692       44416 :     if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
    2693             :     {
    2694             :         /*
    2695             :          * There can be only one transaction end record with this exact
    2696             :          * transactionid
    2697             :          *
    2698             :          * when testing for an xid, we MUST test for equality only, since
    2699             :          * transactions are numbered in the order they start, not the order
    2700             :          * they complete. A higher numbered xid will complete before you about
    2701             :          * 50% of the time...
    2702             :          */
    2703           0 :         stopsHere = (recordXid == recoveryTargetXid);
    2704             :     }
    2705             : 
    2706             :     /*
    2707             :      * Note: we must fetch recordXtime regardless of recoveryTarget setting.
    2708             :      * We don't expect getRecordTimestamp ever to fail, since we already know
    2709             :      * this is a commit or abort record; but test its result anyway.
    2710             :      */
    2711       44416 :     if (getRecordTimestamp(record, &recordXtime) &&
    2712       44416 :         recoveryTarget == RECOVERY_TARGET_TIME)
    2713             :     {
    2714             :         /*
    2715             :          * There can be many transactions that share the same commit time, so
    2716             :          * we stop after the last one, if we are inclusive, or stop at the
    2717             :          * first one if we are exclusive
    2718             :          */
    2719           0 :         if (recoveryTargetInclusive)
    2720           0 :             stopsHere = (recordXtime > recoveryTargetTime);
    2721             :         else
    2722           0 :             stopsHere = (recordXtime >= recoveryTargetTime);
    2723             :     }
    2724             : 
    2725       44416 :     if (stopsHere)
    2726             :     {
    2727           0 :         recoveryStopAfter = false;
    2728           0 :         recoveryStopXid = recordXid;
    2729           0 :         recoveryStopTime = recordXtime;
    2730           0 :         recoveryStopLSN = InvalidXLogRecPtr;
    2731           0 :         recoveryStopName[0] = '\0';
    2732             : 
    2733           0 :         if (isCommit)
    2734             :         {
    2735           0 :             ereport(LOG,
    2736             :                     (errmsg("recovery stopping before commit of transaction %u, time %s",
    2737             :                             recoveryStopXid,
    2738             :                             timestamptz_to_str(recoveryStopTime))));
    2739             :         }
    2740             :         else
    2741             :         {
    2742           0 :             ereport(LOG,
    2743             :                     (errmsg("recovery stopping before abort of transaction %u, time %s",
    2744             :                             recoveryStopXid,
    2745             :                             timestamptz_to_str(recoveryStopTime))));
    2746             :         }
    2747             :     }
    2748             : 
    2749       44416 :     return stopsHere;
    2750             : }
    2751             : 
    2752             : /*
    2753             :  * Same as recoveryStopsBefore, but called after applying the record.
    2754             :  *
    2755             :  * We also track the timestamp of the latest applied COMMIT/ABORT
    2756             :  * record in XLogRecoveryCtl->recoveryLastXTime.
                     :  *
                     :  * Returns true if recovery should stop after this record; in that case
                     :  * recoveryStopAfter is set to true and recoveryStopXid, recoveryStopTime,
                     :  * recoveryStopLSN and recoveryStopName are filled in to describe the stop
                     :  * point, and a LOG message is emitted.  Returns false otherwise, and
                     :  * always returns false when ArchiveRecoveryRequested is not set.
                     :  *
                     :  * The checks are made in this order: a matching named restore point
                     :  * (RECOVERY_TARGET_NAME), an inclusive LSN target, an inclusive XID target
                     :  * on a commit/abort (plain or prepared) record, and finally
                     :  * RECOVERY_TARGET_IMMEDIATE once reachedConsistency is set.  Because
                     :  * records of any resource manager other than RM_XACT_ID return false
                     :  * before the last two checks, the RECOVERY_TARGET_IMMEDIATE test in this
                     :  * function is only reached for RM_XACT_ID records.
                     :  *
                     :  * For prepared transactions the XID compared against recoveryTargetXid is
                     :  * the twophase_xid parsed out of the record, not XLogRecGetXid().
    2757             :  */
    2758             : static bool
    2759     5537704 : recoveryStopsAfter(XLogReaderState *record)
    2760             : {
    2761             :     uint8       info;
    2762             :     uint8       xact_info;
    2763             :     uint8       rmid;
    2764     5537704 :     TimestampTz recordXtime = 0;
    2765             : 
    2766             :     /*
    2767             :      * Ignore recovery target settings when not in archive recovery (meaning
    2768             :      * we are in crash recovery).
    2769             :      */
    2770     5537704 :     if (!ArchiveRecoveryRequested)
    2771      488082 :         return false;
    2772             : 
    2773     5049622 :     info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    2774     5049622 :     rmid = XLogRecGetRmid(record);
    2775             : 
    2776             :     /*
    2777             :      * There can be many restore points that share the same name; we stop at
    2778             :      * the first one.
    2779             :      */
    2780     5049622 :     if (recoveryTarget == RECOVERY_TARGET_NAME &&
    2781          40 :         rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
    2782             :     {
    2783             :         xl_restore_point *recordRestorePointData;
    2784             : 
    2785           6 :         recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
    2786             : 
    2787           6 :         if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
    2788             :         {
    2789           4 :             recoveryStopAfter = true;
    2790           4 :             recoveryStopXid = InvalidTransactionId;
    2791           4 :             recoveryStopLSN = InvalidXLogRecPtr;
    2792           4 :             (void) getRecordTimestamp(record, &recoveryStopTime);
    2793           4 :             strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
    2794             : 
    2795           4 :             ereport(LOG,
    2796             :                     (errmsg("recovery stopping at restore point \"%s\", time %s",
    2797             :                             recoveryStopName,
    2798             :                             timestamptz_to_str(recoveryStopTime))));
    2799           4 :             return true;
    2800             :         }
    2801             :     }
    2802             : 
    2803             :     /* Check if the target LSN has been reached */
    2804     5049618 :     if (recoveryTarget == RECOVERY_TARGET_LSN &&
    2805       16060 :         recoveryTargetInclusive &&
    2806       16060 :         record->ReadRecPtr >= recoveryTargetLSN)
    2807             :     {
    2808           6 :         recoveryStopAfter = true;
    2809           6 :         recoveryStopXid = InvalidTransactionId;
    2810           6 :         recoveryStopLSN = record->ReadRecPtr;
    2811           6 :         recoveryStopTime = 0;
    2812           6 :         recoveryStopName[0] = '\0';
    2813           6 :         ereport(LOG,
    2814             :                 errmsg("recovery stopping after WAL location (LSN) \"%X/%08X\"",
    2815             :                        LSN_FORMAT_ARGS(recoveryStopLSN)));
    2816           6 :         return true;
    2817             :     }
    2818             : 
    2819     5049612 :     if (rmid != RM_XACT_ID)
    2820     5004584 :         return false;
    2821             : 
    2822       45028 :     xact_info = info & XLOG_XACT_OPMASK;
    2823             : 
    2824       45028 :     if (xact_info == XLOG_XACT_COMMIT ||
    2825        4338 :         xact_info == XLOG_XACT_COMMIT_PREPARED ||
    2826         642 :         xact_info == XLOG_XACT_ABORT ||
    2827             :         xact_info == XLOG_XACT_ABORT_PREPARED)
    2828             :     {
    2829             :         TransactionId recordXid;
    2830             : 
    2831             :         /* Update the last applied transaction timestamp */
    2832       44412 :         if (getRecordTimestamp(record, &recordXtime))
    2833       44412 :             SetLatestXTime(recordXtime);
    2834             : 
    2835             :         /* Extract the XID of the committed/aborted transaction */
    2836       44412 :         if (xact_info == XLOG_XACT_COMMIT_PREPARED)
    2837             :         {
    2838          60 :             xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
    2839             :             xl_xact_parsed_commit parsed;
    2840             : 
    2841          60 :             ParseCommitRecord(XLogRecGetInfo(record),
    2842             :                               xlrec,
    2843             :                               &parsed);
    2844          60 :             recordXid = parsed.twophase_xid;
    2845             :         }
    2846       44352 :         else if (xact_info == XLOG_XACT_ABORT_PREPARED)
    2847             :         {
    2848          26 :             xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
    2849             :             xl_xact_parsed_abort parsed;
    2850             : 
    2851          26 :             ParseAbortRecord(XLogRecGetInfo(record),
    2852             :                              xlrec,
    2853             :                              &parsed);
    2854          26 :             recordXid = parsed.twophase_xid;
    2855             :         }
    2856             :         else
    2857       44326 :             recordXid = XLogRecGetXid(record);
    2858             : 
    2859             :         /*
    2860             :          * There can be only one transaction end record with this exact
    2861             :          * transactionid
    2862             :          *
    2863             :          * when testing for an xid, we MUST test for equality only, since
    2864             :          * transactions are numbered in the order they start, not the order
    2865             :          * they complete. A higher numbered xid will complete before you about
    2866             :          * 50% of the time...
    2867             :          */
    2868       44412 :         if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
    2869           0 :             recordXid == recoveryTargetXid)
    2870             :         {
    2871           0 :             recoveryStopAfter = true;
    2872           0 :             recoveryStopXid = recordXid;
    2873           0 :             recoveryStopTime = recordXtime;
    2874           0 :             recoveryStopLSN = InvalidXLogRecPtr;
    2875           0 :             recoveryStopName[0] = '\0';
    2876             : 
    2877           0 :             if (xact_info == XLOG_XACT_COMMIT ||
    2878             :                 xact_info == XLOG_XACT_COMMIT_PREPARED)
    2879             :             {
    2880           0 :                 ereport(LOG,
    2881             :                         (errmsg("recovery stopping after commit of transaction %u, time %s",
    2882             :                                 recoveryStopXid,
    2883             :                                 timestamptz_to_str(recoveryStopTime))));
    2884             :             }
    2885           0 :             else if (xact_info == XLOG_XACT_ABORT ||
    2886             :                      xact_info == XLOG_XACT_ABORT_PREPARED)
    2887             :             {
    2888           0 :                 ereport(LOG,
    2889             :                         (errmsg("recovery stopping after abort of transaction %u, time %s",
    2890             :                                 recoveryStopXid,
    2891             :                                 timestamptz_to_str(recoveryStopTime))));
    2892             :             }
    2893           0 :             return true;
    2894             :         }
    2895             :     }
    2896             : 
    2897             :     /* Check if we should stop as soon as reaching consistency */
    2898       45028 :     if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
    2899             :     {
    2900           0 :         ereport(LOG,
    2901             :                 (errmsg("recovery stopping after reaching consistency")));
    2902             : 
    2903           0 :         recoveryStopAfter = true;
    2904           0 :         recoveryStopXid = InvalidTransactionId;
    2905           0 :         recoveryStopTime = 0;
    2906           0 :         recoveryStopLSN = InvalidXLogRecPtr;
    2907           0 :         recoveryStopName[0] = '\0';
    2908           0 :         return true;
    2909             :     }
    2910             : 
    2911       45028 :     return false;
    2912             : }
    2913             : 
    2914             : /*
    2915             :  * Create a comment for the history file to explain why and where
    2916             :  * timeline changed.
                     :  *
                     :  * The text is chosen by recoveryTarget and built from recoveryStopAfter,
                     :  * recoveryStopXid, recoveryStopTime, recoveryStopLSN and recoveryStopName.
                     :  * It is formatted into a 200-byte local buffer with snprintf(), so longer
                     :  * text is truncated, and a pstrdup() copy of that buffer is returned.
                     :  * When recoveryTarget matches none of the tested values the result is
                     :  * "no recovery target specified".
                     :  *
                     :  * NOTE(review): the RECOVERY_TARGET_TIME and RECOVERY_TARGET_LSN formats
                     :  * end with "\n" while the other branches do not -- confirm whether the
                     :  * trailing newline is intended before relying on the exact text.
    2917             :  */
    2918             : static char *
    2919        1854 : getRecoveryStopReason(void)
    2920             : {
    2921             :     char        reason[200];
    2922             : 
    2923        1854 :     if (recoveryTarget == RECOVERY_TARGET_XID)
    2924           0 :         snprintf(reason, sizeof(reason),
    2925             :                  "%s transaction %u",
    2926           0 :                  recoveryStopAfter ? "after" : "before",
    2927             :                  recoveryStopXid);
    2928        1854 :     else if (recoveryTarget == RECOVERY_TARGET_TIME)
    2929           0 :         snprintf(reason, sizeof(reason),
    2930             :                  "%s %s\n",
    2931           0 :                  recoveryStopAfter ? "after" : "before",
    2932             :                  timestamptz_to_str(recoveryStopTime));
    2933        1854 :     else if (recoveryTarget == RECOVERY_TARGET_LSN)
    2934          12 :         snprintf(reason, sizeof(reason),
    2935             :                  "%s LSN %X/%08X\n",
    2936          12 :                  recoveryStopAfter ? "after" : "before",
    2937          12 :                  LSN_FORMAT_ARGS(recoveryStopLSN));
    2938        1842 :     else if (recoveryTarget == RECOVERY_TARGET_NAME)
    2939           6 :         snprintf(reason, sizeof(reason),
    2940             :                  "at restore point \"%s\"",
    2941             :                  recoveryStopName);
    2942        1836 :     else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
    2943           0 :         snprintf(reason, sizeof(reason), "reached consistency");
    2944             :     else
    2945        1836 :         snprintf(reason, sizeof(reason), "no recovery target specified");
    2946             : 
    2947        1854 :     return pstrdup(reason);
    2948             : }
    2949             : 
    2950             : /*
    2951             :  * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
    2952             :  *
    2953             :  * endOfRecovery is true if the recovery target is reached and
    2954             :  * the paused state starts at the end of recovery because of
    2955             :  * recovery_target_action=pause, and false otherwise.
                     :  *
                     :  * endOfRecovery only selects which LOG message and hint are emitted; the
                     :  * waiting logic is the same in both cases.
                     :  *
                     :  * Returns immediately, without logging or waiting, if
                     :  * LocalHotStandbyActive is false or LocalPromoteIsTriggered is true.
                     :  * Otherwise each loop iteration runs ProcessStartupProcInterrupts(),
                     :  * returns if CheckForStandbyTrigger() reports a trigger, confirms the
                     :  * pause via ConfirmRecoveryPaused(), and then sleeps on
                     :  * recoveryNotPausedCV with a timeout of 1000 (presumably milliseconds).
                     :  *
                     :  * NOTE(review): the return taken when CheckForStandbyTrigger() succeeds
                     :  * skips the ConditionVariableCancelSleep() call that follows the loop --
                     :  * confirm that leaving the sleep un-cancelled on that path is harmless.
    2956             :  */
    2957             : static void
    2958           8 : recoveryPausesHere(bool endOfRecovery)
    2959             : {
    2960             :     /* Don't pause unless users can connect! */
    2961           8 :     if (!LocalHotStandbyActive)
    2962           0 :         return;
    2963             : 
    2964             :     /* Don't pause after standby promotion has been triggered */
    2965           8 :     if (LocalPromoteIsTriggered)
    2966           0 :         return;
    2967             : 
    2968           8 :     if (endOfRecovery)
    2969           2 :         ereport(LOG,
    2970             :                 (errmsg("pausing at the end of recovery"),
    2971             :                  errhint("Execute pg_wal_replay_resume() to promote.")));
    2972             :     else
    2973           6 :         ereport(LOG,
    2974             :                 (errmsg("recovery has paused"),
    2975             :                  errhint("Execute pg_wal_replay_resume() to continue.")));
    2976             : 
    2977             :     /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
    2978          26 :     while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
    2979             :     {
    2980          22 :         ProcessStartupProcInterrupts();
    2981          22 :         if (CheckForStandbyTrigger())
    2982           4 :             return;
    2983             : 
    2984             :         /*
    2985             :          * If recovery pause is requested then set it paused.  While we are in
    2986             :          * the loop, user might resume and pause again so set this every time.
    2987             :          */
    2988          18 :         ConfirmRecoveryPaused();
    2989             : 
    2990             :         /*
    2991             :          * We wait on a condition variable that will wake us as soon as the
    2992             :          * pause ends, but we use a timeout so we can check the above exit
    2993             :          * condition periodically too.
    2994             :          */
    2995          18 :         ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
    2996             :                                     WAIT_EVENT_RECOVERY_PAUSE);
    2997             :     }
    2998           4 :     ConditionVariableCancelSleep();
    2999             : }
    3000             : 
    3001             : /*
    3002             :  * When recovery_min_apply_delay is set, we wait long enough to make sure
    3003             :  * certain record types are applied at least that interval behind the primary.
    3004             :  *
    3005             :  * Returns true if we waited.
    3006             :  *
    3007             :  * Note that the delay is calculated between the WAL record log time and
    3008             :  * the current time on standby. We would prefer to keep track of when this
    3009             :  * standby received each WAL record, which would allow a more consistent
    3010             :  * approach and one not affected by time synchronisation issues, but that
    3011             :  * is significantly more effort and complexity for little actual gain in
    3012             :  * usability.
                     :  *
                     :  * Returns false without waiting when any of the following holds:
                     :  * recovery_min_apply_delay <= 0, reachedConsistency is not yet set,
                     :  * ArchiveRecoveryRequested is not set, the record is not an RM_XACT_ID
                     :  * COMMIT or COMMIT_PREPARED record, getRecordTimestamp() fails, or the
                     :  * computed apply time has already passed.
                     :  *
                     :  * Once the wait loop has been entered the function returns true, including
                     :  * when the loop is left early because CheckForStandbyTrigger() reported a
                     :  * trigger.  The target time is recomputed on every iteration because
                     :  * ProcessStartupProcInterrupts() may change recovery_min_apply_delay.
    3013             :  */
    3014             : static bool
    3015     5537708 : recoveryApplyDelay(XLogReaderState *record)
    3016             : {
    3017             :     uint8       xact_info;
    3018             :     TimestampTz xtime;
    3019             :     TimestampTz delayUntil;
    3020             :     long        msecs;
    3021             : 
    3022             :     /* nothing to do if no delay configured */
    3023     5537708 :     if (recovery_min_apply_delay <= 0)
    3024     5537424 :         return false;
    3025             : 
    3026             :     /* no delay is applied on a database not yet consistent */
    3027         284 :     if (!reachedConsistency)
    3028           8 :         return false;
    3029             : 
    3030             :     /* nothing to do if crash recovery is requested */
    3031         276 :     if (!ArchiveRecoveryRequested)
    3032           0 :         return false;
    3033             : 
    3034             :     /*
    3035             :      * Is it a COMMIT record?
    3036             :      *
    3037             :      * We deliberately choose not to delay aborts since they have no effect on
    3038             :      * MVCC. We already allow replay of records that don't have a timestamp,
    3039             :      * so there is already opportunity for issues caused by early conflicts on
    3040             :      * standbys.
    3041             :      */
    3042         276 :     if (XLogRecGetRmid(record) != RM_XACT_ID)
    3043         218 :         return false;
    3044             : 
    3045          58 :     xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
    3046             : 
    3047          58 :     if (xact_info != XLOG_XACT_COMMIT &&
    3048             :         xact_info != XLOG_XACT_COMMIT_PREPARED)
    3049           0 :         return false;
    3050             : 
    3051          58 :     if (!getRecordTimestamp(record, &xtime))
    3052           0 :         return false;
    3053             : 
    3054          58 :     delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
    3055             : 
    3056             :     /*
    3057             :      * Exit without arming the latch if it's already past time to apply this
    3058             :      * record
    3059             :      */
    3060          58 :     msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil);
    3061          58 :     if (msecs <= 0)
    3062           2 :         return false;
    3063             : 
    3064             :     while (true)
    3065             :     {
    3066         286 :         ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
    3067             : 
    3068             :         /* This might change recovery_min_apply_delay. */
    3069         286 :         ProcessStartupProcInterrupts();
    3070             : 
    3071         286 :         if (CheckForStandbyTrigger())
    3072           0 :             break;
    3073             : 
    3074             :         /*
    3075             :          * Recalculate delayUntil as recovery_min_apply_delay could have
    3076             :          * changed while waiting in this loop.
    3077             :          */
    3078         286 :         delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
    3079             : 
    3080             :         /*
    3081             :          * Wait for difference between GetCurrentTimestamp() and delayUntil.
    3082             :          */
    3083         286 :         msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
    3084             :                                                 delayUntil);
    3085             : 
    3086         286 :         if (msecs <= 0)
    3087          56 :             break;
    3088             : 
    3089         230 :         elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
    3090             : 
    3091         230 :         (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
    3092             :                          WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
    3093             :                          msecs,
    3094             :                          WAIT_EVENT_RECOVERY_APPLY_DELAY);
    3095             :     }
    3096          56 :     return true;
    3097             : }
    3098             : 
    3099             : /*
    3100             :  * Get the current state of the recovery pause request.
                     :  *
                     :  * XLogRecoveryCtl->recoveryPauseState is copied into a local variable
                     :  * while XLogRecoveryCtl->info_lck is held, and that copy is returned after
                     :  * the spinlock has been released.
    3101             :  */
    3102             : RecoveryPauseState
    3103          58 : GetRecoveryPauseState(void)
    3104             : {
    3105             :     RecoveryPauseState state;
    3106             : 
    3107          58 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    3108          58 :     state = XLogRecoveryCtl->recoveryPauseState;
    3109          58 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    3110             : 
    3111          58 :     return state;
    3112             : }
    3113             : 
    3114             : /*
    3115             :  * Set the recovery pause state.
    3116             :  *
    3117             :  * If recovery pause is requested then sets the recovery pause state to
    3118             :  * 'pause requested' if it is not already 'paused'.  Otherwise, sets it
    3119             :  * to 'not paused' to resume the recovery.  The recovery pause will be
    3120             :  * confirmed by the ConfirmRecoveryPaused.
                     :  *
                     :  * More precisely: with recoveryPause = true the state changes only when
                     :  * it is currently RECOVERY_NOT_PAUSED (to RECOVERY_PAUSE_REQUESTED); any
                     :  * other current state is left as it is.  With recoveryPause = false the
                     :  * state is unconditionally set to RECOVERY_NOT_PAUSED.
                     :  *
                     :  * The state is updated while holding XLogRecoveryCtl->info_lck.  When
                     :  * resuming, recoveryNotPausedCV is broadcast after the spinlock has been
                     :  * released.
    3121             :  */
    3122             : void
    3123         110 : SetRecoveryPause(bool recoveryPause)
    3124             : {
    3125         110 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    3126             : 
    3127         110 :     if (!recoveryPause)
    3128         100 :         XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
    3129          10 :     else if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_NOT_PAUSED)
    3130          10 :         XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED;
    3131             : 
    3132         110 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    3133             : 
    3134         110 :     if (!recoveryPause)
    3135         100 :         ConditionVariableBroadcast(&XLogRecoveryCtl->recoveryNotPausedCV);
    3136         110 : }
    3137             : 
    3138             : /*
    3139             :  * Confirm the recovery pause by setting the recovery pause state to
    3140             :  * RECOVERY_PAUSED.
                     :  *
                     :  * The only transition made is RECOVERY_PAUSE_REQUESTED -> RECOVERY_PAUSED;
                     :  * if the state is anything else it is left unchanged.  The check and the
                     :  * update are both done while holding XLogRecoveryCtl->info_lck.
    3141             :  */
    3142             : static void
    3143          18 : ConfirmRecoveryPaused(void)
    3144             : {
    3145             :     /* If recovery pause is requested then set it paused */
    3146          18 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    3147          18 :     if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED)
    3148           8 :         XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSED;
    3149          18 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    3150          18 : }
    3151             : 
    3152             : 
    3153             : /*
    3154             :  * Attempt to read the next XLOG record.
    3155             :  *
    3156             :  * Before first call, the reader needs to be positioned to the first record
    3157             :  * by calling XLogPrefetcherBeginRead().
    3158             :  *
    3159             :  * If no valid record is available, returns NULL, or fails if emode is PANIC.
    3160             :  * (emode must be either PANIC, LOG). In standby mode, retries until a valid
    3161             :  * record is available.
    3162             :  */
    3163             : static XLogRecord *
    3164     5542224 : ReadRecord(XLogPrefetcher *xlogprefetcher, int emode,
    3165             :            bool fetching_ckpt, TimeLineID replayTLI)
    3166             : {
    3167             :     XLogRecord *record;
    3168     5542224 :     XLogReaderState *xlogreader = XLogPrefetcherGetReader(xlogprefetcher);
    3169     5542224 :     XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
    3170             : 
    3171             :     Assert(AmStartupProcess() || !IsUnderPostmaster);
    3172             : 
    3173             :     /* Pass through parameters to XLogPageRead */
    3174     5542224 :     private->fetching_ckpt = fetching_ckpt;
    3175     5542224 :     private->emode = emode;
    3176     5542224 :     private->randAccess = !XLogRecPtrIsValid(xlogreader->ReadRecPtr);
    3177     5542224 :     private->replayTLI = replayTLI;
    3178             : 
    3179             :     /* This is the first attempt to read this page. */
    3180     5542224 :     lastSourceFailed = false;
    3181             : 
    3182             :     for (;;)
    3183         290 :     {
    3184             :         char       *errormsg;
    3185             : 
    3186     5542514 :         record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
    3187     5542396 :         if (record == NULL)
    3188             :         {
    3189             :             /*
    3190             :              * When we find that WAL ends in an incomplete record, keep track
    3191             :              * of that record.  After recovery is done, we'll write a record
    3192             :              * to indicate to downstream WAL readers that that portion is to
    3193             :              * be ignored.
    3194             :              *
    3195             :              * However, when ArchiveRecoveryRequested = true, we're going to
    3196             :              * switch to a new timeline at the end of recovery. We will only
    3197             :              * copy WAL over to the new timeline up to the end of the last
    3198             :              * complete record, so if we did this, we would later create an
    3199             :              * overwrite contrecord in the wrong place, breaking everything.
    3200             :              */
    3201         602 :             if (!ArchiveRecoveryRequested &&
    3202         216 :                 XLogRecPtrIsValid(xlogreader->abortedRecPtr))
    3203             :             {
    3204          22 :                 abortedRecPtr = xlogreader->abortedRecPtr;
    3205          22 :                 missingContrecPtr = xlogreader->missingContrecPtr;
    3206             :             }
    3207             : 
    3208         602 :             if (readFile >= 0)
    3209             :             {
    3210         554 :                 close(readFile);
    3211         554 :                 readFile = -1;
    3212             :             }
    3213             : 
    3214             :             /*
    3215             :              * We only end up here without a message when XLogPageRead()
    3216             :              * failed - in that case we already logged something. In
    3217             :              * StandbyMode that only happens if we have been triggered, so we
    3218             :              * shouldn't loop anymore in that case.
    3219             :              */
    3220         602 :             if (errormsg)
    3221         554 :                 ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
    3222             :                         (errmsg_internal("%s", errormsg) /* already translated */ ));
    3223             :         }
    3224             : 
    3225             :         /*
    3226             :          * Check page TLI is one of the expected values.
    3227             :          */
    3228     5541794 :         else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
    3229             :         {
    3230             :             char        fname[MAXFNAMELEN];
    3231             :             XLogSegNo   segno;
    3232             :             int32       offset;
    3233             : 
    3234           0 :             XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
    3235           0 :             offset = XLogSegmentOffset(xlogreader->latestPagePtr,
    3236             :                                        wal_segment_size);
    3237           0 :             XLogFileName(fname, xlogreader->seg.ws_tli, segno,
    3238             :                          wal_segment_size);
    3239           0 :             ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
    3240             :                     errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%08X, offset %u",
    3241             :                            xlogreader->latestPageTLI,
    3242             :                            fname,
    3243             :                            LSN_FORMAT_ARGS(xlogreader->latestPagePtr),
    3244             :                            offset));
    3245           0 :             record = NULL;
    3246             :         }
    3247             : 
    3248     5542396 :         if (record)
    3249             :         {
    3250             :             /* Great, got a record */
    3251     5542106 :             return record;
    3252             :         }
    3253             :         else
    3254             :         {
    3255             :             /* No valid record available from this source */
    3256         602 :             lastSourceFailed = true;
    3257             : 
    3258             :             /*
    3259             :              * If archive recovery was requested, but we were still doing
    3260             :              * crash recovery, switch to archive recovery and retry using the
    3261             :              * offline archive. We have now replayed all the valid WAL in
    3262             :              * pg_wal, so we are presumably now consistent.
    3263             :              *
    3264             :              * We require that there's at least some valid WAL present in
    3265             :              * pg_wal, however (!fetching_ckpt).  We could recover using the
    3266             :              * WAL from the archive, even if pg_wal is completely empty, but
    3267             :              * we'd have no idea how far we'd have to replay to reach
    3268             :              * consistency.  So err on the safe side and give up.
    3269             :              */
    3270         602 :             if (!InArchiveRecovery && ArchiveRecoveryRequested &&
    3271           4 :                 !fetching_ckpt)
    3272             :             {
    3273           4 :                 ereport(DEBUG1,
    3274             :                         (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
    3275           4 :                 InArchiveRecovery = true;
    3276           4 :                 if (StandbyModeRequested)
    3277           4 :                     EnableStandbyMode();
    3278             : 
    3279           4 :                 SwitchIntoArchiveRecovery(xlogreader->EndRecPtr, replayTLI);
    3280           4 :                 minRecoveryPoint = xlogreader->EndRecPtr;
    3281           4 :                 minRecoveryPointTLI = replayTLI;
    3282             : 
    3283           4 :                 CheckRecoveryConsistency();
    3284             : 
    3285             :                 /*
    3286             :                  * Before we retry, reset lastSourceFailed and currentSource
    3287             :                  * so that we will check the archive next.
    3288             :                  */
    3289           4 :                 lastSourceFailed = false;
    3290           4 :                 currentSource = XLOG_FROM_ANY;
    3291             : 
    3292         290 :                 continue;
    3293             :             }
    3294             : 
    3295             :             /* In standby mode, loop back to retry. Otherwise, give up. */
    3296         598 :             if (StandbyMode && !CheckForStandbyTrigger())
    3297         286 :                 continue;
    3298             :             else
    3299         312 :                 return NULL;
    3300             :         }
    3301             :     }
    3302             : }
    3303             : 
    3304             : /*
    3305             :  * Read the XLOG page containing targetPagePtr into readBuf (if not read
    3306             :  * already).  Returns number of bytes read, if the page is read successfully,
    3307             :  * or XLREAD_FAIL in case of errors.  When errors occur, they are ereport'ed,
    3308             :  * but only if they have not been previously reported.
    3309             :  *
    3310             :  * See XLogReaderRoutine.page_read for more details.
    3311             :  *
    3312             :  * While prefetching, xlogreader->nonblocking may be set.  In that case,
    3313             :  * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
    3314             :  *
    3315             :  * This is responsible for restoring files from archive as needed, as well
    3316             :  * as for waiting for the requested WAL record to arrive in standby mode.
    3317             :  *
    3318             :  * xlogreader->private_data->emode specifies the log level used for reporting
    3319             :  * "file not found" or "end of WAL" situations in archive recovery, or in
    3320             :  * standby mode when promotion is triggered. If set to WARNING or below,
    3321             :  * XLogPageRead() returns XLREAD_FAIL in those situations, on higher log
    3322             :  * levels the ereport() won't return.
    3323             :  *
    3324             :  * In standby mode, if after a successful return of XLogPageRead() the
    3325             :  * caller finds the record it's interested in to be broken, it should
    3326             :  * ereport the error with the level determined by
    3327             :  * emode_for_corrupt_record(), and then set lastSourceFailed
    3328             :  * and call XLogPageRead() again with the same arguments. This lets
    3329             :  * XLogPageRead() try fetching the record from another source, or
    3330             :  * sleep and retry.
    3331             :  */
    3332             : static int
    3333     2887820 : XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
    3334             :              XLogRecPtr targetRecPtr, char *readBuf)
    3335             : {
    3336     2887820 :     XLogPageReadPrivate *private =
    3337             :         (XLogPageReadPrivate *) xlogreader->private_data;
    3338     2887820 :     int         emode = private->emode;
    3339             :     uint32      targetPageOff;
    3340             :     XLogSegNo   targetSegNo PG_USED_FOR_ASSERTS_ONLY;
    3341             :     int         r;
    3342             :     instr_time  io_start;
    3343             : 
    3344             :     Assert(AmStartupProcess() || !IsUnderPostmaster);
    3345             : 
    3346     2887820 :     XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
    3347     2887820 :     targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
    3348             : 
    3349             :     /*
    3350             :      * See if we need to switch to a new segment because the requested record
    3351             :      * is not in the currently open one.
    3352             :      */
    3353     2887820 :     if (readFile >= 0 &&
    3354     2884236 :         !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
    3355             :     {
    3356             :         /*
    3357             :          * Request a restartpoint if we've replayed too much xlog since the
    3358             :          * last one.
    3359             :          */
    3360        3046 :         if (ArchiveRecoveryRequested && IsUnderPostmaster)
    3361             :         {
    3362        3014 :             if (XLogCheckpointNeeded(readSegNo))
    3363             :             {
    3364        2784 :                 (void) GetRedoRecPtr();
    3365        2784 :                 if (XLogCheckpointNeeded(readSegNo))
    3366        2772 :                     RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
    3367             :             }
    3368             :         }
    3369             : 
    3370        3046 :         close(readFile);
    3371        3046 :         readFile = -1;
    3372        3046 :         readSource = XLOG_FROM_ANY;
    3373             :     }
    3374             : 
    3375     2887820 :     XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
    3376             : 
    3377     2887830 : retry:
    3378             :     /* Need more data if no segment is open or the stream is still behind */
    3379     2887830 :     if (readFile < 0 ||
    3380     2881190 :         (readSource == XLOG_FROM_STREAM &&
    3381     2857126 :          flushedUpto < targetPagePtr + reqLen))
    3382             :     {
    3383       29686 :         if (readFile >= 0 &&
    3384       23046 :             xlogreader->nonblocking &&
    3385       11354 :             readSource == XLOG_FROM_STREAM &&
    3386       11354 :             flushedUpto < targetPagePtr + reqLen)
    3387       11354 :             return XLREAD_WOULDBLOCK;
    3388             : 
    3389       18214 :         switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
    3390       18332 :                                             private->randAccess,
    3391       18332 :                                             private->fetching_ckpt,
    3392             :                                             targetRecPtr,
    3393             :                                             private->replayTLI,
    3394             :                                             xlogreader->EndRecPtr,
    3395       18332 :                                             xlogreader->nonblocking))
    3396             :         {
    3397         958 :             case XLREAD_WOULDBLOCK:
    3398         958 :                 return XLREAD_WOULDBLOCK;
    3399          92 :             case XLREAD_FAIL:
    3400          92 :                 if (readFile >= 0)
    3401           0 :                     close(readFile);
    3402          92 :                 readFile = -1;
    3403          92 :                 readLen = 0;
    3404          92 :                 readSource = XLOG_FROM_ANY;
    3405          92 :                 return XLREAD_FAIL;
    3406       17164 :             case XLREAD_SUCCESS:
    3407       17164 :                 break;
    3408             :         }
    3409             :     }
    3410             : 
    3411             :     /*
    3412             :      * At this point, we have the right segment open and if we're streaming we
    3413             :      * know the requested record is in it.
    3414             :      */
    3415             :     Assert(readFile != -1);
    3416             : 
    3417             :     /*
    3418             :      * If the current segment is being streamed from the primary, calculate
    3419             :      * how much of the current page we have received already. We know the
    3420             :      * requested record has been received, but this is for the benefit of
    3421             :      * future calls, to allow quick exit at the top of this function.
    3422             :      */
    3423     2875308 :     if (readSource == XLOG_FROM_STREAM)
    3424             :     {
    3425     2847962 :         if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
    3426     2840474 :             readLen = XLOG_BLCKSZ;
    3427             :         else
    3428        7488 :             readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
    3429             :                 targetPageOff;
    3430             :     }
    3431             :     else
    3432       27346 :         readLen = XLOG_BLCKSZ;
    3433             : 
    3434             :     /* Read the requested page: a full XLOG_BLCKSZ block at readOff */
    3435     2875308 :     readOff = targetPageOff;
    3436             : 
    3437             :     /* Measure I/O timing when reading segment */
    3438     2875308 :     io_start = pgstat_prepare_io_time(track_wal_io_timing);
    3439             : 
    3440     2875308 :     pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
    3441     2875308 :     r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (pgoff_t) readOff);
    3442     2875308 :     if (r != XLOG_BLCKSZ)
    3443             :     {
    3444             :         char        fname[MAXFNAMELEN];
    3445           0 :         int         save_errno = errno;
    3446             : 
    3447           0 :         pgstat_report_wait_end();
    3448             : 
    3449           0 :         pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
    3450             :                                 io_start, 1, r);
    3451             : 
    3452           0 :         XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
    3453           0 :         if (r < 0)
    3454             :         {
    3455           0 :             errno = save_errno;
    3456           0 :             ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
    3457             :                     (errcode_for_file_access(),
    3458             :                      errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: %m",
    3459             :                             fname, LSN_FORMAT_ARGS(targetPagePtr),
    3460             :                             readOff)));
    3461             :         }
    3462             :         else
    3463           0 :             ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
    3464             :                     (errcode(ERRCODE_DATA_CORRUPTED),
    3465             :                      errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: read %d of %zu",
    3466             :                             fname, LSN_FORMAT_ARGS(targetPagePtr),
    3467             :                             readOff, r, (Size) XLOG_BLCKSZ)));
    3468           0 :         goto next_record_is_invalid;
    3469             :     }
    3470     2875308 :     pgstat_report_wait_end();
    3471             : 
    3472     2875308 :     pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
    3473             :                             io_start, 1, r);
    3474             : 
    3475             :     Assert(targetSegNo == readSegNo);
    3476             :     Assert(targetPageOff == readOff);
    3477             :     Assert(reqLen <= readLen);
    3478             : 
    3479     2875308 :     xlogreader->seg.ws_tli = curFileTLI;
    3480             : 
    3481             :     /*
    3482             :      * Check the page header immediately, so that we can retry immediately if
    3483             :      * it's not valid. This may seem unnecessary, because ReadPageInternal()
    3484             :      * validates the page header anyway, and would propagate the failure up to
    3485             :      * ReadRecord(), which would retry. However, there's a corner case with
    3486             :      * continuation records, if a record is split across two pages such that
    3487             :      * we would need to read the two pages from different sources across two
    3488             :      * WAL segments.
    3489             :      *
    3490             :      * The first page is only available locally, in pg_wal, because it's
    3491             :      * already been recycled on the primary. The second page, however, is not
    3492             :      * present in pg_wal, and we should stream it from the primary. There is a
    3493             :      * recycled WAL segment present in pg_wal, with garbage contents, however.
    3494             :      * We would read the first page from the local WAL segment, but when
    3495             :      * reading the second page, we would read the bogus, recycled, WAL
    3496             :      * segment. If we didn't catch that case here, we would never recover,
    3497             :      * because ReadRecord() would retry reading the whole record from the
    3498             :      * beginning.
    3499             :      *
    3500             :      * Of course, this only catches errors in the page header, which is what
    3501             :      * happens in the case of a recycled WAL segment. Other kinds of errors or
    3502             :      * corruption still have the same problem. But this at least fixes the
    3503             :      * common case, which can happen as part of normal operation.
    3504             :      *
    3505             :      * Validating the page header is cheap enough that doing it twice
    3506             :      * shouldn't be a big deal from a performance point of view.
    3507             :      *
    3508             :      * When not in standby mode, an invalid page header should cause recovery
    3509             :      * to end, not retry reading the page, so we don't need to validate the
    3510             :      * page header here for the retry. Instead, ReadPageInternal() is
    3511             :      * responsible for the validation.
    3512             :      */
    3513     2875308 :     if (StandbyMode &&
    3514     2855426 :         (targetPagePtr % wal_segment_size) == 0 &&
    3515        2816 :         !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
    3516             :     {
    3517             :         /*
    3518             :          * Emit this error right now then retry this page immediately. Use
    3519             :          * errmsg_internal() because the message was already translated.
    3520             :          */
    3521          12 :         if (xlogreader->errormsg_buf[0])
    3522          12 :             ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
    3523             :                     (errmsg_internal("%s", xlogreader->errormsg_buf)));
    3524             : 
    3525             :         /* Reset any error XLogReaderValidatePageHeader() might have set */
    3526          12 :         XLogReaderResetError(xlogreader);
    3527          12 :         goto next_record_is_invalid;
    3528             :     }
    3529             : 
    3530     2875296 :     return readLen;
    3531             : 
    3532          12 : next_record_is_invalid:
    3533             : 
    3534             :     /*
    3535             :      * If we're reading ahead, give up fast.  Retries and error reporting will
    3536             :      * be handled by a later read when recovery catches up to this point.
    3537             :      */
    3538          12 :     if (xlogreader->nonblocking)
    3539           2 :         return XLREAD_WOULDBLOCK;
    3540             : 
    3541          10 :     lastSourceFailed = true;
    3542             : 
    3543          10 :     if (readFile >= 0)
    3544          10 :         close(readFile);
    3545          10 :     readFile = -1;
    3546          10 :     readLen = 0;
    3547          10 :     readSource = XLOG_FROM_ANY;
    3548             : 
    3549             :     /* In standby mode, keep trying; otherwise report the failure */
    3550          10 :     if (StandbyMode)
    3551          10 :         goto retry;
    3552             :     else
    3553           0 :         return XLREAD_FAIL;
    3554             : }
    3555             : 
    3556             : /*
    3557             :  * Open the WAL segment containing WAL location 'RecPtr'.
    3558             :  *
    3559             :  * The segment can be fetched via restore_command, or via walreceiver having
    3560             :  * streamed the record, or it can already be present in pg_wal. Checking
    3561             :  * pg_wal is mainly for crash recovery, but it will be polled in standby mode
    3562             :  * too, in case someone copies a new segment directly to pg_wal. That is not
    3563             :  * documented or recommended, though.
    3564             :  *
    3565             :  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
    3566             :  * prepare to read WAL starting from RedoStartLSN after this.
    3567             :  *
    3568             :  * 'RecPtr' might not point to the beginning of the record we're interested
    3569             :  * in, it might also point to the page or segment header. In that case,
    3570             :  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
    3571             :  * used to decide which timeline to stream the requested WAL from.
    3572             :  *
    3573             :  * 'replayLSN' is the current replay LSN, so that if we scan for new
    3574             :  * timelines, we can reject a switch to a timeline that branched off before
    3575             :  * this point.
    3576             :  *
    3577             :  * If the record is not immediately available, the function returns
    3578             :  * XLREAD_FAIL if we're not in standby mode. In standby mode, waits for it to
    3579             :  * become available.
    3580             :  *
    3581             :  * When the requested record becomes available, the function opens the file
    3582             :  * containing it (if not open already), and returns XLREAD_SUCCESS. When end
    3583             :  * of standby mode is triggered by the user, and there is no more WAL
    3584             :  * available, returns XLREAD_FAIL.
    3585             :  *
    3586             :  * If nonblocking is true, then give up immediately if we can't satisfy the
    3587             :  * request, returning XLREAD_WOULDBLOCK instead of waiting.
    3588             :  */
    3589             : static XLogPageReadResult
    3590       18332 : WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
    3591             :                             bool fetching_ckpt, XLogRecPtr tliRecPtr,
    3592             :                             TimeLineID replayTLI, XLogRecPtr replayLSN,
    3593             :                             bool nonblocking)
    3594             : {
    3595             :     static TimestampTz last_fail_time = 0;
    3596             :     TimestampTz now;
    3597       18332 :     bool        streaming_reply_sent = false;
    3598             : 
    3599             :     /*-------
    3600             :      * Standby mode is implemented by a state machine:
    3601             :      *
    3602             :      * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
    3603             :      *    pg_wal (XLOG_FROM_PG_WAL)
    3604             :      * 2. Check for promotion trigger request
    3605             :      * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
    3606             :      * 4. Rescan timelines
    3607             :      * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
    3608             :      *
    3609             :      * Failure to read from the current source advances the state machine to
    3610             :      * the next state.
    3611             :      *
    3612             :      * 'currentSource' indicates the current state. There are no currentSource
    3613             :      * values for "check trigger", "rescan timelines", and "sleep" states,
    3614             :      * those actions are taken when reading from the previous source fails, as
    3615             :      * part of advancing to the next state.
    3616             :      *
    3617             :      * If standby mode is turned off while reading WAL from stream, we move
    3618             :      * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
    3619             :      * the files (which would be required at end of recovery, e.g., timeline
    3620             :      * history file) from archive or pg_wal. We don't need to kill WAL receiver
    3621             :      * here because it's already stopped when standby mode is turned off at
    3622             :      * the end of recovery.
    3623             :      *-------
    3624             :      */
    3625       18332 :     if (!InArchiveRecovery)
    3626        1982 :         currentSource = XLOG_FROM_PG_WAL;
    3627       16350 :     else if (currentSource == XLOG_FROM_ANY ||
    3628       16104 :              (!StandbyMode && currentSource == XLOG_FROM_STREAM))
    3629             :     {
    3630         246 :         lastSourceFailed = false;
    3631         246 :         currentSource = XLOG_FROM_ARCHIVE;
    3632             :     }
    3633             : 
    3634             :     for (;;)
    3635       14984 :     {
    3636       33316 :         XLogSource  oldSource = currentSource;
    3637       33316 :         bool        startWalReceiver = false;
    3638             : 
    3639             :         /*
    3640             :          * First check if we failed to read from the current source, and
    3641             :          * advance the state machine if so. The failure to read might've
    3642             :          * happened outside this function, e.g when a CRC check fails on a
    3643             :          * record, or within this loop.
    3644             :          */
    3645       33316 :         if (lastSourceFailed)
    3646             :         {
    3647             :             /*
    3648             :              * Don't allow any retry loops to occur during nonblocking
    3649             :              * readahead.  Let the caller process everything that has been
    3650             :              * decoded already first.
    3651             :              */
    3652        1126 :             if (nonblocking)
    3653         156 :                 return XLREAD_WOULDBLOCK;
    3654             : 
    3655         970 :             switch (currentSource)
    3656             :             {
    3657         576 :                 case XLOG_FROM_ARCHIVE:
    3658             :                 case XLOG_FROM_PG_WAL:
    3659             : 
    3660             :                     /*
    3661             :                      * Check to see if promotion is requested. Note that we do
    3662             :                      * this only after failure, so when you promote, we still
    3663             :                      * finish replaying as much as we can from archive and
    3664             :                      * pg_wal before failover.
    3665             :                      */
    3666         576 :                     if (StandbyMode && CheckForStandbyTrigger())
    3667             :                     {
    3668          44 :                         XLogShutdownWalRcv();
    3669          44 :                         return XLREAD_FAIL;
    3670             :                     }
    3671             : 
    3672             :                     /*
    3673             :                      * Not in standby mode, and we've now tried the archive
    3674             :                      * and pg_wal.
    3675             :                      */
    3676         532 :                     if (!StandbyMode)
    3677          48 :                         return XLREAD_FAIL;
    3678             : 
    3679             :                     /*
    3680             :                      * Move to XLOG_FROM_STREAM state, and set to start a
    3681             :                      * walreceiver if necessary.
    3682             :                      */
    3683         484 :                     currentSource = XLOG_FROM_STREAM;
    3684         484 :                     startWalReceiver = true;
    3685         484 :                     break;
    3686             : 
    3687         394 :                 case XLOG_FROM_STREAM:
    3688             : 
    3689             :                     /*
    3690             :                      * Failure while streaming. Most likely, we got here
    3691             :                      * because streaming replication was terminated, or
    3692             :                      * promotion was triggered. But we also get here if we
    3693             :                      * find an invalid record in the WAL streamed from the
    3694             :                      * primary, in which case something is seriously wrong.
    3695             :                      * There's little chance that the problem will just go
    3696             :                      * away, but PANIC is not good for availability either,
    3697             :                      * especially in hot standby mode. So, we treat that the
    3698             :                      * same as disconnection, and retry from archive/pg_wal
    3699             :                      * again. The WAL in the archive should be identical to
    3700             :                      * what was streamed, so it's unlikely that it helps, but
    3701             :                      * one can hope...
    3702             :                      */
    3703             : 
    3704             :                     /*
    3705             :                      * We should be able to move to XLOG_FROM_STREAM only in
    3706             :                      * standby mode.
    3707             :                      */
    3708             :                     Assert(StandbyMode);
    3709             : 
    3710             :                     /*
    3711             :                      * Before we leave XLOG_FROM_STREAM state, make sure that
    3712             :                      * walreceiver is not active, so that it won't overwrite
    3713             :                      * WAL that we restore from archive.
    3714             :                      *
    3715             :                      * If walreceiver is actively streaming (or attempting to
    3716             :                      * connect), we must shut it down. However, if it's
    3717             :                      * already in WAITING state (e.g., due to timeline
    3718             :                      * divergence), we only need to reset the install flag to
    3719             :                      * allow archive restoration.
    3720             :                      */
    3721         394 :                     if (WalRcvStreaming())
    3722          68 :                         XLogShutdownWalRcv();
    3723             :                     else
    3724             :                     {
    3725             :                         /*
    3726             :                          * WALRCV_STOPPING state is a transient state while
    3727             :                          * the startup process is in ShutdownWalRcv().  It
    3728             :                          * should never appear here since we would be waiting
    3729             :                          * for the walreceiver to reach WALRCV_STOPPED in that
    3730             :                          * case.
    3731             :                          */
    3732             :                         Assert(WalRcvGetState() != WALRCV_STOPPING);
    3733         326 :                         ResetInstallXLogFileSegmentActive();
    3734             :                     }
    3735             : 
    3736             :                     /*
    3737             :                      * Before we sleep, re-scan for possible new timelines if
    3738             :                      * we were requested to recover to the latest timeline.
    3739             :                      */
    3740         394 :                     if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
    3741             :                     {
    3742         394 :                         if (rescanLatestTimeLine(replayTLI, replayLSN))
    3743             :                         {
    3744          14 :                             currentSource = XLOG_FROM_ARCHIVE;
    3745          14 :                             break;
    3746             :                         }
    3747             :                     }
    3748             : 
    3749             :                     /*
    3750             :                      * XLOG_FROM_STREAM is the last state in our state
    3751             :                      * machine, so we've exhausted all the options for
    3752             :                      * obtaining the requested WAL. We're going to loop back
    3753             :                      * and retry from the archive, but if it hasn't been long
    3754             :                      * since last attempt, sleep wal_retrieve_retry_interval
    3755             :                      * milliseconds to avoid busy-waiting.
    3756             :                      */
    3757         378 :                     now = GetCurrentTimestamp();
    3758         378 :                     if (!TimestampDifferenceExceeds(last_fail_time, now,
    3759             :                                                     wal_retrieve_retry_interval))
    3760             :                     {
    3761             :                         long        wait_time;
    3762             : 
    3763         444 :                         wait_time = wal_retrieve_retry_interval -
    3764         222 :                             TimestampDifferenceMilliseconds(last_fail_time, now);
    3765             : 
    3766         222 :                         elog(LOG, "waiting for WAL to become available at %X/%08X",
    3767             :                              LSN_FORMAT_ARGS(RecPtr));
    3768             : 
    3769             :                         /* Do background tasks that might benefit us later. */
    3770         222 :                         KnownAssignedTransactionIdsIdleMaintenance();
    3771             : 
    3772         222 :                         (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
    3773             :                                          WL_LATCH_SET | WL_TIMEOUT |
    3774             :                                          WL_EXIT_ON_PM_DEATH,
    3775             :                                          wait_time,
    3776             :                                          WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
    3777         222 :                         ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
    3778         222 :                         now = GetCurrentTimestamp();
    3779             : 
    3780             :                         /* Handle interrupt signals of startup process */
    3781         222 :                         ProcessStartupProcInterrupts();
    3782             :                     }
    3783         350 :                     last_fail_time = now;
    3784         350 :                     currentSource = XLOG_FROM_ARCHIVE;
    3785         350 :                     break;
    3786             : 
    3787           0 :                 default:
    3788           0 :                     elog(ERROR, "unexpected WAL source %d", currentSource);
    3789             :             }
    3790             :         }
    3791       32190 :         else if (currentSource == XLOG_FROM_PG_WAL)
    3792             :         {
    3793             :             /*
    3794             :              * We just successfully read a file in pg_wal. We prefer files in
    3795             :              * the archive over ones in pg_wal, so try the next file again
    3796             :              * from the archive first.
    3797             :              */
    3798        1974 :             if (InArchiveRecovery)
    3799           0 :                 currentSource = XLOG_FROM_ARCHIVE;
    3800             :         }
    3801             : 
    3802       33038 :         if (currentSource != oldSource)
    3803         848 :             elog(DEBUG2, "switched WAL source from %s to %s after %s",
    3804             :                  xlogSourceNames[oldSource], xlogSourceNames[currentSource],
    3805             :                  lastSourceFailed ? "failure" : "success");
    3806             : 
    3807             :         /*
    3808             :          * We've now handled possible failure. Try to read from the chosen
    3809             :          * source.
    3810             :          */
    3811       33038 :         lastSourceFailed = false;
    3812             : 
    3813       33038 :         switch (currentSource)
    3814             :         {
    3815        3676 :             case XLOG_FROM_ARCHIVE:
    3816             :             case XLOG_FROM_PG_WAL:
    3817             : 
    3818             :                 /*
    3819             :                  * WAL receiver must not be running when reading WAL from
    3820             :                  * archive or pg_wal.
    3821             :                  */
    3822             :                 Assert(!WalRcvStreaming());
    3823             : 
    3824             :                 /* Close any old file we might have open. */
    3825        3676 :                 if (readFile >= 0)
    3826             :                 {
    3827         174 :                     close(readFile);
    3828         174 :                     readFile = -1;
    3829             :                 }
    3830             :                 /* Reset curFileTLI if random fetch. */
    3831        3676 :                 if (randAccess)
    3832        2300 :                     curFileTLI = 0;
    3833             : 
    3834             :                 /*
    3835             :                  * Try to restore the file from archive, or read an existing
    3836             :                  * file from pg_wal.
    3837             :                  */
    3838        3676 :                 readFile = XLogFileReadAnyTLI(readSegNo,
    3839        3676 :                                               currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
    3840             :                                               currentSource);
    3841        3674 :                 if (readFile >= 0)
    3842        3282 :                     return XLREAD_SUCCESS;  /* success! */
    3843             : 
    3844             :                 /*
    3845             :                  * Nope, not found in archive or pg_wal.
    3846             :                  */
    3847         392 :                 lastSourceFailed = true;
    3848         392 :                 break;
    3849             : 
    3850       29362 :             case XLOG_FROM_STREAM:
    3851             :                 {
    3852             :                     bool        havedata;
    3853             : 
    3854             :                     /*
    3855             :                      * We should be able to move to XLOG_FROM_STREAM only in
    3856             :                      * standby mode.
    3857             :                      */
    3858             :                     Assert(StandbyMode);
    3859             : 
    3860             :                     /*
    3861             :                      * First, shutdown walreceiver if its restart has been
    3862             :                      * requested -- but no point if we're already slated for
    3863             :                      * starting it.
    3864             :                      */
    3865       29362 :                     if (pendingWalRcvRestart && !startWalReceiver)
    3866             :                     {
    3867          16 :                         XLogShutdownWalRcv();
    3868             : 
    3869             :                         /*
    3870             :                          * Re-scan for possible new timelines if we were
    3871             :                          * requested to recover to the latest timeline.
    3872             :                          */
    3873          16 :                         if (recoveryTargetTimeLineGoal ==
    3874             :                             RECOVERY_TARGET_TIMELINE_LATEST)
    3875          16 :                             rescanLatestTimeLine(replayTLI, replayLSN);
    3876             : 
    3877          16 :                         startWalReceiver = true;
    3878             :                     }
    3879       29362 :                     pendingWalRcvRestart = false;
    3880             : 
    3881             :                     /*
    3882             :                      * Launch walreceiver if needed.
    3883             :                      *
    3884             :                      * If fetching_ckpt is true, RecPtr points to the initial
    3885             :                      * checkpoint location. In that case, we use RedoStartLSN
    3886             :                      * as the streaming start position instead of RecPtr, so
    3887             :                      * that when we later jump backwards to start redo at
    3888             :                      * RedoStartLSN, we will have the logs streamed already.
    3889             :                      */
    3890       29362 :                     if (startWalReceiver &&
    3891         500 :                         PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
    3892             :                     {
    3893             :                         XLogRecPtr  ptr;
    3894             :                         TimeLineID  tli;
    3895             : 
    3896         402 :                         if (fetching_ckpt)
    3897             :                         {
    3898           0 :                             ptr = RedoStartLSN;
    3899           0 :                             tli = RedoStartTLI;
    3900             :                         }
    3901             :                         else
    3902             :                         {
    3903         402 :                             ptr = RecPtr;
    3904             : 
    3905             :                             /*
    3906             :                              * Use the record begin position to determine the
    3907             :                              * TLI, rather than the position we're reading.
    3908             :                              */
    3909         402 :                             tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
    3910             : 
    3911         402 :                             if (curFileTLI > 0 && tli < curFileTLI)
    3912           0 :                                 elog(ERROR, "according to history file, WAL location %X/%08X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
    3913             :                                      LSN_FORMAT_ARGS(tliRecPtr),
    3914             :                                      tli, curFileTLI);
    3915             :                         }
    3916         402 :                         curFileTLI = tli;
    3917         402 :                         SetInstallXLogFileSegmentActive();
    3918         402 :                         RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
    3919             :                                              PrimarySlotName,
    3920             :                                              wal_receiver_create_temp_slot);
    3921         402 :                         flushedUpto = 0;
    3922             :                     }
    3923             : 
    3924             :                     /*
    3925             :                      * Check if WAL receiver is active or wait to start up.
    3926             :                      */
    3927       29362 :                     if (!WalRcvStreaming())
    3928             :                     {
    3929         326 :                         lastSourceFailed = true;
    3930         326 :                         break;
    3931             :                     }
    3932             : 
    3933             :                     /*
    3934             :                      * Walreceiver is active, so see if new data has arrived.
    3935             :                      *
    3936             :                      * We only advance XLogReceiptTime when we obtain fresh
    3937             :                      * WAL from walreceiver and observe that we had already
    3938             :                      * processed everything before the most recent "chunk"
    3939             :                      * that it flushed to disk.  In steady state where we are
    3940             :                      * keeping up with the incoming data, XLogReceiptTime will
    3941             :                      * be updated on each cycle. When we are behind,
    3942             :                      * XLogReceiptTime will not advance, so the grace time
    3943             :                      * allotted to conflicting queries will decrease.
    3944             :                      */
    3945       29036 :                     if (RecPtr < flushedUpto)
    3946        3608 :                         havedata = true;
    3947             :                     else
    3948             :                     {
    3949             :                         XLogRecPtr  latestChunkStart;
    3950             : 
    3951       25428 :                         flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
    3952       25428 :                         if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
    3953             :                         {
    3954       12702 :                             havedata = true;
    3955       12702 :                             if (latestChunkStart <= RecPtr)
    3956             :                             {
    3957        8682 :                                 XLogReceiptTime = GetCurrentTimestamp();
    3958        8682 :                                 SetCurrentChunkStartTime(XLogReceiptTime);
    3959             :                             }
    3960             :                         }
    3961             :                         else
    3962       12726 :                             havedata = false;
    3963             :                     }
    3964       29036 :                     if (havedata)
    3965             :                     {
    3966             :                         /*
    3967             :                          * Great, streamed far enough.  Open the file if it's
    3968             :                          * not open already.  Also read the timeline history
    3969             :                          * file if we haven't initialized timeline history
    3970             :                          * yet; it should be streamed over and present in
    3971             :                          * pg_wal by now.  Use XLOG_FROM_STREAM so that source
    3972             :                          * info is set correctly and XLogReceiptTime isn't
    3973             :                          * changed.
    3974             :                          *
    3975             :                          * NB: We must set readTimeLineHistory based on
    3976             :                          * recoveryTargetTLI, not receiveTLI. Normally they'll
    3977             :                          * be the same, but if recovery_target_timeline is
    3978             :                          * 'latest' and archiving is configured, then it's
    3979             :                          * possible that we managed to retrieve one or more
    3980             :                          * new timeline history files from the archive,
    3981             :                          * updating recoveryTargetTLI.
    3982             :                          */
    3983       16310 :                         if (readFile < 0)
    3984             :                         {
    3985        2428 :                             if (!expectedTLEs)
    3986           0 :                                 expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
    3987        2428 :                             readFile = XLogFileRead(readSegNo, receiveTLI,
    3988             :                                                     XLOG_FROM_STREAM, false);
    3989             :                             Assert(readFile >= 0);
    3990             :                         }
    3991             :                         else
    3992             :                         {
    3993             :                             /* just make sure source info is correct... */
    3994       13882 :                             readSource = XLOG_FROM_STREAM;
    3995       13882 :                             XLogReceiptSource = XLOG_FROM_STREAM;
    3996       13882 :                             return XLREAD_SUCCESS;
    3997             :                         }
    3998        2428 :                         break;
    3999             :                     }
    4000             : 
    4001             :                     /* In nonblocking mode, return rather than sleeping. */
    4002       12726 :                     if (nonblocking)
    4003         802 :                         return XLREAD_WOULDBLOCK;
    4004             : 
    4005             :                     /*
    4006             :                      * Data not here yet. Check for trigger, then wait for
    4007             :                      * walreceiver to wake us up when new WAL arrives.
    4008             :                      */
    4009       11924 :                     if (CheckForStandbyTrigger())
    4010             :                     {
    4011             :                         /*
    4012             :                          * Note that we don't return XLREAD_FAIL immediately
    4013             :                          * here. After being triggered, we still want to
    4014             :                          * replay all the WAL that was already streamed. It's
    4015             :                          * in pg_wal now, so we just treat this as a failure,
    4016             :                          * and the state machine will move on to replay the
    4017             :                          * streamed WAL from pg_wal, and then recheck the
    4018             :                          * trigger and exit replay.
    4019             :                          */
    4020          68 :                         lastSourceFailed = true;
    4021          68 :                         break;
    4022             :                     }
    4023             : 
    4024             :                     /*
    4025             :                      * Since we have replayed everything we have received so
    4026             :                      * far and are about to start waiting for more WAL, let's
    4027             :                      * tell the upstream server our replay location now so
    4028             :                      * that pg_stat_replication doesn't show stale
    4029             :                      * information.
    4030             :                      */
    4031       11856 :                     if (!streaming_reply_sent)
    4032             :                     {
    4033        9006 :                         WalRcvForceReply();
    4034        9006 :                         streaming_reply_sent = true;
    4035             :                     }
    4036             : 
    4037             :                     /* Do any background tasks that might benefit us later. */
    4038       11856 :                     KnownAssignedTransactionIdsIdleMaintenance();
    4039             : 
    4040             :                     /* Update pg_stat_recovery_prefetch before sleeping. */
    4041       11856 :                     XLogPrefetcherComputeStats(xlogprefetcher);
    4042             : 
    4043             :                     /*
    4044             :                      * Wait for more WAL to arrive, when we will be woken
    4045             :                      * immediately by the WAL receiver.
    4046             :                      */
    4047       11856 :                     (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
    4048             :                                      WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
    4049             :                                      -1L,
    4050             :                                      WAIT_EVENT_RECOVERY_WAL_STREAM);
    4051       11856 :                     ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
    4052       11856 :                     break;
    4053             :                 }
    4054             : 
    4055           0 :             default:
    4056           0 :                 elog(ERROR, "unexpected WAL source %d", currentSource);
    4057             :         }
    4058             : 
    4059             :         /*
    4060             :          * Check for recovery pause here so that we can confirm more quickly
    4061             :          * that a requested pause has actually taken effect.
    4062             :          */
    4063       15070 :         if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
    4064             :             RECOVERY_NOT_PAUSED)
    4065           6 :             recoveryPausesHere(false);
    4066             : 
    4067             :         /*
    4068             :          * This possibly-long loop needs to handle interrupts of startup
    4069             :          * process.
    4070             :          */
    4071       15070 :         ProcessStartupProcInterrupts();
    4072             :     }
    4073             : 
    4074             :     return XLREAD_FAIL;         /* not reached */
    4075             : }
    4076             : 
    4077             : 
    4078             : /*
    4079             :  * Determine what log level should be used to report a corrupt WAL record
    4080             :  * in the current WAL page, previously read by XLogPageRead().
    4081             :  *
    4082             :  * 'emode' is the error mode that would be used to report a file-not-found
    4083             :  * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
    4084             :  * we're retrying the exact same record that we've tried previously, only
    4085             :  * complain the first time to keep the noise down.  However, we only do when
    4086             :  * reading from pg_wal, because we don't expect any invalid records in archive
    4087             :  * or in records streamed from the primary. Files in the archive should be complete,
    4088             :  * and we should never hit the end of WAL because we stop and wait for more WAL
    4089             :  * to arrive before replaying it.
    4090             :  *
    4091             :  * NOTE: This function remembers the RecPtr value it was last called with,
    4092             :  * to suppress repeated messages about the same record. Only call this when
    4093             :  * you are about to ereport(), or you might cause a later message to be
    4094             :  * erroneously suppressed.
    4095             :  */
    4096             : static int
    4097         566 : emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
    4098             : {
    4099             :     static XLogRecPtr lastComplaint = 0;
    4100             : 
    4101         566 :     if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
    4102             :     {
    4103         558 :         if (RecPtr == lastComplaint)
    4104         152 :             emode = DEBUG1;
    4105             :         else
    4106         406 :             lastComplaint = RecPtr;
    4107             :     }
    4108         566 :     return emode;
    4109             : }
    4110             : 
    4111             : 
    4112             : /*
    4113             :  * Subroutine to try to fetch and validate a prior checkpoint record.
    4114             :  */
    4115             : static XLogRecord *
    4116        1984 : ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
    4117             :                      TimeLineID replayTLI)
    4118             : {
    4119             :     XLogRecord *record;
    4120             :     uint8       info;
    4121             : 
    4122             :     Assert(xlogreader != NULL);
    4123             : 
    4124        1984 :     if (!XRecOffIsValid(RecPtr))
    4125             :     {
    4126           0 :         ereport(LOG,
    4127             :                 (errmsg("invalid checkpoint location")));
    4128           0 :         return NULL;
    4129             :     }
    4130             : 
    4131        1984 :     XLogPrefetcherBeginRead(xlogprefetcher, RecPtr);
    4132        1984 :     record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
    4133             : 
    4134        1984 :     if (record == NULL)
    4135             :     {
    4136           0 :         ereport(LOG,
    4137             :                 (errmsg("invalid checkpoint record")));
    4138           0 :         return NULL;
    4139             :     }
    4140        1984 :     if (record->xl_rmid != RM_XLOG_ID)
    4141             :     {
    4142           0 :         ereport(LOG,
    4143             :                 (errmsg("invalid resource manager ID in checkpoint record")));
    4144           0 :         return NULL;
    4145             :     }
    4146        1984 :     info = record->xl_info & ~XLR_INFO_MASK;
    4147        1984 :     if (info != XLOG_CHECKPOINT_SHUTDOWN &&
    4148             :         info != XLOG_CHECKPOINT_ONLINE)
    4149             :     {
    4150           0 :         ereport(LOG,
    4151             :                 (errmsg("invalid xl_info in checkpoint record")));
    4152           0 :         return NULL;
    4153             :     }
    4154        1984 :     if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
    4155             :     {
    4156           0 :         ereport(LOG,
    4157             :                 (errmsg("invalid length of checkpoint record")));
    4158           0 :         return NULL;
    4159             :     }
    4160        1984 :     return record;
    4161             : }
    4162             : 
    4163             : /*
    4164             :  * Scan for new timelines that might have appeared in the archive since we
    4165             :  * started recovery.
    4166             :  *
    4167             :  * If there are any, the function changes recovery target TLI to the latest
    4168             :  * one and returns 'true'.
    4169             :  */
    4170             : static bool
    4171         410 : rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
    4172             : {
    4173             :     List       *newExpectedTLEs;
    4174             :     bool        found;
    4175             :     ListCell   *cell;
    4176             :     TimeLineID  newtarget;
    4177         410 :     TimeLineID  oldtarget = recoveryTargetTLI;
    4178         410 :     TimeLineHistoryEntry *currentTle = NULL;
    4179             : 
    4180         410 :     newtarget = findNewestTimeLine(recoveryTargetTLI);
    4181         408 :     if (newtarget == recoveryTargetTLI)
    4182             :     {
    4183             :         /* No new timelines found */
    4184         394 :         return false;
    4185             :     }
    4186             : 
    4187             :     /*
    4188             :      * Determine the list of expected TLIs for the new TLI
    4189             :      */
    4190             : 
    4191          14 :     newExpectedTLEs = readTimeLineHistory(newtarget);
    4192             : 
    4193             :     /*
    4194             :      * If the current timeline is not part of the history of the new timeline,
    4195             :      * we cannot proceed to it.
    4196             :      */
    4197          14 :     found = false;
    4198          28 :     foreach(cell, newExpectedTLEs)
    4199             :     {
    4200          28 :         currentTle = (TimeLineHistoryEntry *) lfirst(cell);
    4201             : 
    4202          28 :         if (currentTle->tli == recoveryTargetTLI)
    4203             :         {
    4204          14 :             found = true;
    4205          14 :             break;
    4206             :         }
    4207             :     }
    4208          14 :     if (!found)
    4209             :     {
    4210           0 :         ereport(LOG,
    4211             :                 (errmsg("new timeline %u is not a child of database system timeline %u",
    4212             :                         newtarget,
    4213             :                         replayTLI)));
    4214           0 :         return false;
    4215             :     }
    4216             : 
    4217             :     /*
    4218             :      * The current timeline was found in the history file, but check that the
    4219             :      * next timeline was forked off from it *after* the current recovery
    4220             :      * location.
    4221             :      */
    4222          14 :     if (currentTle->end < replayLSN)
    4223             :     {
    4224           0 :         ereport(LOG,
    4225             :                 errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%08X",
    4226             :                        newtarget,
    4227             :                        replayTLI,
    4228             :                        LSN_FORMAT_ARGS(replayLSN)));
    4229           0 :         return false;
    4230             :     }
    4231             : 
    4232             :     /* The new timeline history seems valid. Switch target */
    4233          14 :     recoveryTargetTLI = newtarget;
    4234          14 :     list_free_deep(expectedTLEs);
    4235          14 :     expectedTLEs = newExpectedTLEs;
    4236             : 
    4237             :     /*
    4238             :      * As in StartupXLOG(), try to ensure we have all the history files
    4239             :      * between the old target and new target in pg_wal.
    4240             :      */
    4241          14 :     restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
    4242             : 
    4243          14 :     ereport(LOG,
    4244             :             (errmsg("new target timeline is %u",
    4245             :                     recoveryTargetTLI)));
    4246             : 
    4247          14 :     return true;
    4248             : }
    4249             : 
    4250             : 
    4251             : /*
    4252             :  * Open a logfile segment for reading (during recovery).
    4253             :  *
    4254             :  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
    4255             :  * Otherwise, it's assumed to be already available in pg_wal.
    4256             :  */
    4257             : static int
    4258        7116 : XLogFileRead(XLogSegNo segno, TimeLineID tli,
    4259             :              XLogSource source, bool notfoundOk)
    4260             : {
    4261             :     char        xlogfname[MAXFNAMELEN];
    4262             :     char        activitymsg[MAXFNAMELEN + 16];
    4263             :     char        path[MAXPGPATH];
    4264             :     int         fd;
    4265             : 
    4266        7116 :     XLogFileName(xlogfname, tli, segno, wal_segment_size);
    4267             : 
    4268        7116 :     switch (source)
    4269             :     {
    4270        1720 :         case XLOG_FROM_ARCHIVE:
    4271             :             /* Report recovery progress in PS display */
    4272        1720 :             snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
    4273             :                      xlogfname);
    4274        1720 :             set_ps_display(activitymsg);
    4275             : 
    4276        1720 :             if (!RestoreArchivedFile(path, xlogfname,
    4277             :                                      "RECOVERYXLOG",
    4278             :                                      wal_segment_size,
    4279             :                                      InRedo))
    4280         994 :                 return -1;
    4281         724 :             break;
    4282             : 
    4283        5396 :         case XLOG_FROM_PG_WAL:
    4284             :         case XLOG_FROM_STREAM:
    4285        5396 :             XLogFilePath(path, tli, segno, wal_segment_size);
    4286        5396 :             break;
    4287             : 
    4288           0 :         default:
    4289           0 :             elog(ERROR, "invalid XLogFileRead source %d", source);
    4290             :     }
    4291             : 
    4292             :     /*
    4293             :      * If the segment was fetched from archival storage, replace the existing
    4294             :      * xlog segment (if any) with the archival version.
    4295             :      */
    4296        6120 :     if (source == XLOG_FROM_ARCHIVE)
    4297             :     {
    4298             :         Assert(!IsInstallXLogFileSegmentActive());
    4299         724 :         KeepFileRestoredFromArchive(path, xlogfname);
    4300             : 
    4301             :         /*
    4302             :          * Set path to point at the new file in pg_wal.
    4303             :          */
    4304         724 :         snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
    4305             :     }
    4306             : 
    4307        6120 :     fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
    4308        6120 :     if (fd >= 0)
    4309             :     {
    4310             :         /* Success! */
    4311        5710 :         curFileTLI = tli;
    4312             : 
    4313             :         /* Report recovery progress in PS display */
    4314        5710 :         snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
    4315             :                  xlogfname);
    4316        5710 :         set_ps_display(activitymsg);
    4317             : 
    4318             :         /* Track source of data in assorted state variables */
    4319        5710 :         readSource = source;
    4320        5710 :         XLogReceiptSource = source;
    4321             :         /* In FROM_STREAM case, caller tracks receipt time, not me */
    4322        5710 :         if (source != XLOG_FROM_STREAM)
    4323        3282 :             XLogReceiptTime = GetCurrentTimestamp();
    4324             : 
    4325        5710 :         return fd;
    4326             :     }
    4327         410 :     if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
    4328           0 :         ereport(PANIC,
    4329             :                 (errcode_for_file_access(),
    4330             :                  errmsg("could not open file \"%s\": %m", path)));
    4331         410 :     return -1;
    4332             : }
    4333             : 
    4334             : /*
    4335             :  * Open a logfile segment for reading (during recovery).
    4336             :  *
    4337             :  * This version searches for the segment with any TLI listed in expectedTLEs.
    4338             :  */
    4339             : static int
    4340        3676 : XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
    4341             : {
    4342             :     char        path[MAXPGPATH];
    4343             :     ListCell   *cell;
    4344             :     int         fd;
    4345             :     List       *tles;
    4346             : 
    4347             :     /*
    4348             :      * Loop looking for a suitable timeline ID: we might need to read any of
    4349             :      * the timelines listed in expectedTLEs.
    4350             :      *
    4351             :      * We expect curFileTLI on entry to be the TLI of the preceding file in
    4352             :      * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
    4353             :      * to go backwards; this prevents us from picking up the wrong file when a
    4354             :      * parent timeline extends to higher segment numbers than the child we
    4355             :      * want to read.
    4356             :      *
    4357             :      * If we haven't read the timeline history file yet, read it now, so that
    4358             :      * we know which TLIs to scan.  We don't save the list in expectedTLEs,
    4359             :      * however, unless we actually find a valid segment.  That way if there is
    4360             :      * neither a timeline history file nor a WAL segment in the archive, and
    4361             :      * streaming replication is set up, we'll read the timeline history file
    4362             :      * streamed from the primary when we start streaming, instead of
    4363             :      * recovering with a dummy history generated here.
    4364             :      */
    4365        3676 :     if (expectedTLEs)
    4366        1692 :         tles = expectedTLEs;
    4367             :     else
    4368        1984 :         tles = readTimeLineHistory(recoveryTargetTLI);
    4369             : 
    4370        4100 :     foreach(cell, tles)
    4371             :     {
    4372        3720 :         TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
    4373        3720 :         TimeLineID  tli = hent->tli;
    4374             : 
    4375        3720 :         if (tli < curFileTLI)
    4376          12 :             break;              /* don't bother looking at too-old TLIs */
    4377             : 
    4378             :         /*
    4379             :          * Skip scanning the timeline ID that the logfile segment to read
    4380             :          * doesn't belong to
    4381             :          */
    4382        3708 :         if (XLogRecPtrIsValid(hent->begin))
    4383             :         {
    4384         156 :             XLogSegNo   beginseg = 0;
    4385             : 
    4386         156 :             XLByteToSeg(hent->begin, beginseg, wal_segment_size);
    4387             : 
    4388             :             /*
    4389             :              * The logfile segment that doesn't belong to the timeline is
    4390             :              * older or newer than the segment that the timeline started or
    4391             :              * ended at, respectively. It's sufficient to check only the
    4392             :              * starting segment of the timeline here. Since the timelines are
    4393             :              * scanned in descending order in this loop, any segments newer
    4394             :              * than the ending segment should belong to newer timeline and
    4395             :              * have already been read before. So it's not necessary to check
    4396             :              * the ending segment of the timeline here.
    4397             :              */
    4398         156 :             if (segno < beginseg)
    4399          14 :                 continue;
    4400             :         }
    4401             : 
    4402        3694 :         if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
    4403             :         {
    4404        1720 :             fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true);
    4405        1718 :             if (fd != -1)
    4406             :             {
    4407         724 :                 elog(DEBUG1, "got WAL segment from archive");
    4408         724 :                 if (!expectedTLEs)
    4409          36 :                     expectedTLEs = tles;
    4410        3282 :                 return fd;
    4411             :             }
    4412             :         }
    4413             : 
    4414        2968 :         if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
    4415             :         {
    4416        2968 :             fd = XLogFileRead(segno, tli, XLOG_FROM_PG_WAL, true);
    4417        2968 :             if (fd != -1)
    4418             :             {
    4419        2558 :                 if (!expectedTLEs)
    4420        1948 :                     expectedTLEs = tles;
    4421        2558 :                 return fd;
    4422             :             }
    4423             :         }
    4424             :     }
    4425             : 
    4426             :     /* Couldn't find it.  For simplicity, complain about front timeline */
    4427         392 :     XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
    4428         392 :     errno = ENOENT;
    4429         392 :     ereport(DEBUG2,
    4430             :             (errcode_for_file_access(),
    4431             :              errmsg("could not open file \"%s\": %m", path)));
    4432         392 :     return -1;
    4433             : }
    4434             : 
    4435             : /*
    4436             :  * Set flag to signal the walreceiver to restart.  (The startup process calls
    4437             :  * this on noticing a relevant configuration change.)
    4438             :  */
    4439             : void
    4440          24 : StartupRequestWalReceiverRestart(void)
    4441             : {
    4442          24 :     if (currentSource == XLOG_FROM_STREAM && WalRcvRunning())
    4443             :     {
    4444          16 :         ereport(LOG,
    4445             :                 (errmsg("WAL receiver process shutdown requested")));
    4446             : 
    4447          16 :         pendingWalRcvRestart = true;
    4448             :     }
    4449          24 : }
    4450             : 
    4451             : 
    4452             : /*
    4453             :  * Has a standby promotion already been triggered?
    4454             :  *
    4455             :  * Unlike CheckForStandbyTrigger(), this works in any process
    4456             :  * that's connected to shared memory.
    4457             :  */
    4458             : bool
    4459         142 : PromoteIsTriggered(void)
    4460             : {
    4461             :     /*
    4462             :      * We check shared state each time only until a standby promotion is
    4463             :      * triggered. We can't trigger a promotion again, so there's no need to
    4464             :      * keep checking after the shared variable has once been seen true.
    4465             :      */
    4466         142 :     if (LocalPromoteIsTriggered)
    4467         100 :         return true;
    4468             : 
    4469          42 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    4470          42 :     LocalPromoteIsTriggered = XLogRecoveryCtl->SharedPromoteIsTriggered;
    4471          42 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    4472             : 
    4473          42 :     return LocalPromoteIsTriggered;
    4474             : }
    4475             : 
    4476             : static void
    4477          94 : SetPromoteIsTriggered(void)
    4478             : {
    4479          94 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    4480          94 :     XLogRecoveryCtl->SharedPromoteIsTriggered = true;
    4481          94 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    4482             : 
    4483             :     /*
    4484             :      * Mark the recovery pause state as 'not paused' because the paused state
    4485             :      * ends and promotion continues if a promotion is triggered while recovery
    4486             :      * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
    4487             :      * return 'paused' while a promotion is ongoing.
    4488             :      */
    4489          94 :     SetRecoveryPause(false);
    4490             : 
    4491          94 :     LocalPromoteIsTriggered = true;
    4492          94 : }
    4493             : 
    4494             : /*
    4495             :  * Check whether a promote request has arrived.
    4496             :  */
    4497             : static bool
    4498       13138 : CheckForStandbyTrigger(void)
    4499             : {
    4500       13138 :     if (LocalPromoteIsTriggered)
    4501         114 :         return true;
    4502             : 
    4503       13024 :     if (IsPromoteSignaled() && CheckPromoteSignal())
    4504             :     {
    4505          94 :         ereport(LOG, (errmsg("received promote request")));
    4506          94 :         RemovePromoteSignalFiles();
    4507          94 :         ResetPromoteSignaled();
    4508          94 :         SetPromoteIsTriggered();
    4509          94 :         return true;
    4510             :     }
    4511             : 
    4512       12930 :     return false;
    4513             : }
    4514             : 
    4515             : /*
    4516             :  * Remove the files signaling a standby promotion request.
    4517             :  */
    4518             : void
    4519        1910 : RemovePromoteSignalFiles(void)
    4520             : {
    4521        1910 :     unlink(PROMOTE_SIGNAL_FILE);
    4522        1910 : }
    4523             : 
    4524             : /*
    4525             :  * Check to see if a promote request has arrived.
    4526             :  */
    4527             : bool
    4528        1742 : CheckPromoteSignal(void)
    4529             : {
    4530             :     struct stat stat_buf;
    4531             : 
    4532        1742 :     if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
    4533         190 :         return true;
    4534             : 
    4535        1552 :     return false;
    4536             : }
    4537             : 
    4538             : /*
    4539             :  * Wake up startup process to replay newly arrived WAL, or to notice that
    4540             :  * failover has been requested.
    4541             :  */
    4542             : void
    4543       68972 : WakeupRecovery(void)
    4544             : {
    4545       68972 :     SetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
    4546       68972 : }
    4547             : 
    4548             : /*
    4549             :  * Schedule a walreceiver wakeup in the main recovery loop.
    4550             :  */
    4551             : void
    4552           4 : XLogRequestWalReceiverReply(void)
    4553             : {
    4554           4 :     doRequestWalReceiverReply = true;
    4555           4 : }
    4556             : 
    4557             : /*
    4558             :  * Is HotStandby active yet? This is only important in special backends
    4559             :  * since normal backends won't ever be able to connect until this returns
    4560             :  * true. Postmaster knows this by way of signal, not via shared memory.
    4561             :  *
    4562             :  * Unlike testing standbyState, this works in any process that's connected to
    4563             :  * shared memory.  (And note that standbyState alone doesn't tell the truth
    4564             :  * anyway.)
    4565             :  */
    4566             : bool
    4567         328 : HotStandbyActive(void)
    4568             : {
    4569             :     /*
    4570             :      * We check shared state each time only until Hot Standby is active. We
    4571             :      * can't de-activate Hot Standby, so there's no need to keep checking
    4572             :      * after the shared variable has once been seen true.
    4573             :      */
    4574         328 :     if (LocalHotStandbyActive)
    4575          50 :         return true;
    4576             :     else
    4577             :     {
    4578             :         /* spinlock is essential on machines with weak memory ordering! */
    4579         278 :         SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    4580         278 :         LocalHotStandbyActive = XLogRecoveryCtl->SharedHotStandbyActive;
    4581         278 :         SpinLockRelease(&XLogRecoveryCtl->info_lck);
    4582             : 
    4583         278 :         return LocalHotStandbyActive;
    4584             :     }
    4585             : }
    4586             : 
    4587             : /*
    4588             :  * Like HotStandbyActive(), but to be used only in WAL replay code,
    4589             :  * where we don't need to ask any other process what the state is.
    4590             :  */
    4591             : static bool
    4592           0 : HotStandbyActiveInReplay(void)
    4593             : {
    4594             :     Assert(AmStartupProcess() || !IsPostmasterEnvironment);
    4595           0 :     return LocalHotStandbyActive;
    4596             : }
    4597             : 
    4598             : /*
    4599             :  * Get latest redo apply position.
    4600             :  *
    4601             :  * Exported to allow WALReceiver to read the pointer directly.
    4602             :  */
    4603             : XLogRecPtr
    4604      172254 : GetXLogReplayRecPtr(TimeLineID *replayTLI)
    4605             : {
    4606             :     XLogRecPtr  recptr;
    4607             :     TimeLineID  tli;
    4608             : 
    4609      172254 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    4610      172254 :     recptr = XLogRecoveryCtl->lastReplayedEndRecPtr;
    4611      172254 :     tli = XLogRecoveryCtl->lastReplayedTLI;
    4612      172254 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    4613             : 
    4614      172254 :     if (replayTLI)
    4615        7242 :         *replayTLI = tli;
    4616      172254 :     return recptr;
    4617             : }
    4618             : 
    4619             : 
    4620             : /*
    4621             :  * Get position of the last applied record, or of the record being applied.
    4622             :  *
    4623             :  * This is different from GetXLogReplayRecPtr() in that if a WAL
    4624             :  * record is currently being applied, this includes that record.
    4625             :  */
    4626             : XLogRecPtr
    4627       12486 : GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
    4628             : {
    4629             :     XLogRecPtr  recptr;
    4630             :     TimeLineID  tli;
    4631             : 
    4632       12486 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    4633       12486 :     recptr = XLogRecoveryCtl->replayEndRecPtr;
    4634       12486 :     tli = XLogRecoveryCtl->replayEndTLI;
    4635       12486 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    4636             : 
    4637       12486 :     if (replayEndTLI)
    4638       12486 :         *replayEndTLI = tli;
    4639       12486 :     return recptr;
    4640             : }
    4641             : 
    4642             : /*
    4643             :  * Save timestamp of latest processed commit/abort record.
    4644             :  *
    4645             :  * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
    4646             :  * seen by processes other than the startup process.  Note in particular
    4647             :  * that CreateRestartPoint is executed in the checkpointer.
    4648             :  */
    4649             : static void
    4650       44412 : SetLatestXTime(TimestampTz xtime)
    4651             : {
    4652       44412 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    4653       44412 :     XLogRecoveryCtl->recoveryLastXTime = xtime;
    4654       44412 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    4655       44412 : }
    4656             : 
    4657             : /*
    4658             :  * Fetch timestamp of latest processed commit/abort record.
    4659             :  */
    4660             : TimestampTz
    4661         696 : GetLatestXTime(void)
    4662             : {
    4663             :     TimestampTz xtime;
    4664             : 
    4665         696 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    4666         696 :     xtime = XLogRecoveryCtl->recoveryLastXTime;
    4667         696 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    4668             : 
    4669         696 :     return xtime;
    4670             : }
    4671             : 
    4672             : /*
    4673             :  * Save timestamp of the next chunk of WAL records to apply.
    4674             :  *
    4675             :  * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
    4676             :  * seen by all backends.
    4677             :  */
    4678             : static void
    4679        8682 : SetCurrentChunkStartTime(TimestampTz xtime)
    4680             : {
    4681        8682 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    4682        8682 :     XLogRecoveryCtl->currentChunkStartTime = xtime;
    4683        8682 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    4684        8682 : }
    4685             : 
    4686             : /*
    4687             :  * Fetch timestamp of latest processed commit/abort record.
    4688             :  * Startup process maintains an accurate local copy in XLogReceiptTime
    4689             :  */
    4690             : TimestampTz
    4691         604 : GetCurrentChunkReplayStartTime(void)
    4692             : {
    4693             :     TimestampTz xtime;
    4694             : 
    4695         604 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    4696         604 :     xtime = XLogRecoveryCtl->currentChunkStartTime;
    4697         604 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    4698             : 
    4699         604 :     return xtime;
    4700             : }
    4701             : 
    4702             : /*
    4703             :  * Returns time of receipt of current chunk of XLOG data, as well as
    4704             :  * whether it was received from streaming replication or from archives.
    4705             :  */
    4706             : void
    4707          54 : GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
    4708             : {
    4709             :     /*
    4710             :      * This must be executed in the startup process, since we don't export the
    4711             :      * relevant state to shared memory.
    4712             :      */
    4713             :     Assert(InRecovery);
    4714             : 
    4715          54 :     *rtime = XLogReceiptTime;
    4716          54 :     *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
    4717          54 : }
    4718             : 
    4719             : /*
    4720             :  * Verify that currValue is not below minValue (the primary's setting).  The
    4721             :  * text supplied in param_name is a parameter name and needs no translation.
    4722             :  */
    4723             : void
    4724        1360 : RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
    4725             : {
    4726        1360 :     if (currValue < minValue)
    4727             :     {
    4728           0 :         if (HotStandbyActiveInReplay())
    4729             :         {
    4730           0 :             bool        warned_for_promote = false;
    4731             : 
    4732           0 :             ereport(WARNING,
    4733             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4734             :                      errmsg("hot standby is not possible because of insufficient parameter settings"),
    4735             :                      errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
    4736             :                                param_name,
    4737             :                                currValue,
    4738             :                                minValue)));
    4739             : 
    4740           0 :             SetRecoveryPause(true);
    4741             : 
    4742           0 :             ereport(LOG,
    4743             :                     (errmsg("recovery has paused"),
    4744             :                      errdetail("If recovery is unpaused, the server will shut down."),
    4745             :                      errhint("You can then restart the server after making the necessary configuration changes.")));
    4746             : 
    4747           0 :             while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
    4748             :             {
    4749           0 :                 ProcessStartupProcInterrupts();
    4750             : 
    4751           0 :                 if (CheckForStandbyTrigger())
    4752             :                 {
    4753           0 :                     if (!warned_for_promote)
    4754           0 :                         ereport(WARNING,
    4755             :                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4756             :                                  errmsg("promotion is not possible because of insufficient parameter settings"),
    4757             : 
    4758             :                         /*
    4759             :                          * Repeat the detail from above so it's easy to find
    4760             :                          * in the log.
    4761             :                          */
    4762             :                                  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
    4763             :                                            param_name,
    4764             :                                            currValue,
    4765             :                                            minValue),
    4766             :                                  errhint("Restart the server after making the necessary configuration changes.")));
    4767           0 :                     warned_for_promote = true;
    4768             :                 }
    4769             : 
    4770             :                 /*
    4771             :                  * If recovery pause is requested then set it paused.  While
    4772             :                  * we are in the loop, user might resume and pause again so
    4773             :                  * set this every time.
    4774             :                  */
    4775           0 :                 ConfirmRecoveryPaused();
    4776             : 
    4777             :                 /*
    4778             :                  * We wait on a condition variable that will wake us as soon
    4779             :                  * as the pause ends, but we use a timeout so we can check the
    4780             :                  * above conditions periodically too.
    4781             :                  */
    4782           0 :                 ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
    4783             :                                             WAIT_EVENT_RECOVERY_PAUSE);
    4784             :             }
    4785           0 :             ConditionVariableCancelSleep();
    4786             :         }
    4787             : 
    4788           0 :         ereport(FATAL,
    4789             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4790             :                  errmsg("recovery aborted because of insufficient parameter settings"),
    4791             :         /* Repeat the detail from above so it's easy to find in the log. */
    4792             :                  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
    4793             :                            param_name,
    4794             :                            currValue,
    4795             :                            minValue),
    4796             :                  errhint("You can restart the server after making the necessary configuration changes.")));
    4797             :     }
    4798        1360 : }
    4799             : 
    4800             : 
    4801             : /*
    4802             :  * GUC check_hook for primary_slot_name
    4803             :  */
    4804             : bool
    4805        2752 : check_primary_slot_name(char **newval, void **extra, GucSource source)
    4806             : {
    4807             :     int         err_code;
    4808        2752 :     char       *err_msg = NULL;
    4809        2752 :     char       *err_hint = NULL;
    4810             : 
    4811        2752 :     if (*newval && strcmp(*newval, "") != 0 &&
    4812         402 :         !ReplicationSlotValidateNameInternal(*newval, false, &err_code,
    4813             :                                              &err_msg, &err_hint))
    4814             :     {
    4815           0 :         GUC_check_errcode(err_code);
    4816           0 :         GUC_check_errdetail("%s", err_msg);
    4817           0 :         if (err_hint != NULL)
    4818           0 :             GUC_check_errhint("%s", err_hint);
    4819           0 :         return false;
    4820             :     }
    4821             : 
    4822        2752 :     return true;
    4823             : }
    4824             : 
    4825             : /*
    4826             :  * Recovery target settings: Only one of the several recovery_target* settings
    4827             :  * may be set.  Setting a second one results in an error.  The global variable
    4828             :  * recoveryTarget tracks which kind of recovery target was chosen.  Other
    4829             :  * variables store the actual target value (for example a string or a xid).
    4830             :  * The assign functions of the parameters check whether a competing parameter
    4831             :  * was already set.  But we want to allow setting the same parameter multiple
    4832             :  * times.  We also want to allow unsetting a parameter and setting a different
    4833             :  * one, so we unset recoveryTarget when the parameter is set to an empty
    4834             :  * string.
    4835             :  *
    4836             :  * XXX this code is broken by design.  Throwing an error from a GUC assign
    4837             :  * hook breaks fundamental assumptions of guc.c.  So long as all the variables
    4838             :  * for which this can happen are PGC_POSTMASTER, the consequences are limited,
    4839             :  * since we'd just abort postmaster startup anyway.  Nonetheless it's likely
    4840             :  * that we have odd behaviors such as unexpected GUC ordering dependencies.
    4841             :  */
    4842             : 
    4843             : pg_noreturn static void
    4844           2 : error_multiple_recovery_targets(void)
    4845             : {
    4846           2 :     ereport(ERROR,
    4847             :             (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4848             :              errmsg("multiple recovery targets specified"),
    4849             :              errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
    4850             : }
    4851             : 
    4852             : /*
    4853             :  * GUC check_hook for recovery_target
    4854             :  */
    4855             : bool
    4856        2350 : check_recovery_target(char **newval, void **extra, GucSource source)
    4857             : {
    4858        2350 :     if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
    4859             :     {
    4860           0 :         GUC_check_errdetail("The only allowed value is \"immediate\".");
    4861           0 :         return false;
    4862             :     }
    4863        2350 :     return true;
    4864             : }
    4865             : 
    4866             : /*
    4867             :  * GUC assign_hook for recovery_target
    4868             :  */
    4869             : void
    4870        2350 : assign_recovery_target(const char *newval, void *extra)
    4871             : {
    4872        2350 :     if (recoveryTarget != RECOVERY_TARGET_UNSET &&
    4873           0 :         recoveryTarget != RECOVERY_TARGET_IMMEDIATE)
    4874           0 :         error_multiple_recovery_targets();
    4875             : 
    4876        2350 :     if (newval && strcmp(newval, "") != 0)
    4877           2 :         recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
    4878             :     else
    4879        2348 :         recoveryTarget = RECOVERY_TARGET_UNSET;
    4880        2350 : }
    4881             : 
    4882             : /*
    4883             :  * GUC check_hook for recovery_target_lsn
    4884             :  */
    4885             : bool
    4886        2362 : check_recovery_target_lsn(char **newval, void **extra, GucSource source)
    4887             : {
    4888        2362 :     if (strcmp(*newval, "") != 0)
    4889             :     {
    4890             :         XLogRecPtr  lsn;
    4891             :         XLogRecPtr *myextra;
    4892          16 :         ErrorSaveContext escontext = {T_ErrorSaveContext};
    4893             : 
    4894          16 :         lsn = pg_lsn_in_safe(*newval, (Node *) &escontext);
    4895          16 :         if (escontext.error_occurred)
    4896           0 :             return false;
    4897             : 
    4898          16 :         myextra = (XLogRecPtr *) guc_malloc(LOG, sizeof(XLogRecPtr));
    4899          16 :         if (!myextra)
    4900           0 :             return false;
    4901          16 :         *myextra = lsn;
    4902          16 :         *extra = myextra;
    4903             :     }
    4904        2362 :     return true;
    4905             : }
    4906             : 
    4907             : /*
    4908             :  * GUC assign_hook for recovery_target_lsn
    4909             :  */
    4910             : void
    4911        2362 : assign_recovery_target_lsn(const char *newval, void *extra)
    4912             : {
    4913        2362 :     if (recoveryTarget != RECOVERY_TARGET_UNSET &&
    4914           0 :         recoveryTarget != RECOVERY_TARGET_LSN)
    4915           0 :         error_multiple_recovery_targets();
    4916             : 
    4917        2362 :     if (newval && strcmp(newval, "") != 0)
    4918             :     {
    4919          16 :         recoveryTarget = RECOVERY_TARGET_LSN;
    4920          16 :         recoveryTargetLSN = *((XLogRecPtr *) extra);
    4921             :     }
    4922             :     else
    4923        2346 :         recoveryTarget = RECOVERY_TARGET_UNSET;
    4924        2362 : }
    4925             : 
    4926             : /*
    4927             :  * GUC check_hook for recovery_target_name
    4928             :  */
    4929             : bool
    4930        2362 : check_recovery_target_name(char **newval, void **extra, GucSource source)
    4931             : {
    4932             :     /* Use the value of newval directly */
    4933        2362 :     if (strlen(*newval) >= MAXFNAMELEN)
    4934             :     {
    4935           0 :         GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
    4936             :                             "recovery_target_name", MAXFNAMELEN - 1);
    4937           0 :         return false;
    4938             :     }
    4939        2362 :     return true;
    4940             : }
    4941             : 
    4942             : /*
    4943             :  * GUC assign_hook for recovery_target_name
    4944             :  */
    4945             : void
    4946        2362 : assign_recovery_target_name(const char *newval, void *extra)
    4947             : {
    4948        2362 :     if (recoveryTarget != RECOVERY_TARGET_UNSET &&
    4949           0 :         recoveryTarget != RECOVERY_TARGET_NAME)
    4950           0 :         error_multiple_recovery_targets();
    4951             : 
    4952        2362 :     if (newval && strcmp(newval, "") != 0)
    4953             :     {
    4954          12 :         recoveryTarget = RECOVERY_TARGET_NAME;
    4955          12 :         recoveryTargetName = newval;
    4956             :     }
    4957             :     else
    4958        2350 :         recoveryTarget = RECOVERY_TARGET_UNSET;
    4959        2362 : }
    4960             : 
    4961             : /*
    4962             :  * GUC check_hook for recovery_target_time
    4963             :  *
    4964             :  * The interpretation of the recovery_target_time string can depend on the
    4965             :  * time zone setting, so we need to wait until after all GUC processing is
    4966             :  * done before we can do the final parsing of the string.  This check function
    4967             :  * only does a parsing pass to catch syntax errors, but we store the string
    4968             :  * and parse it again when we need to use it.
    4969             :  */
    4970             : bool
    4971        2354 : check_recovery_target_time(char **newval, void **extra, GucSource source)
    4972             : {
    4973        2354 :     if (strcmp(*newval, "") != 0)
    4974             :     {
    4975             :         /* reject some special values */
    4976           6 :         if (strcmp(*newval, "now") == 0 ||
    4977           6 :             strcmp(*newval, "today") == 0 ||
    4978           6 :             strcmp(*newval, "tomorrow") == 0 ||
    4979           6 :             strcmp(*newval, "yesterday") == 0)
    4980             :         {
    4981           0 :             return false;
    4982             :         }
    4983             : 
    4984             :         /*
    4985             :          * parse timestamp value (see also timestamptz_in())
    4986             :          */
    4987             :         {
    4988           6 :             char       *str = *newval;
    4989             :             fsec_t      fsec;
    4990             :             struct pg_tm tt,
    4991           6 :                        *tm = &tt;
    4992             :             int         tz;
    4993             :             int         dtype;
    4994             :             int         nf;
    4995             :             int         dterr;
    4996             :             char       *field[MAXDATEFIELDS];
    4997             :             int         ftype[MAXDATEFIELDS];
    4998             :             char        workbuf[MAXDATELEN + MAXDATEFIELDS];
    4999             :             DateTimeErrorExtra dtextra;
    5000             :             TimestampTz timestamp;
    5001             : 
    5002           6 :             dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
    5003             :                                   field, ftype, MAXDATEFIELDS, &nf);
    5004           6 :             if (dterr == 0)
    5005           6 :                 dterr = DecodeDateTime(field, ftype, nf,
    5006             :                                        &dtype, tm, &fsec, &tz, &dtextra);
    5007           6 :             if (dterr != 0)
    5008           0 :                 return false;
    5009           6 :             if (dtype != DTK_DATE)
    5010           0 :                 return false;
    5011             : 
    5012           6 :             if (tm2timestamp(tm, fsec, &tz, &timestamp) != 0)
    5013             :             {
    5014           0 :                 GUC_check_errdetail("Timestamp out of range: \"%s\".", str);
    5015           0 :                 return false;
    5016             :             }
    5017             :         }
    5018             :     }
    5019        2354 :     return true;
    5020             : }
    5021             : 
    5022             : /*
    5023             :  * GUC assign_hook for recovery_target_time
    5024             :  */
    5025             : void
    5026        2354 : assign_recovery_target_time(const char *newval, void *extra)
    5027             : {
    5028        2354 :     if (recoveryTarget != RECOVERY_TARGET_UNSET &&
    5029           2 :         recoveryTarget != RECOVERY_TARGET_TIME)
    5030           2 :         error_multiple_recovery_targets();
    5031             : 
    5032        2352 :     if (newval && strcmp(newval, "") != 0)
    5033           4 :         recoveryTarget = RECOVERY_TARGET_TIME;
    5034             :     else
    5035        2348 :         recoveryTarget = RECOVERY_TARGET_UNSET;
    5036        2352 : }
    5037             : 
    5038             : /*
    5039             :  * GUC check_hook for recovery_target_timeline
    5040             :  */
    5041             : bool
    5042        2356 : check_recovery_target_timeline(char **newval, void **extra, GucSource source)
    5043             : {
    5044             :     RecoveryTargetTimeLineGoal rttg;
    5045             :     RecoveryTargetTimeLineGoal *myextra;
    5046             : 
    5047        2356 :     if (strcmp(*newval, "current") == 0)
    5048           0 :         rttg = RECOVERY_TARGET_TIMELINE_CONTROLFILE;
    5049        2356 :     else if (strcmp(*newval, "latest") == 0)
    5050        2350 :         rttg = RECOVERY_TARGET_TIMELINE_LATEST;
    5051             :     else
    5052             :     {
    5053             :         char       *endp;
    5054             :         uint64      timeline;
    5055             : 
    5056           6 :         rttg = RECOVERY_TARGET_TIMELINE_NUMERIC;
    5057             : 
    5058           6 :         errno = 0;
    5059           6 :         timeline = strtou64(*newval, &endp, 0);
    5060             : 
    5061           6 :         if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
    5062             :         {
    5063           2 :             GUC_check_errdetail("\"%s\" is not a valid number.",
    5064             :                                 "recovery_target_timeline");
    5065           6 :             return false;
    5066             :         }
    5067             : 
    5068           4 :         if (timeline < 1 || timeline > PG_UINT32_MAX)
    5069             :         {
    5070           4 :             GUC_check_errdetail("\"%s\" must be between %u and %u.",
    5071             :                                 "recovery_target_timeline", 1, UINT_MAX);
    5072           4 :             return false;
    5073             :         }
    5074             :     }
    5075             : 
    5076        2350 :     myextra = (RecoveryTargetTimeLineGoal *) guc_malloc(LOG, sizeof(RecoveryTargetTimeLineGoal));
    5077        2350 :     if (!myextra)
    5078           0 :         return false;
    5079        2350 :     *myextra = rttg;
    5080        2350 :     *extra = myextra;
    5081             : 
    5082        2350 :     return true;
    5083             : }
    5084             : 
    5085             : /*
    5086             :  * GUC assign_hook for recovery_target_timeline
    5087             :  */
    5088             : void
    5089        2350 : assign_recovery_target_timeline(const char *newval, void *extra)
    5090             : {
    5091        2350 :     recoveryTargetTimeLineGoal = *((RecoveryTargetTimeLineGoal *) extra);
    5092        2350 :     if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
    5093           0 :         recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
    5094             :     else
    5095        2350 :         recoveryTargetTLIRequested = 0;
    5096        2350 : }
    5097             : 
    5098             : /*
    5099             :  * GUC check_hook for recovery_target_xid
    5100             :  */
    5101             : bool
    5102        2350 : check_recovery_target_xid(char **newval, void **extra, GucSource source)
    5103             : {
    5104        2350 :     if (strcmp(*newval, "") != 0)
    5105             :     {
    5106             :         TransactionId xid;
    5107             :         TransactionId *myextra;
    5108             : 
    5109           2 :         errno = 0;
    5110           2 :         xid = (TransactionId) strtou64(*newval, NULL, 0);
    5111           2 :         if (errno == EINVAL || errno == ERANGE)
    5112           0 :             return false;
    5113             : 
    5114           2 :         myextra = (TransactionId *) guc_malloc(LOG, sizeof(TransactionId));
    5115           2 :         if (!myextra)
    5116           0 :             return false;
    5117           2 :         *myextra = xid;
    5118           2 :         *extra = myextra;
    5119             :     }
    5120        2350 :     return true;
    5121             : }
    5122             : 
    5123             : /*
    5124             :  * GUC assign_hook for recovery_target_xid
    5125             :  */
    5126             : void
    5127        2350 : assign_recovery_target_xid(const char *newval, void *extra)
    5128             : {
    5129        2350 :     if (recoveryTarget != RECOVERY_TARGET_UNSET &&
    5130           0 :         recoveryTarget != RECOVERY_TARGET_XID)
    5131           0 :         error_multiple_recovery_targets();
    5132             : 
    5133        2350 :     if (newval && strcmp(newval, "") != 0)
    5134             :     {
    5135           2 :         recoveryTarget = RECOVERY_TARGET_XID;
    5136           2 :         recoveryTargetXid = *((TransactionId *) extra);
    5137             :     }
    5138             :     else
    5139        2348 :         recoveryTarget = RECOVERY_TARGET_UNSET;
    5140        2350 : }

Generated by: LCOV version 1.16