LCOV - code coverage report
Current view: top level - src/backend/access/transam - xlogrecovery.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 86.6 % 1435 1242
Test Date: 2026-03-01 18:15:11 Functions: 98.6 % 69 68
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * xlogrecovery.c
       4              :  *      Functions for WAL recovery, standby mode
       5              :  *
       6              :  * This source file contains functions controlling WAL recovery.
       7              :  * InitWalRecovery() initializes the system for crash or archive recovery,
       8              :  * or standby mode, depending on configuration options and the state of
       9              :  * the control file and possible backup label file.  PerformWalRecovery()
      10              :  * performs the actual WAL replay, calling the rmgr-specific redo routines.
      11              :  * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
      12              :  * and prepares information needed to initialize the WAL for writes.  In
      13              :  * addition to these three main functions, there are a bunch of functions
      14              :  * for interrogating recovery state and controlling the recovery process.
      15              :  *
      16              :  *
      17              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      18              :  * Portions Copyright (c) 1994, Regents of the University of California
      19              :  *
      20              :  * src/backend/access/transam/xlogrecovery.c
      21              :  *
      22              :  *-------------------------------------------------------------------------
      23              :  */
      24              : 
      25              : #include "postgres.h"
      26              : 
      27              : #include <ctype.h>
      28              : #include <time.h>
      29              : #include <sys/stat.h>
      30              : #include <sys/time.h>
      31              : #include <unistd.h>
      32              : 
      33              : #include "access/timeline.h"
      34              : #include "access/transam.h"
      35              : #include "access/xact.h"
      36              : #include "access/xlog_internal.h"
      37              : #include "access/xlogarchive.h"
      38              : #include "access/xlogprefetcher.h"
      39              : #include "access/xlogreader.h"
      40              : #include "access/xlogrecovery.h"
      41              : #include "access/xlogutils.h"
      42              : #include "access/xlogwait.h"
      43              : #include "backup/basebackup.h"
      44              : #include "catalog/pg_control.h"
      45              : #include "commands/tablespace.h"
      46              : #include "common/file_utils.h"
      47              : #include "miscadmin.h"
      48              : #include "nodes/miscnodes.h"
      49              : #include "pgstat.h"
      50              : #include "postmaster/bgwriter.h"
      51              : #include "postmaster/startup.h"
      52              : #include "replication/slot.h"
      53              : #include "replication/slotsync.h"
      54              : #include "replication/walreceiver.h"
      55              : #include "storage/fd.h"
      56              : #include "storage/ipc.h"
      57              : #include "storage/latch.h"
      58              : #include "storage/pmsignal.h"
      59              : #include "storage/procarray.h"
      60              : #include "storage/spin.h"
      61              : #include "utils/datetime.h"
      62              : #include "utils/fmgrprotos.h"
      63              : #include "utils/guc_hooks.h"
      64              : #include "utils/pgstat_internal.h"
      65              : #include "utils/pg_lsn.h"
      66              : #include "utils/ps_status.h"
      67              : #include "utils/pg_rusage.h"
      68              : 
      69              : /* Unsupported old recovery command file names (relative to $PGDATA) */
      70              : #define RECOVERY_COMMAND_FILE   "recovery.conf"
      71              : #define RECOVERY_COMMAND_DONE   "recovery.done"
      72              : 
      73              : /*
      74              :  * GUC support
      75              :  */
      76              : const struct config_enum_entry recovery_target_action_options[] = {
      77              :     {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
      78              :     {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
      79              :     {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
      80              :     {NULL, 0, false}
      81              : };
      82              : 
      83              : /* options formerly taken from recovery.conf for archive recovery */
      84              : char       *recoveryRestoreCommand = NULL;
      85              : char       *recoveryEndCommand = NULL;
      86              : char       *archiveCleanupCommand = NULL;
      87              : RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
      88              : bool        recoveryTargetInclusive = true;
      89              : int         recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
      90              : TransactionId recoveryTargetXid;
      91              : char       *recovery_target_time_string;
      92              : TimestampTz recoveryTargetTime;
      93              : const char *recoveryTargetName;
      94              : XLogRecPtr  recoveryTargetLSN;
      95              : int         recovery_min_apply_delay = 0;
      96              : 
      97              : /* options formerly taken from recovery.conf for XLOG streaming */
      98              : char       *PrimaryConnInfo = NULL;
      99              : char       *PrimarySlotName = NULL;
     100              : bool        wal_receiver_create_temp_slot = false;
     101              : 
     102              : /*
     103              :  * recoveryTargetTimeLineGoal: what the user requested, if any
     104              :  *
     105              :  * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
     106              :  *
     107              :  * recoveryTargetTLI: the currently understood target timeline; this can change
     108              :  *
     109              :  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
     110              :  * the timelines of its known parents, newest first (so recoveryTargetTLI is
     111              :  * always the first list member).  Only these TLIs are expected to be seen in
     112              :  * the WAL segments we read, and indeed only these TLIs will be considered as
     113              :  * candidate WAL files to open at all.
     114              :  *
     115              :  * curFileTLI: the TLI appearing in the name of the current input WAL file.
     116              :  * (This is not necessarily the same as the timeline from which we are
     117              :  * replaying WAL, which StartupXLOG calls replayTLI, because we could be
     118              :  * scanning data that was copied from an ancestor timeline when the current
     119              :  * file was created.)  During a sequential scan we do not allow this value
     120              :  * to decrease.
     121              :  */
     122              : RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
     123              : TimeLineID  recoveryTargetTLIRequested = 0;
     124              : TimeLineID  recoveryTargetTLI = 0;
     125              : static List *expectedTLEs;
     126              : static TimeLineID curFileTLI;
     127              : 
     128              : /*
     129              :  * When ArchiveRecoveryRequested is set, archive recovery was requested,
     130              :  * ie. signal files were present.  When InArchiveRecovery is set, we are
     131              :  * currently recovering using offline XLOG archives.  These variables are only
     132              :  * valid in the startup process.
     133              :  *
     134              :  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
     135              :  * currently performing crash recovery using only XLOG files in pg_wal, but
     136              :  * will switch to using offline XLOG archives as soon as we reach the end of
     137              :  * WAL in pg_wal.
     138              :  */
     139              : bool        ArchiveRecoveryRequested = false;
     140              : bool        InArchiveRecovery = false;
     141              : 
     142              : /*
     143              :  * When StandbyModeRequested is set, standby mode was requested, i.e.
     144              :  * standby.signal file was present.  When StandbyMode is set, we are currently
     145              :  * in standby mode.  These variables are only valid in the startup process.
     146              :  * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
     147              :  */
     148              : static bool StandbyModeRequested = false;
     149              : bool        StandbyMode = false;
     150              : 
     151              : /* was a signal file present at startup? */
     152              : static bool standby_signal_file_found = false;
     153              : static bool recovery_signal_file_found = false;
     154              : 
     155              : /*
     156              :  * CheckPointLoc is the position of the checkpoint record that determines
     157              :  * where to start the replay.  It comes from the backup label file or the
     158              :  * control file.
     159              :  *
     160              :  * RedoStartLSN is the checkpoint's REDO location, also from the backup label
     161              :  * file or the control file.  In standby mode, XLOG streaming usually starts
     162              :  * from the position where an invalid record was found.  But if we fail to
     163              :  * read even the initial checkpoint record, we use the REDO location instead
     164              :  * of the checkpoint location as the start position of XLOG streaming.
     165              :  * Otherwise we would have to jump backwards to the REDO location after
     166              :  * reading the checkpoint record, because the REDO record can precede the
     167              :  * checkpoint record.
     168              :  */
     169              : static XLogRecPtr CheckPointLoc = InvalidXLogRecPtr;
     170              : static TimeLineID CheckPointTLI = 0;
     171              : static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
     172              : static TimeLineID RedoStartTLI = 0;
     173              : 
     174              : /*
     175              :  * Local copy of SharedHotStandbyActive variable. False actually means "not
     176              :  * known, need to check the shared state".
     177              :  */
     178              : static bool LocalHotStandbyActive = false;
     179              : 
     180              : /*
     181              :  * Local copy of SharedPromoteIsTriggered variable. False actually means "not
     182              :  * known, need to check the shared state".
     183              :  */
     184              : static bool LocalPromoteIsTriggered = false;
     185              : 
     186              : /* Has the recovery code requested a walreceiver wakeup? */
     187              : static bool doRequestWalReceiverReply;
     188              : 
     189              : /* XLogReader object used to parse the WAL records */
     190              : static XLogReaderState *xlogreader = NULL;
     191              : 
     192              : /* XLogPrefetcher object used to consume WAL records with read-ahead */
     193              : static XLogPrefetcher *xlogprefetcher = NULL;
     194              : 
     195              : /* Parameters passed down from ReadRecord to the XLogPageRead callback. */
     196              : typedef struct XLogPageReadPrivate
     197              : {
     198              :     int         emode;
     199              :     bool        fetching_ckpt;  /* are we fetching a checkpoint record? */
     200              :     bool        randAccess;
     201              :     TimeLineID  replayTLI;
     202              : } XLogPageReadPrivate;
     203              : 
     204              : /* flag to tell XLogPageRead that we have started replaying */
     205              : static bool InRedo = false;
     206              : 
     207              : /*
     208              :  * Codes indicating where we got a WAL file from during recovery, or where
     209              :  * to attempt to get one.
     210              :  */
     211              : typedef enum
     212              : {
     213              :     XLOG_FROM_ANY = 0,          /* request to read WAL from any source */
     214              :     XLOG_FROM_ARCHIVE,          /* restored using restore_command */
     215              :     XLOG_FROM_PG_WAL,           /* existing file in pg_wal */
     216              :     XLOG_FROM_STREAM,           /* streamed from primary */
     217              : } XLogSource;
     218              : 
     219              : /* human-readable names for XLogSources, for debugging output */
     220              : static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
     221              : 
     222              : /*
     223              :  * readFile is -1 or a kernel FD for the log file segment that's currently
     224              :  * open for reading.  readSegNo identifies the segment.  readOff is the offset
     225              :  * of the page just read, readLen indicates how much of it has been read into
     226              :  * readBuf, and readSource indicates where we got the currently open file from.
     227              :  *
     228              :  * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
     229              :  * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
     230              :  * worthwhile, since the XLOG is not read by general-purpose sessions.
     231              :  */
     232              : static int  readFile = -1;
     233              : static XLogSegNo readSegNo = 0;
     234              : static uint32 readOff = 0;
     235              : static uint32 readLen = 0;
     236              : static XLogSource readSource = XLOG_FROM_ANY;
     237              : 
     238              : /*
     239              :  * Keeps track of which source we're currently reading from. This is
     240              :  * different from readSource in that this is always set, even when we don't
     241              :  * currently have a WAL file open. If lastSourceFailed is set, our last
     242              :  * attempt to read from currentSource failed, and we should try another source
     243              :  * next.
     244              :  *
     245              :  * pendingWalRcvRestart is set when a config change occurs that requires a
     246              :  * walreceiver restart.  This is only valid in XLOG_FROM_STREAM state.
     247              :  */
     248              : static XLogSource currentSource = XLOG_FROM_ANY;
     249              : static bool lastSourceFailed = false;
     250              : static bool pendingWalRcvRestart = false;
     251              : 
     252              : /*
     253              :  * These variables track when we last obtained some WAL data to process,
     254              :  * and where we got it from.  (XLogReceiptSource is initially the same as
     255              :  * readSource, but readSource gets reset to zero when we don't have data
     256              :  * to process right now.  It is also different from currentSource, which
     257              :  * also changes when we try to read from a source and fail, while
     258              :  * XLogReceiptSource tracks where we last successfully read some WAL.)
     259              :  */
     260              : static TimestampTz XLogReceiptTime = 0;
     261              : static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
     262              : 
     263              : /* Local copy of WalRcv->flushedUpto */
     264              : static XLogRecPtr flushedUpto = InvalidXLogRecPtr;
     265              : static TimeLineID receiveTLI = 0;
     266              : 
     267              : /*
     268              :  * Copy of minRecoveryPoint and backupEndPoint from the control file.
     269              :  *
     270              :  * In order to reach consistency, we must replay the WAL up to
     271              :  * minRecoveryPoint.  If backupEndRequired is true, we must also reach
     272              :  * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
     273              :  * to backupStartPoint.
     274              :  *
     275              :  * Note: In archive recovery, after consistency has been reached, the
     276              :  * functions in xlog.c will start updating minRecoveryPoint in the control
     277              :  * file.  But this copy of minRecoveryPoint variable reflects the value at the
     278              :  * beginning of recovery, and is *not* updated after consistency is reached.
     279              :  */
     280              : static XLogRecPtr minRecoveryPoint;
     281              : static TimeLineID minRecoveryPointTLI;
     282              : 
     283              : static XLogRecPtr backupStartPoint;
     284              : static XLogRecPtr backupEndPoint;
     285              : static bool backupEndRequired = false;
     286              : 
     287              : /*
     288              :  * Have we reached a consistent database state?  In crash recovery, we have
     289              :  * to replay all the WAL, so reachedConsistency is never set.  During archive
     290              :  * recovery, the database is consistent once minRecoveryPoint is reached.
     291              :  *
     292              :  * Consistent state means that the system is internally consistent, all
     293              :  * the WAL has been replayed up to a certain point, and importantly, there
     294              :  * is no trace of later actions on disk.
     295              :  *
     296              :  * This flag is used only by the startup process and postmaster. When
     297              :  * minRecoveryPoint is reached, the startup process sets it to true and
     298              :  * sends a PMSIGNAL_RECOVERY_CONSISTENT signal to the postmaster,
     299              :  * which then sets it to true upon receiving the signal.
     300              :  */
     301              : bool        reachedConsistency = false;
     302              : 
     303              : /* Buffers dedicated to consistency checks of size BLCKSZ */
     304              : static char *replay_image_masked = NULL;
     305              : static char *primary_image_masked = NULL;
     306              : 
     307              : 
     308              : /*
     309              :  * Shared-memory state for WAL recovery.
     310              :  */
     311              : typedef struct XLogRecoveryCtlData
     312              : {
     313              :     /*
     314              :      * SharedHotStandbyActive indicates if we allow hot standby queries to be
     315              :      * run.  Protected by info_lck.
     316              :      */
     317              :     bool        SharedHotStandbyActive;
     318              : 
     319              :     /*
     320              :      * SharedPromoteIsTriggered indicates if a standby promotion has been
     321              :      * triggered.  Protected by info_lck.
     322              :      */
     323              :     bool        SharedPromoteIsTriggered;
     324              : 
     325              :     /*
     326              :      * recoveryWakeupLatch is used to wake up the startup process to continue
     327              :      * WAL replay, if it is waiting for WAL to arrive or promotion to be
     328              :      * requested.
     329              :      *
     330              :      * Note that the startup process also uses another latch, its procLatch,
     331              :      * to wait for recovery conflict. We could get rid of recoveryWakeupLatch
     332              :      * for signaling the startup process in favor of using its procLatch, which
     333              :      * would comport better with possible generic signal handlers using that
     334              :      * latch. But we should not do that because the startup process doesn't
     335              :      * assume that it's woken up by the walreceiver process or SIGHUP signal
     336              :      * handler while it's waiting for recovery conflict. The separate latches,
     337              :      * recoveryWakeupLatch and procLatch, should be used for inter-process
     338              :      * communication for WAL replay and recovery conflict, respectively.
     339              :      */
     340              :     Latch       recoveryWakeupLatch;
     341              : 
     342              :     /*
     343              :      * Last record successfully replayed.
     344              :      */
     345              :     XLogRecPtr  lastReplayedReadRecPtr; /* start position */
     346              :     XLogRecPtr  lastReplayedEndRecPtr;  /* end+1 position */
     347              :     TimeLineID  lastReplayedTLI;    /* timeline */
     348              : 
     349              :     /*
     350              :      * When we're currently replaying a record, ie. in a redo function,
     351              :      * replayEndRecPtr points to the end+1 of the record being replayed,
     352              :      * otherwise it's equal to lastReplayedEndRecPtr.
     353              :      */
     354              :     XLogRecPtr  replayEndRecPtr;
     355              :     TimeLineID  replayEndTLI;
     356              :     /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
     357              :     TimestampTz recoveryLastXTime;
     358              : 
     359              :     /*
     360              :      * timestamp of when we started replaying the current chunk of WAL data,
     361              :      * only relevant for replication or archive recovery
     362              :      */
     363              :     TimestampTz currentChunkStartTime;
     364              :     /* Recovery pause state */
     365              :     RecoveryPauseState recoveryPauseState;
     366              :     ConditionVariable recoveryNotPausedCV;
     367              : 
     368              :     slock_t     info_lck;       /* locks shared variables shown above */
     369              : } XLogRecoveryCtlData;
     370              : 
     371              : static XLogRecoveryCtlData *XLogRecoveryCtl = NULL;
     372              : 
     373              : /*
     374              :  * abortedRecPtr is the start pointer of a broken record at end of WAL when
     375              :  * recovery completes; missingContrecPtr is the location of the first
     376              :  * contrecord that went missing.  See CreateOverwriteContrecordRecord for
     377              :  * details.
     378              :  */
     379              : static XLogRecPtr abortedRecPtr;
     380              : static XLogRecPtr missingContrecPtr;
     381              : 
     382              : /*
     383              :  * if recoveryStopsBefore/After returns true, it saves information of the stop
     384              :  * point here
     385              :  */
     386              : static TransactionId recoveryStopXid;
     387              : static TimestampTz recoveryStopTime;
     388              : static XLogRecPtr recoveryStopLSN;
     389              : static char recoveryStopName[MAXFNAMELEN];
     390              : static bool recoveryStopAfter;
     391              : 
     392              : /* prototypes for local functions */
     393              : static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
     394              : 
     395              : static void EnableStandbyMode(void);
     396              : static void readRecoverySignalFile(void);
     397              : static void validateRecoveryParameters(void);
     398              : static bool read_backup_label(XLogRecPtr *checkPointLoc,
     399              :                               TimeLineID *backupLabelTLI,
     400              :                               bool *backupEndRequired, bool *backupFromStandby);
     401              : static bool read_tablespace_map(List **tablespaces);
     402              : 
     403              : static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
     404              : static void CheckRecoveryConsistency(void);
     405              : static void rm_redo_error_callback(void *arg);
     406              : #ifdef WAL_DEBUG
     407              : static void xlog_outrec(StringInfo buf, XLogReaderState *record);
     408              : #endif
     409              : static void xlog_block_info(StringInfo buf, XLogReaderState *record);
     410              : static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
     411              :                                 TimeLineID prevTLI, TimeLineID replayTLI);
     412              : static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
     413              : static void verifyBackupPageConsistency(XLogReaderState *record);
     414              : 
     415              : static bool recoveryStopsBefore(XLogReaderState *record);
     416              : static bool recoveryStopsAfter(XLogReaderState *record);
     417              : static char *getRecoveryStopReason(void);
     418              : static void recoveryPausesHere(bool endOfRecovery);
     419              : static bool recoveryApplyDelay(XLogReaderState *record);
     420              : static void ConfirmRecoveryPaused(void);
     421              : 
     422              : static XLogRecord *ReadRecord(XLogPrefetcher *xlogprefetcher,
     423              :                               int emode, bool fetching_ckpt,
     424              :                               TimeLineID replayTLI);
     425              : 
     426              : static int  XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
     427              :                          int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
     428              : static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,
     429              :                                                       bool randAccess,
     430              :                                                       bool fetching_ckpt,
     431              :                                                       XLogRecPtr tliRecPtr,
     432              :                                                       TimeLineID replayTLI,
     433              :                                                       XLogRecPtr replayLSN,
     434              :                                                       bool nonblocking);
     435              : static int  emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
     436              : static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher,
     437              :                                         XLogRecPtr RecPtr, TimeLineID replayTLI);
     438              : static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
     439              : static int  XLogFileRead(XLogSegNo segno, TimeLineID tli,
     440              :                          XLogSource source, bool notfoundOk);
     441              : static int  XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source);
     442              : 
     443              : static bool CheckForStandbyTrigger(void);
     444              : static void SetPromoteIsTriggered(void);
     445              : static bool HotStandbyActiveInReplay(void);
     446              : 
     447              : static void SetCurrentChunkStartTime(TimestampTz xtime);
     448              : static void SetLatestXTime(TimestampTz xtime);
     449              : 
     450              : /*
     451              :  * Initialization of shared memory for WAL recovery
     452              :  */
     453              : Size
     454         3297 : XLogRecoveryShmemSize(void)
     455              : {
     456              :     Size        size;
     457              : 
     458              :     /* XLogRecoveryCtl */
     459         3297 :     size = sizeof(XLogRecoveryCtlData);
     460              : 
     461         3297 :     return size;
     462              : }
     463              : 
     464              : void
     465         1150 : XLogRecoveryShmemInit(void)
     466              : {
     467              :     bool        found;
     468              : 
     469         1150 :     XLogRecoveryCtl = (XLogRecoveryCtlData *)
     470         1150 :         ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
     471         1150 :     if (found)
     472            0 :         return;
     473         1150 :     memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
     474              : 
     475         1150 :     SpinLockInit(&XLogRecoveryCtl->info_lck);
     476         1150 :     InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
     477         1150 :     ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV);
     478              : }
     479              : 
     480              : /*
     481              :  * A thin wrapper to enable StandbyMode and do other preparatory work as
     482              :  * needed.
     483              :  */
     484              : static void
     485          113 : EnableStandbyMode(void)
     486              : {
     487          113 :     StandbyMode = true;
     488              : 
     489              :     /*
     490              :      * To avoid server log bloat, we don't report recovery progress in a
     491              :      * standby as it will always be in recovery unless promoted. We disable
     492              :      * startup progress timeout in standby mode to avoid calling
     493              :      * startup_progress_timeout_handler() unnecessarily.
     494              :      */
     495          113 :     disable_startup_progress_timeout();
     496          113 : }
     497              : 
     498              : /*
     499              :  * Prepare the system for WAL recovery, if needed.
     500              :  *
     501              :  * This is called by StartupXLOG() which coordinates the server startup
     502              :  * sequence.  This function analyzes the control file and the backup label
     503              :  * file, if any, and figures out whether we need to perform crash recovery or
     504              :  * archive recovery, and how far we need to replay the WAL to reach a
     505              :  * consistent state.
     506              :  *
     507              :  * This doesn't yet change the on-disk state, except for creating the symlinks
     508              :  * from table space map file if any, and for fetching WAL files needed to find
     509              :  * the checkpoint record.  On entry, the caller has already read the control
     510              :  * file into memory, and passes it as argument.  This function updates it to
     511              :  * reflect the recovery state, and the caller is expected to write it back to
     512              :  * disk after initializing other subsystems, but before calling
     513              :  * PerformWalRecovery().
     514              :  *
     515              :  * This initializes some global variables such as ArchiveRecoveryRequested,
     516              :  * StandbyModeRequested and InRecovery.
     517              :  */
     518              : void
     519         1006 : InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
     520              :                 bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
     521              : {
     522              :     XLogPageReadPrivate *private;
     523              :     struct stat st;
     524              :     bool        wasShutdown;
     525              :     XLogRecord *record;
     526              :     DBState     dbstate_at_startup;
     527         1006 :     bool        haveTblspcMap = false;
     528         1006 :     bool        haveBackupLabel = false;
     529              :     CheckPoint  checkPoint;
     530         1006 :     bool        backupFromStandby = false;
     531              : 
     532         1006 :     dbstate_at_startup = ControlFile->state;
     533              : 
     534              :     /*
     535              :      * Initialize on the assumption we want to recover to the latest timeline
     536              :      * that's active according to pg_control.
     537              :      */
     538         1006 :     if (ControlFile->minRecoveryPointTLI >
     539         1006 :         ControlFile->checkPointCopy.ThisTimeLineID)
     540            2 :         recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
     541              :     else
     542         1004 :         recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
     543              : 
     544              :     /*
     545              :      * Check for signal files, and if so set up state for offline recovery
     546              :      */
     547         1006 :     readRecoverySignalFile();
     548         1006 :     validateRecoveryParameters();
     549              : 
     550              :     /*
     551              :      * Take ownership of the wakeup latch if we're going to sleep during
     552              :      * recovery, if required.
     553              :      */
     554         1006 :     if (ArchiveRecoveryRequested)
     555          118 :         OwnLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
     556              : 
     557              :     /*
     558              :      * Set the WAL reading processor now, as it will be needed when reading
     559              :      * the checkpoint record required (backup_label or not).
     560              :      */
     561         1006 :     private = palloc0_object(XLogPageReadPrivate);
     562         1006 :     xlogreader =
     563         1006 :         XLogReaderAllocate(wal_segment_size, NULL,
     564         1006 :                            XL_ROUTINE(.page_read = &XLogPageRead,
     565              :                                       .segment_open = NULL,
     566              :                                       .segment_close = wal_segment_close),
     567              :                            private);
     568         1006 :     if (!xlogreader)
     569            0 :         ereport(ERROR,
     570              :                 (errcode(ERRCODE_OUT_OF_MEMORY),
     571              :                  errmsg("out of memory"),
     572              :                  errdetail("Failed while allocating a WAL reading processor.")));
     573         1006 :     xlogreader->system_identifier = ControlFile->system_identifier;
     574              : 
     575              :     /*
     576              :      * Set the WAL decode buffer size.  This limits how far ahead we can read
     577              :      * in the WAL.
     578              :      */
     579         1006 :     XLogReaderSetDecodeBuffer(xlogreader, NULL, wal_decode_buffer_size);
     580              : 
     581              :     /* Create a WAL prefetcher. */
     582         1006 :     xlogprefetcher = XLogPrefetcherAllocate(xlogreader);
     583              : 
     584              :     /*
     585              :      * Allocate two page buffers dedicated to WAL consistency checks.  We do
     586              :      * it this way, rather than just making static arrays, for two reasons:
     587              :      * (1) no need to waste the storage in most instantiations of the backend;
     588              :      * (2) a static char array isn't guaranteed to have any particular
     589              :      * alignment, whereas palloc() will provide MAXALIGN'd storage.
     590              :      */
     591         1006 :     replay_image_masked = (char *) palloc(BLCKSZ);
     592         1006 :     primary_image_masked = (char *) palloc(BLCKSZ);
     593              : 
     594              :     /*
     595              :      * Read the backup_label file.  We want to run this part of the recovery
     596              :      * process after checking for signal files and after performing validation
     597              :      * of the recovery parameters.
     598              :      */
     599         1006 :     if (read_backup_label(&CheckPointLoc, &CheckPointTLI, &backupEndRequired,
     600              :                           &backupFromStandby))
     601              :     {
     602           81 :         List       *tablespaces = NIL;
     603              : 
     604              :         /*
     605              :          * Archive recovery was requested, and thanks to the backup label
     606              :          * file, we know how far we need to replay to reach consistency. Enter
     607              :          * archive recovery directly.
     608              :          */
     609           81 :         InArchiveRecovery = true;
     610           81 :         if (StandbyModeRequested)
     611           69 :             EnableStandbyMode();
     612              : 
     613              :         /*
     614              :          * Omitting backup_label when creating a new replica, PITR node etc.
     615              :          * unfortunately is a common cause of corruption.  Logging that
     616              :          * backup_label was used makes it a bit easier to exclude that as the
     617              :          * cause of observed corruption.
     618              :          *
     619              :          * Do so before we try to read the checkpoint record (which can fail),
     620              :          * as otherwise it can be hard to understand why a checkpoint other
     621              :          * than ControlFile->checkPoint is used.
     622              :          */
     623           81 :         ereport(LOG,
     624              :                 errmsg("starting backup recovery with redo LSN %X/%08X, checkpoint LSN %X/%08X, on timeline ID %u",
     625              :                        LSN_FORMAT_ARGS(RedoStartLSN),
     626              :                        LSN_FORMAT_ARGS(CheckPointLoc),
     627              :                        CheckPointTLI));
     628              : 
     629              :         /*
     630              :          * When a backup_label file is present, we want to roll forward from
     631              :          * the checkpoint it identifies, rather than using pg_control.
     632              :          */
     633           81 :         record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
     634              :                                       CheckPointTLI);
     635           81 :         if (record != NULL)
     636              :         {
     637           81 :             memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
     638           81 :             wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
     639           81 :             ereport(DEBUG1,
     640              :                     errmsg_internal("checkpoint record is at %X/%08X",
     641              :                                     LSN_FORMAT_ARGS(CheckPointLoc)));
     642           81 :             InRecovery = true;  /* force recovery even if SHUTDOWNED */
     643              : 
     644              :             /*
     645              :              * Make sure that REDO location exists. This may not be the case
     646              :              * if there was a crash during an online backup, which left a
     647              :              * backup_label around that references a WAL segment that's
     648              :              * already been archived.
     649              :              */
     650           81 :             if (checkPoint.redo < CheckPointLoc)
     651              :             {
     652           81 :                 XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
     653           81 :                 if (!ReadRecord(xlogprefetcher, LOG, false,
     654              :                                 checkPoint.ThisTimeLineID))
     655            0 :                     ereport(FATAL,
     656              :                             errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
     657              :                                    LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)),
     658              :                             errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
     659              :                                     "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
     660              :                                     "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
     661              :                                     DataDir, DataDir, DataDir, DataDir));
     662              :             }
     663              :         }
     664              :         else
     665              :         {
     666            0 :             ereport(FATAL,
     667              :                     errmsg("could not locate required checkpoint record at %X/%08X",
     668              :                            LSN_FORMAT_ARGS(CheckPointLoc)),
     669              :                     errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
     670              :                             "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
     671              :                             "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
     672              :                             DataDir, DataDir, DataDir, DataDir));
     673              :             wasShutdown = false;    /* keep compiler quiet */
     674              :         }
     675              : 
     676              :         /* Read the tablespace_map file if present and create symlinks. */
     677           81 :         if (read_tablespace_map(&tablespaces))
     678              :         {
     679              :             ListCell   *lc;
     680              : 
     681            4 :             foreach(lc, tablespaces)
     682              :             {
     683            2 :                 tablespaceinfo *ti = lfirst(lc);
     684              :                 char       *linkloc;
     685              : 
     686            2 :                 linkloc = psprintf("%s/%u", PG_TBLSPC_DIR, ti->oid);
     687              : 
     688              :                 /*
     689              :                  * Remove the existing symlink if any and Create the symlink
     690              :                  * under PGDATA.
     691              :                  */
     692            2 :                 remove_tablespace_symlink(linkloc);
     693              : 
     694            2 :                 if (symlink(ti->path, linkloc) < 0)
     695            0 :                     ereport(ERROR,
     696              :                             (errcode_for_file_access(),
     697              :                              errmsg("could not create symbolic link \"%s\": %m",
     698              :                                     linkloc)));
     699              : 
     700            2 :                 pfree(ti->path);
     701            2 :                 pfree(ti);
     702              :             }
     703              : 
     704              :             /* tell the caller to delete it later */
     705            2 :             haveTblspcMap = true;
     706              :         }
     707              : 
     708              :         /* tell the caller to delete it later */
     709           81 :         haveBackupLabel = true;
     710              :     }
     711              :     else
     712              :     {
     713              :         /* No backup_label file has been found if we are here. */
     714              : 
     715              :         /*
     716              :          * If tablespace_map file is present without backup_label file, there
     717              :          * is no use of such file.  There is no harm in retaining it, but it
     718              :          * is better to get rid of the map file so that we don't have any
     719              :          * redundant file in data directory and it will avoid any sort of
     720              :          * confusion.  It seems prudent though to just rename the file out of
     721              :          * the way rather than delete it completely, also we ignore any error
     722              :          * that occurs in rename operation as even if map file is present
     723              :          * without backup_label file, it is harmless.
     724              :          */
     725          925 :         if (stat(TABLESPACE_MAP, &st) == 0)
     726              :         {
     727            1 :             unlink(TABLESPACE_MAP_OLD);
     728            1 :             if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
     729            1 :                 ereport(LOG,
     730              :                         (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
     731              :                                 TABLESPACE_MAP, BACKUP_LABEL_FILE),
     732              :                          errdetail("File \"%s\" was renamed to \"%s\".",
     733              :                                    TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
     734              :             else
     735            0 :                 ereport(LOG,
     736              :                         (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
     737              :                                 TABLESPACE_MAP, BACKUP_LABEL_FILE),
     738              :                          errdetail("Could not rename file \"%s\" to \"%s\": %m.",
     739              :                                    TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
     740              :         }
     741              : 
     742              :         /*
     743              :          * It's possible that archive recovery was requested, but we don't
     744              :          * know how far we need to replay the WAL before we reach consistency.
     745              :          * This can happen for example if a base backup is taken from a
     746              :          * running server using an atomic filesystem snapshot, without calling
     747              :          * pg_backup_start/stop. Or if you just kill a running primary server
     748              :          * and put it into archive recovery by creating a recovery signal
     749              :          * file.
     750              :          *
     751              :          * Our strategy in that case is to perform crash recovery first,
     752              :          * replaying all the WAL present in pg_wal, and only enter archive
     753              :          * recovery after that.
     754              :          *
     755              :          * But usually we already know how far we need to replay the WAL (up
     756              :          * to minRecoveryPoint, up to backupEndPoint, or until we see an
     757              :          * end-of-backup record), and we can enter archive recovery directly.
     758              :          */
     759          925 :         if (ArchiveRecoveryRequested &&
     760           44 :             (XLogRecPtrIsValid(ControlFile->minRecoveryPoint) ||
     761            9 :              ControlFile->backupEndRequired ||
     762            9 :              XLogRecPtrIsValid(ControlFile->backupEndPoint) ||
     763            9 :              ControlFile->state == DB_SHUTDOWNED))
     764              :         {
     765           42 :             InArchiveRecovery = true;
     766           42 :             if (StandbyModeRequested)
     767           42 :                 EnableStandbyMode();
     768              :         }
     769              : 
     770              :         /*
     771              :          * For the same reason as when starting up with backup_label present,
     772              :          * emit a log message when we continue initializing from a base
     773              :          * backup.
     774              :          */
     775          925 :         if (XLogRecPtrIsValid(ControlFile->backupStartPoint))
     776            0 :             ereport(LOG,
     777              :                     errmsg("restarting backup recovery with redo LSN %X/%08X",
     778              :                            LSN_FORMAT_ARGS(ControlFile->backupStartPoint)));
     779              : 
     780              :         /* Get the last valid checkpoint record. */
     781          925 :         CheckPointLoc = ControlFile->checkPoint;
     782          925 :         CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID;
     783          925 :         RedoStartLSN = ControlFile->checkPointCopy.redo;
     784          925 :         RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID;
     785          925 :         record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
     786              :                                       CheckPointTLI);
     787          925 :         if (record != NULL)
     788              :         {
     789          925 :             ereport(DEBUG1,
     790              :                     errmsg_internal("checkpoint record is at %X/%08X",
     791              :                                     LSN_FORMAT_ARGS(CheckPointLoc)));
     792              :         }
     793              :         else
     794              :         {
     795              :             /*
     796              :              * We used to attempt to go back to a secondary checkpoint record
     797              :              * here, but only when not in standby mode. We now just fail if we
     798              :              * can't read the last checkpoint because this allows us to
     799              :              * simplify processing around checkpoints.
     800              :              */
     801            0 :             ereport(PANIC,
     802              :                     errmsg("could not locate a valid checkpoint record at %X/%08X",
     803              :                            LSN_FORMAT_ARGS(CheckPointLoc)));
     804              :         }
     805          925 :         memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
     806          925 :         wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
     807              : 
     808              :         /* Make sure that REDO location exists. */
     809          925 :         if (checkPoint.redo < CheckPointLoc)
     810              :         {
     811           44 :             XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
     812           44 :             if (!ReadRecord(xlogprefetcher, LOG, false, checkPoint.ThisTimeLineID))
     813            1 :                 ereport(FATAL,
     814              :                         errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
     815              :                                LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)));
     816              :         }
     817              :     }
     818              : 
     819         1005 :     if (ArchiveRecoveryRequested)
     820              :     {
     821          118 :         if (StandbyModeRequested)
     822          113 :             ereport(LOG,
     823              :                     (errmsg("entering standby mode")));
     824            5 :         else if (recoveryTarget == RECOVERY_TARGET_XID)
     825            0 :             ereport(LOG,
     826              :                     (errmsg("starting point-in-time recovery to XID %u",
     827              :                             recoveryTargetXid)));
     828            5 :         else if (recoveryTarget == RECOVERY_TARGET_TIME)
     829            0 :             ereport(LOG,
     830              :                     (errmsg("starting point-in-time recovery to %s",
     831              :                             timestamptz_to_str(recoveryTargetTime))));
     832            5 :         else if (recoveryTarget == RECOVERY_TARGET_NAME)
     833            3 :             ereport(LOG,
     834              :                     (errmsg("starting point-in-time recovery to \"%s\"",
     835              :                             recoveryTargetName)));
     836            2 :         else if (recoveryTarget == RECOVERY_TARGET_LSN)
     837            0 :             ereport(LOG,
     838              :                     errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%08X\"",
     839              :                            LSN_FORMAT_ARGS(recoveryTargetLSN)));
     840            2 :         else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
     841            0 :             ereport(LOG,
     842              :                     (errmsg("starting point-in-time recovery to earliest consistent point")));
     843              :         else
     844            2 :             ereport(LOG,
     845              :                     (errmsg("starting archive recovery")));
     846              :     }
     847              : 
     848              :     /*
     849              :      * If the location of the checkpoint record is not on the expected
     850              :      * timeline in the history of the requested timeline, we cannot proceed:
     851              :      * the backup is not part of the history of the requested timeline.
     852              :      */
     853              :     Assert(expectedTLEs);       /* was initialized by reading checkpoint
     854              :                                  * record */
     855         1005 :     if (tliOfPointInHistory(CheckPointLoc, expectedTLEs) !=
     856              :         CheckPointTLI)
     857              :     {
     858              :         XLogRecPtr  switchpoint;
     859              : 
     860              :         /*
     861              :          * tliSwitchPoint will throw an error if the checkpoint's timeline is
     862              :          * not in expectedTLEs at all.
     863              :          */
     864            0 :         switchpoint = tliSwitchPoint(CheckPointTLI, expectedTLEs, NULL);
     865            0 :         ereport(FATAL,
     866              :                 (errmsg("requested timeline %u is not a child of this server's history",
     867              :                         recoveryTargetTLI),
     868              :         /* translator: %s is a backup_label file or a pg_control file */
     869              :                  errdetail("Latest checkpoint in file \"%s\" is at %X/%08X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%08X.",
     870              :                            haveBackupLabel ? "backup_label" : "pg_control",
     871              :                            LSN_FORMAT_ARGS(CheckPointLoc),
     872              :                            CheckPointTLI,
     873              :                            LSN_FORMAT_ARGS(switchpoint))));
     874              :     }
     875              : 
     876              :     /*
     877              :      * The min recovery point should be part of the requested timeline's
     878              :      * history, too.
     879              :      */
     880         1005 :     if (XLogRecPtrIsValid(ControlFile->minRecoveryPoint) &&
     881           42 :         tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
     882           42 :         ControlFile->minRecoveryPointTLI)
     883            0 :         ereport(FATAL,
     884              :                 errmsg("requested timeline %u does not contain minimum recovery point %X/%08X on timeline %u",
     885              :                        recoveryTargetTLI,
     886              :                        LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
     887              :                        ControlFile->minRecoveryPointTLI));
     888              : 
     889         1005 :     ereport(DEBUG1,
     890              :             errmsg_internal("redo record is at %X/%08X; shutdown %s",
     891              :                             LSN_FORMAT_ARGS(checkPoint.redo),
     892              :                             wasShutdown ? "true" : "false"));
     893         1005 :     ereport(DEBUG1,
     894              :             (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
     895              :                              U64FromFullTransactionId(checkPoint.nextXid),
     896              :                              checkPoint.nextOid)));
     897         1005 :     ereport(DEBUG1,
     898              :             (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %" PRIu64,
     899              :                              checkPoint.nextMulti, checkPoint.nextMultiOffset)));
     900         1005 :     ereport(DEBUG1,
     901              :             (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
     902              :                              checkPoint.oldestXid, checkPoint.oldestXidDB)));
     903         1005 :     ereport(DEBUG1,
     904              :             (errmsg_internal("oldest MultiXactId: %u, in database %u",
     905              :                              checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
     906         1005 :     ereport(DEBUG1,
     907              :             (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
     908              :                              checkPoint.oldestCommitTsXid,
     909              :                              checkPoint.newestCommitTsXid)));
     910         1005 :     if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid)))
     911            0 :         ereport(PANIC,
     912              :                 (errmsg("invalid next transaction ID")));
     913              : 
     914              :     /* sanity check */
     915         1005 :     if (checkPoint.redo > CheckPointLoc)
     916            0 :         ereport(PANIC,
     917              :                 (errmsg("invalid redo in checkpoint record")));
     918              : 
     919              :     /*
     920              :      * Check whether we need to force recovery from WAL.  If it appears to
     921              :      * have been a clean shutdown and we did not have a recovery signal file,
     922              :      * then assume no recovery needed.
     923              :      */
     924         1005 :     if (checkPoint.redo < CheckPointLoc)
     925              :     {
     926          124 :         if (wasShutdown)
     927            0 :             ereport(PANIC,
     928              :                     (errmsg("invalid redo record in shutdown checkpoint")));
     929          124 :         InRecovery = true;
     930              :     }
     931          881 :     else if (ControlFile->state != DB_SHUTDOWNED)
     932           95 :         InRecovery = true;
     933          786 :     else if (ArchiveRecoveryRequested)
     934              :     {
     935              :         /* force recovery due to presence of recovery signal file */
     936            7 :         InRecovery = true;
     937              :     }
     938              : 
     939              :     /*
     940              :      * If recovery is needed, update our in-memory copy of pg_control to show
     941              :      * that we are recovering and to show the selected checkpoint as the place
     942              :      * we are starting from. We also mark pg_control with any minimum recovery
     943              :      * stop point obtained from a backup history file.
     944              :      *
     945              :      * We don't write the changes to disk yet, though. Only do that after
     946              :      * initializing various subsystems.
     947              :      */
     948         1005 :     if (InRecovery)
     949              :     {
     950          226 :         if (InArchiveRecovery)
     951              :         {
     952          123 :             ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
     953              :         }
     954              :         else
     955              :         {
     956          103 :             ereport(LOG,
     957              :                     (errmsg("database system was not properly shut down; "
     958              :                             "automatic recovery in progress")));
     959          103 :             if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
     960            2 :                 ereport(LOG,
     961              :                         (errmsg("crash recovery starts in timeline %u "
     962              :                                 "and has target timeline %u",
     963              :                                 ControlFile->checkPointCopy.ThisTimeLineID,
     964              :                                 recoveryTargetTLI)));
     965          103 :             ControlFile->state = DB_IN_CRASH_RECOVERY;
     966              :         }
     967          226 :         ControlFile->checkPoint = CheckPointLoc;
     968          226 :         ControlFile->checkPointCopy = checkPoint;
     969          226 :         if (InArchiveRecovery)
     970              :         {
     971              :             /* initialize minRecoveryPoint if not set yet */
     972          123 :             if (ControlFile->minRecoveryPoint < checkPoint.redo)
     973              :             {
     974           83 :                 ControlFile->minRecoveryPoint = checkPoint.redo;
     975           83 :                 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
     976              :             }
     977              :         }
     978              : 
     979              :         /*
     980              :          * Set backupStartPoint if we're starting recovery from a base backup.
     981              :          *
     982              :          * Also set backupEndPoint and use minRecoveryPoint as the backup end
     983              :          * location if we're starting recovery from a base backup which was
     984              :          * taken from a standby. In this case, the database system status in
     985              :          * pg_control must indicate that the database was already in recovery.
     986              :          * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
     987              :          * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
     988              :          * before reaching this point; e.g. because restore_command or
     989              :          * primary_conninfo were faulty.
     990              :          *
     991              :          * Any other state indicates that the backup somehow became corrupted
     992              :          * and we can't sensibly continue with recovery.
     993              :          */
     994          226 :         if (haveBackupLabel)
     995              :         {
     996           81 :             ControlFile->backupStartPoint = checkPoint.redo;
     997           81 :             ControlFile->backupEndRequired = backupEndRequired;
     998              : 
     999           81 :             if (backupFromStandby)
    1000              :             {
    1001            5 :                 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
    1002              :                     dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
    1003            0 :                     ereport(FATAL,
    1004              :                             (errmsg("backup_label contains data inconsistent with control file"),
    1005              :                              errhint("This means that the backup is corrupted and you will "
    1006              :                                      "have to use another backup for recovery.")));
    1007            5 :                 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
    1008              :             }
    1009              :         }
    1010              :     }
    1011              : 
    1012              :     /* remember these, so that we know when we have reached consistency */
    1013         1005 :     backupStartPoint = ControlFile->backupStartPoint;
    1014         1005 :     backupEndRequired = ControlFile->backupEndRequired;
    1015         1005 :     backupEndPoint = ControlFile->backupEndPoint;
    1016         1005 :     if (InArchiveRecovery)
    1017              :     {
    1018          123 :         minRecoveryPoint = ControlFile->minRecoveryPoint;
    1019          123 :         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    1020              :     }
    1021              :     else
    1022              :     {
    1023          882 :         minRecoveryPoint = InvalidXLogRecPtr;
    1024          882 :         minRecoveryPointTLI = 0;
    1025              :     }
    1026              : 
    1027              :     /*
    1028              :      * Start recovery assuming that the final record isn't lost.
    1029              :      */
    1030         1005 :     abortedRecPtr = InvalidXLogRecPtr;
    1031         1005 :     missingContrecPtr = InvalidXLogRecPtr;
    1032              : 
    1033         1005 :     *wasShutdown_ptr = wasShutdown;
    1034         1005 :     *haveBackupLabel_ptr = haveBackupLabel;
    1035         1005 :     *haveTblspcMap_ptr = haveTblspcMap;
    1036         1005 : }
    1037              : 
    1038              : /*
    1039              :  * See if there are any recovery signal files and if so, set state for
    1040              :  * recovery.
    1041              :  *
    1042              :  * See if there is a recovery command file (recovery.conf), and if so
    1043              :  * report FATAL, since as of PG12 we no longer recognize that.
                      :  *
                      :  * Nothing is checked in bootstrap processing mode.  Otherwise, on return
                      :  * StandbyModeRequested and ArchiveRecoveryRequested reflect the signal
                      :  * files found: STANDBY_SIGNAL_FILE requests both, RECOVERY_SIGNAL_FILE
                      :  * alone requests only archive recovery, and neither leaves both false.
                      :  * Requesting standby mode when !IsUnderPostmaster is reported as FATAL.
    1044              :  */
    1045              : static void
    1046         1006 : readRecoverySignalFile(void)
    1047              : {
    1048              :     struct stat stat_buf;
    1049              : 
    1050         1006 :     if (IsBootstrapProcessingMode())
    1051          888 :         return;
    1052              : 
    1053              :     /*
    1054              :      * Check for old recovery API file: recovery.conf
    1055              :      */
    1056          955 :     if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
    1057            0 :         ereport(FATAL,
    1058              :                 (errcode_for_file_access(),
    1059              :                  errmsg("using recovery command file \"%s\" is not supported",
    1060              :                         RECOVERY_COMMAND_FILE)));
    1061              : 
    1062              :     /*
    1063              :      * Remove unused .done file, if present. Ignore if absent.
                      :      * The result of unlink() is not checked at all.
    1064              :      */
    1065          955 :     unlink(RECOVERY_COMMAND_DONE);
    1066              : 
    1067              :     /*
    1068              :      * Check for recovery signal files and if found, fsync them since they
    1069              :      * represent server state information.  We don't sweat too much about the
    1070              :      * possibility of fsync failure, however.
                      :      *
                      :      * A file that exists but cannot be opened is still treated as found; only
                      :      * the fsync is skipped.
    1071              :      */
    1072          955 :     if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
    1073              :     {
    1074              :         int         fd;
    1075              : 
    1076          113 :         fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY,
    1077              :                                S_IRUSR | S_IWUSR);
    1078          113 :         if (fd >= 0)
    1079              :         {
    1080          113 :             (void) pg_fsync(fd);
    1081          113 :             close(fd);
    1082              :         }
    1083          113 :         standby_signal_file_found = true;
    1084              :     }
    1085              : 
    1086          955 :     if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
    1087              :     {
    1088              :         int         fd;
    1089              : 
    1090            6 :         fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY,
    1091              :                                S_IRUSR | S_IWUSR);
    1092            6 :         if (fd >= 0)
    1093              :         {
    1094            6 :             (void) pg_fsync(fd);
    1095            6 :             close(fd);
    1096              :         }
    1097            6 :         recovery_signal_file_found = true;
    1098              :     }
    1099              : 
    1100              :     /*
    1101              :      * If both signal files are present, standby signal file takes precedence.
    1102              :      * If neither is present then we won't enter archive recovery.
                      :      *
                      :      * Both request flags are reset first, so a flag is true afterwards only
                      :      * if one of the branches below sets it.
    1103              :      */
    1104          955 :     StandbyModeRequested = false;
    1105          955 :     ArchiveRecoveryRequested = false;
    1106          955 :     if (standby_signal_file_found)
    1107              :     {
    1108          113 :         StandbyModeRequested = true;
    1109          113 :         ArchiveRecoveryRequested = true;
    1110              :     }
    1111          842 :     else if (recovery_signal_file_found)
    1112              :     {
    1113            5 :         StandbyModeRequested = false;
    1114            5 :         ArchiveRecoveryRequested = true;
    1115              :     }
    1116              :     else
    1117          837 :         return;
    1118              : 
    1119              :     /*
    1120              :      * We don't support standby mode in standalone backends; that requires
    1121              :      * other processes such as the WAL receiver to be alive.
    1122              :      */
    1123          118 :     if (StandbyModeRequested && !IsUnderPostmaster)
    1124            0 :         ereport(FATAL,
    1125              :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
    1126              :                  errmsg("standby mode is not supported by single-user servers")));
    1127              : }
    1128              : /*
                      :  * validateRecoveryParameters: check and finalize the recovery settings.
                      :  *
                      :  * Does nothing unless ArchiveRecoveryRequested is set.  Otherwise:
                      :  *
                      :  * - With StandbyModeRequested, emit a WARNING if neither primary_conninfo
                      :  *   nor restore_command is set; without it, a missing or empty
                      :  *   restore_command is reported as FATAL.
                      :  * - Replace a "pause" recovery target action with "shutdown" when
                      :  *   EnableHotStandby is off.
                      :  * - If the recovery target is a time, convert recovery_target_time_string
                      :  *   into recoveryTargetTime.
                      :  * - Resolve recoveryTargetTLI according to recoveryTargetTimeLineGoal; it
                      :  *   is left untouched for RECOVERY_TARGET_TIMELINE_CONTROLFILE.  A
                      :  *   numeric target timeline other than 1 that has no history file is
                      :  *   reported as FATAL.
                      :  */
    1129              : static void
    1130         1006 : validateRecoveryParameters(void)
    1131              : {
    1132         1006 :     if (!ArchiveRecoveryRequested)
    1133          888 :         return;
    1134              : 
    1135              :     /*
    1136              :      * Check for compulsory parameters
                      :      *
                      :      * In standby mode a missing source of WAL is only worth a WARNING (see
                      :      * the hint text below); otherwise restore_command is mandatory.
    1137              :      */
    1138          118 :     if (StandbyModeRequested)
    1139              :     {
    1140          113 :         if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
    1141           12 :             (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
    1142            2 :             ereport(WARNING,
    1143              :                     (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
    1144              :                      errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
    1145              :     }
    1146              :     else
    1147              :     {
    1148            5 :         if (recoveryRestoreCommand == NULL ||
    1149            5 :             strcmp(recoveryRestoreCommand, "") == 0)
    1150            0 :             ereport(FATAL,
    1151              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1152              :                      errmsg("must specify \"restore_command\" when standby mode is not enabled")));
    1153              :     }
    1154              : 
    1155              :     /*
    1156              :      * Override any inconsistent requests. Note that this is a change of
    1157              :      * behaviour in 9.5; prior to this we simply ignored a request to pause if
    1158              :      * hot_standby = off, which was surprising behaviour.
    1159              :      */
    1160          118 :     if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
    1161          111 :         !EnableHotStandby)
    1162            3 :         recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
    1163              : 
    1164              :     /*
    1165              :      * Final parsing of recovery_target_time string; see also
    1166              :      * check_recovery_target_time().
    1167              :      */
    1168          118 :     if (recoveryTarget == RECOVERY_TARGET_TIME)
    1169              :     {
    1170            0 :         recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
    1171              :                                                                      CStringGetDatum(recovery_target_time_string),
    1172              :                                                                      ObjectIdGetDatum(InvalidOid),
    1173              :                                                                      Int32GetDatum(-1)));
    1174              :     }
    1175              : 
    1176              :     /*
    1177              :      * If user specified recovery_target_timeline, validate it or compute the
    1178              :      * "latest" value.  We can't do this until after we've gotten the restore
    1179              :      * command and set InArchiveRecovery, because we need to fetch timeline
    1180              :      * history files from the archive.
    1181              :      */
    1182          118 :     if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
    1183              :     {
    1184            0 :         TimeLineID  rtli = recoveryTargetTLIRequested;
    1185              : 
    1186              :         /* Timeline 1 does not have a history file, all else should */
    1187            0 :         if (rtli != 1 && !existsTimeLineHistory(rtli))
    1188            0 :             ereport(FATAL,
    1189              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1190              :                      errmsg("recovery target timeline %u does not exist",
    1191              :                             rtli)));
    1192            0 :         recoveryTargetTLI = rtli;
    1193              :     }
    1194          118 :     else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
    1195              :     {
    1196              :         /* We start the "latest" search from pg_control's timeline */
    1197          118 :         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
    1198              :     }
    1199              :     else
    1200              :     {
    1201              :         /*
    1202              :          * else we just use the recoveryTargetTLI as already read from
    1203              :          * ControlFile
    1204              :          */
    1205              :         Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
    1206              :     }
    1207              : }
    1208              : 
    1209              : /*
    1210              :  * read_backup_label: check to see if a backup_label file is present
    1211              :  *
    1212              :  * If we see a backup_label during recovery, we assume that we are recovering
    1213              :  * from a backup dump file, and we therefore roll forward from the checkpoint
    1214              :  * identified by the label file, NOT what pg_control says.  This avoids the
    1215              :  * problem that pg_control might have been archived one or more checkpoints
    1216              :  * later than the start of the dump, and so if we rely on it as the start
    1217              :  * point, we will fail to restore a consistent database state.
    1218              :  *
    1219              :  * Returns true if a backup_label was found (and fills the checkpoint
    1220              :  * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
    1221              :  * returns false if not. If this backup_label came from a streamed backup,
    1222              :  * *backupEndRequired is set to true. If this backup_label was created during
    1223              :  * recovery, *backupFromStandby is set to true.
    1224              :  *
    1225              :  * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
    1226              :  * and TLI read from the backup file.
                      :  *
                      :  * All four output arguments are reset before the file is looked for, so
                      :  * they hold InvalidXLogRecPtr, 0, false and false when false is returned.
                      :  *
                      :  * The following are reported as FATAL: failure to open the file for any
                      :  * reason other than ENOENT, an unparsable START WAL LOCATION or CHECKPOINT
                      :  * LOCATION line, a START TIMELINE value that differs from the timeline in
                      :  * the WAL file name, the presence of an INCREMENTAL FROM LSN line, and a
                      :  * stream error or FreeFile() failure at the end.
    1227              :  */
    1228              : static bool
    1229         1006 : read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
    1230              :                   bool *backupEndRequired, bool *backupFromStandby)
    1231              : {
    1232              :     char        startxlogfilename[MAXFNAMELEN];
    1233              :     TimeLineID  tli_from_walseg,
    1234              :                 tli_from_file;
    1235              :     FILE       *lfp;
    1236              :     char        ch;
    1237              :     char        backuptype[20];
    1238              :     char        backupfrom[20];
    1239              :     char        backuplabel[MAXPGPATH];
    1240              :     char        backuptime[128];
    1241              :     uint32      hi,
    1242              :                 lo;
    1243              : 
    1244              :     /* suppress possible uninitialized-variable warnings */
    1245         1006 :     *checkPointLoc = InvalidXLogRecPtr;
    1246         1006 :     *backupLabelTLI = 0;
    1247         1006 :     *backupEndRequired = false;
    1248         1006 :     *backupFromStandby = false;
    1249              : 
    1250              :     /*
    1251              :      * See if label file is present
    1252              :      */
    1253         1006 :     lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
    1254         1006 :     if (!lfp)
    1255              :     {
    1256          925 :         if (errno != ENOENT)
    1257            0 :             ereport(FATAL,
    1258              :                     (errcode_for_file_access(),
    1259              :                      errmsg("could not read file \"%s\": %m",
    1260              :                             BACKUP_LABEL_FILE)));
    1261          925 :         return false;           /* it's not there, all is fine */
    1262              :     }
    1263              : 
    1264              :     /*
    1265              :      * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
    1266              :      * is pretty crude, but we are not expecting any variability in the file
    1267              :      * format).
                      :      *
                      :      * Both patterns end with %c so that the character following the parsed
                      :      * fields can be verified to be a newline.  The timeline is taken from
                      :      * the leading hex digits (%08X) of the WAL file name and is what ends up
                      :      * in both RedoStartTLI and *backupLabelTLI; the rest of the name is read
                      :      * into startxlogfilename but is not used further in this function.
    1268              :      */
    1269           81 :     if (fscanf(lfp, "START WAL LOCATION: %X/%08X (file %08X%16s)%c",
    1270           81 :                &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
    1271            0 :         ereport(FATAL,
    1272              :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    1273              :                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
    1274           81 :     RedoStartLSN = ((uint64) hi) << 32 | lo;
    1275           81 :     RedoStartTLI = tli_from_walseg;
    1276           81 :     if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%08X%c",
    1277           81 :                &hi, &lo, &ch) != 3 || ch != '\n')
    1278            0 :         ereport(FATAL,
    1279              :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    1280              :                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
    1281           81 :     *checkPointLoc = ((uint64) hi) << 32 | lo;
    1282           81 :     *backupLabelTLI = tli_from_walseg;
    1283              : 
    1284              :     /*
    1285              :      * BACKUP METHOD lets us know if this was a typical backup ("streamed",
    1286              :      * which could mean either pg_basebackup or the pg_backup_start/stop
    1287              :      * method was used) or if this label came from somewhere else (the only
    1288              :      * other option today being from pg_rewind).  If this was a streamed
    1289              :      * backup then we know that we need to play through until we get to the
    1290              :      * end of the WAL which was generated during the backup (at which point we
    1291              :      * will have reached consistency and backupEndRequired will be reset to be
    1292              :      * false).
                      :      *
                      :      * A missing or unparsable BACKUP METHOD line is not an error; it simply
                      :      * leaves *backupEndRequired false.
    1293              :      */
    1294           81 :     if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
    1295              :     {
    1296           81 :         if (strcmp(backuptype, "streamed") == 0)
    1297           80 :             *backupEndRequired = true;
    1298              :     }
    1299              : 
    1300              :     /*
    1301              :      * BACKUP FROM lets us know if this was from a primary or a standby.  If
    1302              :      * it was from a standby, we'll double-check that the control file state
    1303              :      * matches that of a standby.
                      :      *
                      :      * That double-check is not done in this function; here we only record
                      :      * the answer in *backupFromStandby.
    1304              :      */
    1305           81 :     if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
    1306              :     {
    1307           81 :         if (strcmp(backupfrom, "standby") == 0)
    1308            5 :             *backupFromStandby = true;
    1309              :     }
    1310              : 
    1311              :     /*
    1312              :      * Parse START TIME and LABEL. Those are not mandatory fields for recovery
    1313              :      * but checking for their presence is useful for debugging and the next
    1314              :      * sanity checks. Cope also with the fact that the result buffers have a
    1315              :      * pre-allocated size, hence if the backup_label file has been generated
    1316              :      * with strings longer than the maximum assumed here an incorrect parsing
    1317              :      * happens. That's fine as only minor consistency checks are done
    1318              :      * afterwards.
                      :      *
                      :      * The parsed values are only logged at DEBUG1 and not otherwise used.
    1319              :      */
    1320           81 :     if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
    1321           81 :         ereport(DEBUG1,
    1322              :                 (errmsg_internal("backup time %s in file \"%s\"",
    1323              :                                  backuptime, BACKUP_LABEL_FILE)));
    1324              : 
    1325           81 :     if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
    1326           80 :         ereport(DEBUG1,
    1327              :                 (errmsg_internal("backup label %s in file \"%s\"",
    1328              :                                  backuplabel, BACKUP_LABEL_FILE)));
    1329              : 
    1330              :     /*
    1331              :      * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
    1332              :      * it as a sanity check if present.
    1333              :      */
    1334           81 :     if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
    1335              :     {
    1336           80 :         if (tli_from_walseg != tli_from_file)
    1337            0 :             ereport(FATAL,
    1338              :                     (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    1339              :                      errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
    1340              :                      errdetail("Timeline ID parsed is %u, but expected %u.",
    1341              :                                tli_from_file, tli_from_walseg)));
    1342              : 
    1343           80 :         ereport(DEBUG1,
    1344              :                 (errmsg_internal("backup timeline %u in file \"%s\"",
    1345              :                                  tli_from_file, BACKUP_LABEL_FILE)));
    1346              :     }
    1347              :     /*
                      :      * An INCREMENTAL FROM LSN line means this directory is an incremental
                      :      * backup, which cannot be recovered from directly.  Any successful
                      :      * conversion (even of just the first field) is enough to reject it.
                      :      */
    1348           81 :     if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%08X\n", &hi, &lo) > 0)
    1349            0 :         ereport(FATAL,
    1350              :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    1351              :                  errmsg("this is an incremental backup, not a data directory"),
    1352              :                  errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
    1353              : 
    1354           81 :     if (ferror(lfp) || FreeFile(lfp))
    1355            0 :         ereport(FATAL,
    1356              :                 (errcode_for_file_access(),
    1357              :                  errmsg("could not read file \"%s\": %m",
    1358              :                         BACKUP_LABEL_FILE)));
    1359              : 
    1360           81 :     return true;
    1361              : }
    1362              : 
    1363              : /*
    1364              :  * read_tablespace_map: check to see if a tablespace_map file is present
    1365              :  *
    1366              :  * If we see a tablespace_map file during recovery, we assume that we are
    1367              :  * recovering from a backup dump file, and we therefore need to create symlinks
    1368              :  * as per the information present in tablespace_map file.
    1369              :  *
    1370              :  * Returns true if a tablespace_map file was found (and fills *tablespaces
    1371              :  * with a tablespaceinfo struct for each tablespace listed in the file);
    1372              :  * returns false if not.
                      :  *
                      :  * Entries are appended to *tablespaces with lappend(); the list is not
                      :  * initialized here, so the caller is expected to pass in a valid (possibly
                      :  * empty) list.  This function only parses the file; it does not create
                      :  * any symlinks itself.
                      :  *
                      :  * The following are reported as FATAL: failure to open the file for any
                      :  * reason other than ENOENT, a line that does not have the "OID, one
                      :  * space, path" shape, an OID that strtoul() does not fully consume or
                      :  * rejects, a final line without a terminator, and a stream error or
                      :  * FreeFile() failure at the end.
    1373              :  */
    1374              : static bool
    1375           81 : read_tablespace_map(List **tablespaces)
    1376              : {
    1377              :     tablespaceinfo *ti;
    1378              :     FILE       *lfp;
    1379              :     char        str[MAXPGPATH];
    1380              :     int         ch,
    1381              :                 i,
    1382              :                 n;
    1383              :     bool        was_backslash;
    1384              : 
    1385              :     /*
    1386              :      * See if tablespace_map file is present
    1387              :      */
    1388           81 :     lfp = AllocateFile(TABLESPACE_MAP, "r");
    1389           81 :     if (!lfp)
    1390              :     {
    1391           79 :         if (errno != ENOENT)
    1392            0 :             ereport(FATAL,
    1393              :                     (errcode_for_file_access(),
    1394              :                      errmsg("could not read file \"%s\": %m",
    1395              :                             TABLESPACE_MAP)));
    1396           79 :         return false;           /* it's not there, all is fine */
    1397              :     }
    1398              : 
    1399              :     /*
    1400              :      * Read and parse the link name and path lines from tablespace_map file
    1401              :      * (this code is pretty crude, but we are not expecting any variability in
    1402              :      * the file format).  De-escape any backslashes that were inserted.
                      :      *
                      :      * The file is consumed one character at a time; 'i' is the number of
                      :      * de-escaped characters collected in str[] for the current line.  An
                      :      * unescaped \n or \r ends the line.  A backslash is dropped and causes
                      :      * the following character to be stored literally, whatever it is.
                      :      * Characters that no longer fit in str[] are silently discarded.
    1403              :      */
    1404            2 :     i = 0;
    1405            2 :     was_backslash = false;
    1406           77 :     while ((ch = fgetc(lfp)) != EOF)
    1407              :     {
    1408           75 :         if (!was_backslash && (ch == '\n' || ch == '\r'))
    1409            2 :         {
    1410              :             char       *endp;
    1411              : 
    1412            2 :             if (i == 0)
    1413            0 :                 continue;       /* \r immediately followed by \n */
    1414              : 
    1415              :             /*
    1416              :              * The de-escaped line should contain an OID followed by exactly
    1417              :              * one space followed by a path.  The path might start with
    1418              :              * spaces, so don't be too liberal about parsing.
                      :              *
                      :              * 'n' is advanced to the first space (or to the end of the line
                      :              * if there is none).  The check below requires at least one
                      :              * character before that space and at least one after it.  The
                      :              * space is then overwritten with a terminator so that str holds
                      :              * the OID text and str + n the path.
    1419              :              */
    1420            2 :             str[i] = '\0';
    1421            2 :             n = 0;
    1422           12 :             while (str[n] && str[n] != ' ')
    1423           10 :                 n++;
    1424            2 :             if (n < 1 || n >= i - 1)
    1425            0 :                 ereport(FATAL,
    1426              :                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    1427              :                          errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
    1428            2 :             str[n++] = '\0';
    1429              : 
    1430            2 :             ti = palloc0_object(tablespaceinfo);
    1431            2 :             errno = 0;
    1432            2 :             ti->oid = strtoul(str, &endp, 10);
    1433            2 :             if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
    1434            0 :                 ereport(FATAL,
    1435              :                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    1436              :                          errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
    1437            2 :             ti->path = pstrdup(str + n);
    1438            2 :             *tablespaces = lappend(*tablespaces, ti);
    1439              : 
    1440            2 :             i = 0;
    1441            2 :             continue;
    1442              :         }
    1443           73 :         else if (!was_backslash && ch == '\\')
    1444            0 :             was_backslash = true;
    1445              :         else
    1446              :         {
    1447           73 :             if (i < sizeof(str) - 1)
    1448           73 :                 str[i++] = ch;
    1449           73 :             was_backslash = false;
    1450              :         }
    1451              :     }
    1452              : 
    1453            2 :     if (i != 0 || was_backslash)    /* last line not terminated? */
    1454            0 :         ereport(FATAL,
    1455              :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    1456              :                  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
    1457              : 
    1458            2 :     if (ferror(lfp) || FreeFile(lfp))
    1459            0 :         ereport(FATAL,
    1460              :                 (errcode_for_file_access(),
    1461              :                  errmsg("could not read file \"%s\": %m",
    1462              :                         TABLESPACE_MAP)));
    1463              : 
    1464            2 :     return true;
    1465              : }
    1466              : 
    1467              : /*
    1468              :  * Finish WAL recovery.
    1469              :  *
    1470              :  * This does not close the 'xlogreader' yet, because in some cases the caller
    1471              :  * still wants to re-read the last checkpoint record by calling
    1472              :  * ReadCheckpointRecord().
    1473              :  *
    1474              :  * Returns the position of the last valid or applied record, after which new
    1475              :  * WAL should be appended, information about why recovery was ended, and some
    1476              :  * other things. See the EndOfWalRecoveryInfo struct for details.
    1477              :  */
    1478              : EndOfWalRecoveryInfo *
    1479          941 : FinishWalRecovery(void)
    1480              : {
    1481          941 :     EndOfWalRecoveryInfo *result = palloc_object(EndOfWalRecoveryInfo);
    1482              :     XLogRecPtr  lastRec;
    1483              :     TimeLineID  lastRecTLI;
    1484              :     XLogRecPtr  endOfLog;
    1485              : 
    1486              :     /*
    1487              :      * Kill WAL receiver, if it's still running, before we continue to write
    1488              :      * the startup checkpoint and aborted-contrecord records. It will trump
    1489              :      * over these records and subsequent ones if it's still alive when we
    1490              :      * start writing WAL.
    1491              :      */
    1492          941 :     XLogShutdownWalRcv();
    1493              : 
    1494              :     /*
    1495              :      * Shut down the slot sync worker to drop any temporary slots acquired by
    1496              :      * it and to prevent it from continuing to fetch the failover slots.
    1497              :      *
    1498              :      * We do not update the 'synced' column in 'pg_replication_slots' system
    1499              :      * view from true to false here, as any failed update could leave 'synced'
    1500              :      * column false for some slots. This could cause issues during slot sync
    1501              :      * after restarting the server as a standby. While updating the 'synced'
    1502              :      * column after switching to the new timeline is an option, it does not
    1503              :      * simplify the handling for the 'synced' column. Therefore, we retain the
    1504              :      * 'synced' column as true after promotion as it may provide useful
    1505              :      * information about the slot origin.
    1506              :      */
    1507          941 :     ShutDownSlotSync();
    1508              : 
    1509              :     /*
    1510              :      * We are now done reading the xlog from stream. Turn off streaming
    1511              :      * recovery to force fetching the files (which would be required at end of
    1512              :      * recovery, e.g., timeline history file) from archive or pg_wal.
    1513              :      *
    1514              :      * Note that standby mode must be turned off after killing WAL receiver,
    1515              :      * i.e., calling XLogShutdownWalRcv().
    1516              :      */
    1517              :     Assert(!WalRcvStreaming());
    1518          941 :     StandbyMode = false;
    1519              : 
    1520              :     /*
    1521              :      * Determine where to start writing WAL next.
    1522              :      *
    1523              :      * Re-fetch the last valid or last applied record, so we can identify the
    1524              :      * exact endpoint of what we consider the valid portion of WAL.  There may
    1525              :      * be an incomplete continuation record after that, in which case
    1526              :      * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
    1527              :      * write a special OVERWRITE_CONTRECORD message to mark that the rest of
    1528              :      * it is intentionally missing.  See CreateOverwriteContrecordRecord().
    1529              :      *
    1530              :      * An important side-effect of this is to load the last page into
    1531              :      * xlogreader. The caller uses it to initialize the WAL for writing.
    1532              :      */
    1533          941 :     if (!InRecovery)
    1534              :     {
    1535          778 :         lastRec = CheckPointLoc;
    1536          778 :         lastRecTLI = CheckPointTLI;
    1537              :     }
    1538              :     else
    1539              :     {
    1540          163 :         lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr;
    1541          163 :         lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
    1542              :     }
    1543          941 :     XLogPrefetcherBeginRead(xlogprefetcher, lastRec);
    1544          941 :     (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
    1545          941 :     endOfLog = xlogreader->EndRecPtr;
    1546              : 
    1547              :     /*
    1548              :      * Remember the TLI in the filename of the XLOG segment containing the
    1549              :      * end-of-log.  It could be different from the timeline that endOfLog
    1550              :      * nominally belongs to, if there was a timeline switch in that segment,
    1551              :      * and we were reading the old WAL from a segment belonging to a higher
    1552              :      * timeline.
    1553              :      */
    1554          941 :     result->endOfLogTLI = xlogreader->seg.ws_tli;
    1555              : 
    1556          941 :     if (ArchiveRecoveryRequested)
    1557              :     {
    1558              :         /*
    1559              :          * We are no longer in archive recovery state.
    1560              :          *
    1561              :          * We are now done reading the old WAL.  Turn off archive fetching if
    1562              :          * it was active.
    1563              :          */
    1564              :         Assert(InArchiveRecovery);
    1565           55 :         InArchiveRecovery = false;
    1566              : 
    1567              :         /*
    1568              :          * If the ending log segment is still open, close it (to avoid
    1569              :          * problems on Windows with trying to rename or delete an open file).
    1570              :          */
    1571           55 :         if (readFile >= 0)
    1572              :         {
    1573           55 :             close(readFile);
    1574           55 :             readFile = -1;
    1575              :         }
    1576              :     }
    1577              : 
    1578              :     /*
    1579              :      * Copy the last partial block to the caller, for initializing the WAL
    1580              :      * buffer for appending new WAL.
    1581              :      */
    1582          941 :     if (endOfLog % XLOG_BLCKSZ != 0)
    1583              :     {
    1584              :         char       *page;
    1585              :         int         len;
    1586              :         XLogRecPtr  pageBeginPtr;
    1587              : 
    1588          920 :         pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
    1589              :         Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
    1590              : 
    1591              :         /* Copy the valid part of the last block */
    1592          920 :         len = endOfLog % XLOG_BLCKSZ;
    1593          920 :         page = palloc(len);
    1594          920 :         memcpy(page, xlogreader->readBuf, len);
    1595              : 
    1596          920 :         result->lastPageBeginPtr = pageBeginPtr;
    1597          920 :         result->lastPage = page;
    1598              :     }
    1599              :     else
    1600              :     {
    1601              :         /* There is no partial block to copy. */
    1602           21 :         result->lastPageBeginPtr = endOfLog;
    1603           21 :         result->lastPage = NULL;
    1604              :     }
    1605              : 
    1606              :     /*
    1607              :      * Create a comment for the history file to explain why and where timeline
    1608              :      * changed.
    1609              :      */
    1610          941 :     result->recoveryStopReason = getRecoveryStopReason();
    1611              : 
    1612          941 :     result->lastRec = lastRec;
    1613          941 :     result->lastRecTLI = lastRecTLI;
    1614          941 :     result->endOfLog = endOfLog;
    1615              : 
    1616          941 :     result->abortedRecPtr = abortedRecPtr;
    1617          941 :     result->missingContrecPtr = missingContrecPtr;
    1618              : 
    1619          941 :     result->standby_signal_file_found = standby_signal_file_found;
    1620          941 :     result->recovery_signal_file_found = recovery_signal_file_found;
    1621              : 
    1622          941 :     return result;
    1623              : }
    1624              : 
    1625              : /*
    1626              :  * Clean up the WAL reader and leftovers from restoring WAL from archive
    1627              :  */
    1628              : void
    1629          941 : ShutdownWalRecovery(void)
    1630              : {
    1631              :     char        recoveryPath[MAXPGPATH];
    1632              : 
    1633              :     /* Final update of pg_stat_recovery_prefetch. */
    1634          941 :     XLogPrefetcherComputeStats(xlogprefetcher);
    1635              : 
    1636              :     /* Shut down xlogreader */
    1637          941 :     if (readFile >= 0)
    1638              :     {
    1639          886 :         close(readFile);
    1640          886 :         readFile = -1;
    1641              :     }
    1642          941 :     pfree(xlogreader->private_data);
    1643          941 :     XLogReaderFree(xlogreader);
    1644          941 :     XLogPrefetcherFree(xlogprefetcher);
    1645              : 
    1646          941 :     if (ArchiveRecoveryRequested)
    1647              :     {
    1648              :         /*
    1649              :          * Since there might be a partial WAL segment named RECOVERYXLOG, get
    1650              :          * rid of it.
    1651              :          */
    1652           55 :         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
    1653           55 :         unlink(recoveryPath);   /* ignore any error */
    1654              : 
    1655              :         /* Get rid of any remaining recovered timeline-history file, too */
    1656           55 :         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
    1657           55 :         unlink(recoveryPath);   /* ignore any error */
    1658              :     }
    1659              : 
    1660              :     /*
    1661              :      * We don't need the latch anymore. It's not strictly necessary to disown
    1662              :      * it, but let's do it for the sake of tidiness.
    1663              :      */
    1664          941 :     if (ArchiveRecoveryRequested)
    1665           55 :         DisownLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
    1666          941 : }
    1667              : 
/*
 * Perform WAL recovery.
 *
 * If the system was shut down cleanly, this is never called.
 *
 * Outline:
 *
 *  1. Seed the shared replay-progress fields in XLogRecoveryCtl (under
 *     info_lck) and set XLogReceiptTime.
 *  2. If running under the postmaster, signal PMSIGNAL_RECOVERY_STARTED.
 *  3. Read the first record to replay: the one at RedoStartLSN when that
 *     precedes the checkpoint record, else the record following the
 *     checkpoint.
 *  4. Apply records one at a time with ApplyWalRecord() until ReadRecord()
 *     returns NULL, or until recoveryStopsBefore()/recoveryStopsAfter()
 *     reports that the recovery target has been reached.
 *  5. If the target was reached, act on recoveryTargetAction (shutdown,
 *     pause, or promote).
 *
 * Raises FATAL if the record at the redo point is not an
 * XLOG_CHECKPOINT_REDO record, if the recovery target is reached before
 * reachedConsistency is set, or if archive recovery was requested with a
 * recovery target that was never reached.  With
 * RECOVERY_TARGET_ACTION_SHUTDOWN the process exits via proc_exit(3).
 */
void
PerformWalRecovery(void)
{
    XLogRecord *record;
    bool        reachedRecoveryTarget = false;
    TimeLineID  replayTLI;

    /*
     * Initialize shared variables for tracking progress of WAL replay, as if
     * we had just replayed the record before the REDO location (or the
     * checkpoint record itself, if it's a shutdown checkpoint).
     */
    SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    if (RedoStartLSN < CheckPointLoc)
    {
        /* Redo starts before the checkpoint record: nothing replayed yet. */
        XLogRecoveryCtl->lastReplayedReadRecPtr = InvalidXLogRecPtr;
        XLogRecoveryCtl->lastReplayedEndRecPtr = RedoStartLSN;
        XLogRecoveryCtl->lastReplayedTLI = RedoStartTLI;
    }
    else
    {
        /* Use the reader's current record position as the last one replayed. */
        XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
        XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
        XLogRecoveryCtl->lastReplayedTLI = CheckPointTLI;
    }
    XLogRecoveryCtl->replayEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
    XLogRecoveryCtl->replayEndTLI = XLogRecoveryCtl->lastReplayedTLI;
    XLogRecoveryCtl->recoveryLastXTime = 0;
    XLogRecoveryCtl->currentChunkStartTime = 0;
    XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
    SpinLockRelease(&XLogRecoveryCtl->info_lck);

    /* Also ensure XLogReceiptTime has a sane value */
    XLogReceiptTime = GetCurrentTimestamp();

    /*
     * Let postmaster know we've started redo now, so that it can launch the
     * archiver if necessary.
     */
    if (IsUnderPostmaster)
        SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);

    /*
     * Allow read-only connections immediately if we're consistent already.
     */
    CheckRecoveryConsistency();

    /*
     * Find the first record that logically follows the checkpoint --- it
     * might physically precede it, though.
     */
    if (RedoStartLSN < CheckPointLoc)
    {
        /* back up to find the record */
        replayTLI = RedoStartTLI;
        XLogPrefetcherBeginRead(xlogprefetcher, RedoStartLSN);

        /*
         * NOTE(review): record is dereferenced below without a NULL check;
         * presumably ReadRecord() cannot return NULL when called with PANIC
         * --- confirm against ReadRecord().
         */
        record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);

        /*
         * If a checkpoint record's redo pointer points back to an earlier
         * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
         * record.
         */
        if (record->xl_rmid != RM_XLOG_ID ||
            (record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO)
            ereport(FATAL,
                    errmsg("unexpected record type found at redo point %X/%08X",
                           LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)));
    }
    else
    {
        /* just have to read next record after CheckPoint */
        Assert(xlogreader->ReadRecPtr == CheckPointLoc);
        replayTLI = CheckPointTLI;

        /* Read with LOG, so a NULL result is handled below, not fatal. */
        record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
    }

    if (record != NULL)
    {
        TimestampTz xtime;
        PGRUsage    ru0;

        /* Start the resource-usage measurement shown in "redo done". */
        pg_rusage_init(&ru0);

        /* InRedo stays set until the "redo done" messages have been logged. */
        InRedo = true;

        RmgrStartup();

        ereport(LOG,
                errmsg("redo starts at %X/%08X",
                       LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)));

        /* Prepare to report progress of the redo phase. */
        if (!StandbyMode)
            begin_startup_progress_phase();

        /*
         * main redo apply loop
         *
         * Each iteration handles the record currently held in 'record'.  The
         * loop is left when ReadRecord() returns NULL, or via 'break' once a
         * recovery target has been reached.
         */
        do
        {
            if (!StandbyMode)
                ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%08X",
                                         LSN_FORMAT_ARGS(xlogreader->ReadRecPtr));

#ifdef WAL_DEBUG
            if (XLOG_DEBUG)
            {
                StringInfoData buf;

                initStringInfo(&buf);
                appendStringInfo(&buf, "REDO @ %X/%08X; LSN %X/%08X: ",
                                 LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
                                 LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
                xlog_outrec(&buf, xlogreader);
                appendStringInfoString(&buf, " - ");
                xlog_outdesc(&buf, xlogreader);
                elog(LOG, "%s", buf.data);
                pfree(buf.data);
            }
#endif

            /* Handle interrupt signals of startup process */
            ProcessStartupProcInterrupts();

            /*
             * Pause WAL replay, if requested by a hot-standby session via
             * SetRecoveryPause().
             *
             * Note that we intentionally don't take the info_lck spinlock
             * here.  We might therefore read a slightly stale value of the
             * recoveryPause flag, but it can't be very stale (no worse than
             * the last spinlock we did acquire).  Since a pause request is a
             * pretty asynchronous thing anyway, possibly responding to it one
             * WAL record later than we otherwise would is a minor issue, so
             * it doesn't seem worth adding another spinlock cycle to prevent
             * that.
             */
            if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
                RECOVERY_NOT_PAUSED)
                recoveryPausesHere(false);

            /*
             * Have we reached our recovery target?
             *
             * This test runs before the record is applied, so on a hit the
             * current record is not replayed.
             */
            if (recoveryStopsBefore(xlogreader))
            {
                reachedRecoveryTarget = true;
                break;
            }

            /*
             * If we've been asked to lag the primary, wait on latch until
             * enough time has passed.
             */
            if (recoveryApplyDelay(xlogreader))
            {
                /*
                 * We test for paused recovery again here. If user sets
                 * delayed apply, it may be because they expect to pause
                 * recovery in case of problems, so we must test again here
                 * otherwise pausing during the delay-wait wouldn't work.
                 */
                if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
                    RECOVERY_NOT_PAUSED)
                    recoveryPausesHere(false);
            }

            /*
             * Apply the record
             *
             * replayTLI is passed by pointer; ApplyWalRecord() updates it
             * when the record switches timelines.
             */
            ApplyWalRecord(xlogreader, record, &replayTLI);

            /*
             * If we replayed an LSN that someone was waiting for then walk
             * over the shared memory array and set latches to notify the
             * waiters.
             */
            if (waitLSNState &&
                (XLogRecoveryCtl->lastReplayedEndRecPtr >=
                 pg_atomic_read_u64(&waitLSNState->minWaitedLSN[WAIT_LSN_TYPE_STANDBY_REPLAY])))
                WaitLSNWakeup(WAIT_LSN_TYPE_STANDBY_REPLAY, XLogRecoveryCtl->lastReplayedEndRecPtr);

            /* Exit loop if we reached inclusive recovery target */
            if (recoveryStopsAfter(xlogreader))
            {
                reachedRecoveryTarget = true;
                break;
            }

            /* Else, try to fetch the next WAL record */
            record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
        } while (record != NULL);

        /*
         * end of main redo apply loop
         */

        if (reachedRecoveryTarget)
        {
            if (!reachedConsistency)
                ereport(FATAL,
                        (errmsg("requested recovery stop point is before consistent recovery point")));

            /*
             * This is the last point where we can restart recovery with a new
             * recovery target, if we shutdown and begin again. After this,
             * Resource Managers may choose to do permanent corrective actions
             * at end of recovery.
             *
             * The switch has no default case; only the three actions below
             * are handled.
             */
            switch (recoveryTargetAction)
            {
                case RECOVERY_TARGET_ACTION_SHUTDOWN:

                    /*
                     * exit with special return code to request shutdown of
                     * postmaster.  Log messages issued from postmaster.
                     */
                    proc_exit(3);

                case RECOVERY_TARGET_ACTION_PAUSE:
                    SetRecoveryPause(true);
                    recoveryPausesHere(true);

                    /* drop into promote */
                    pg_fallthrough;

                case RECOVERY_TARGET_ACTION_PROMOTE:
                    break;
            }
        }

        RmgrCleanup();

        ereport(LOG,
                errmsg("redo done at %X/%08X system usage: %s",
                       LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
                       pg_rusage_show(&ru0)));

        /* The completion time is logged only when GetLatestXTime() is nonzero. */
        xtime = GetLatestXTime();
        if (xtime)
            ereport(LOG,
                    (errmsg("last completed transaction was at log time %s",
                            timestamptz_to_str(xtime))));

        InRedo = false;
    }
    else
    {
        /* there are no WAL records following the checkpoint */
        ereport(LOG,
                (errmsg("redo is not required")));
    }

    /*
     * This check is intentionally after the above log messages that indicate
     * how far recovery went.
     *
     * It fires when archive recovery was requested with a recovery target
     * configured, but replay ended without that target being reached.
     */
    if (ArchiveRecoveryRequested &&
        recoveryTarget != RECOVERY_TARGET_UNSET &&
        !reachedRecoveryTarget)
        ereport(FATAL,
                (errcode(ERRCODE_CONFIG_FILE_ERROR),
                 errmsg("recovery ended before configured recovery target was reached")));
}
    1937              : 
    1938              : /*
    1939              :  * Subroutine of PerformWalRecovery, to apply one WAL record.
    1940              :  */
    1941              : static void
    1942      2797191 : ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
    1943              : {
    1944              :     ErrorContextCallback errcallback;
    1945      2797191 :     bool        switchedTLI = false;
    1946              : 
    1947              :     /* Setup error traceback support for ereport() */
    1948      2797191 :     errcallback.callback = rm_redo_error_callback;
    1949      2797191 :     errcallback.arg = xlogreader;
    1950      2797191 :     errcallback.previous = error_context_stack;
    1951      2797191 :     error_context_stack = &errcallback;
    1952              : 
    1953              :     /*
    1954              :      * TransamVariables->nextXid must be beyond record's xid.
    1955              :      */
    1956      2797191 :     AdvanceNextFullTransactionIdPastXid(record->xl_xid);
    1957              : 
    1958              :     /*
    1959              :      * Before replaying this record, check if this record causes the current
    1960              :      * timeline to change. The record is already considered to be part of the
    1961              :      * new timeline, so we update replayTLI before replaying it. That's
    1962              :      * important so that replayEndTLI, which is recorded as the minimum
    1963              :      * recovery point's TLI if recovery stops after this record, is set
    1964              :      * correctly.
    1965              :      */
    1966      2797191 :     if (record->xl_rmid == RM_XLOG_ID)
    1967              :     {
    1968        44606 :         TimeLineID  newReplayTLI = *replayTLI;
    1969        44606 :         TimeLineID  prevReplayTLI = *replayTLI;
    1970        44606 :         uint8       info = record->xl_info & ~XLR_INFO_MASK;
    1971              : 
    1972        44606 :         if (info == XLOG_CHECKPOINT_SHUTDOWN)
    1973              :         {
    1974              :             CheckPoint  checkPoint;
    1975              : 
    1976           39 :             memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
    1977           39 :             newReplayTLI = checkPoint.ThisTimeLineID;
    1978           39 :             prevReplayTLI = checkPoint.PrevTimeLineID;
    1979              :         }
    1980        44567 :         else if (info == XLOG_END_OF_RECOVERY)
    1981              :         {
    1982              :             xl_end_of_recovery xlrec;
    1983              : 
    1984           11 :             memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
    1985           11 :             newReplayTLI = xlrec.ThisTimeLineID;
    1986           11 :             prevReplayTLI = xlrec.PrevTimeLineID;
    1987              :         }
    1988              : 
    1989        44606 :         if (newReplayTLI != *replayTLI)
    1990              :         {
    1991              :             /* Check that it's OK to switch to this TLI */
    1992           12 :             checkTimeLineSwitch(xlogreader->EndRecPtr,
    1993              :                                 newReplayTLI, prevReplayTLI, *replayTLI);
    1994              : 
    1995              :             /* Following WAL records should be run with new TLI */
    1996           12 :             *replayTLI = newReplayTLI;
    1997           12 :             switchedTLI = true;
    1998              :         }
    1999              :     }
    2000              : 
    2001              :     /*
    2002              :      * Update shared replayEndRecPtr before replaying this record, so that
    2003              :      * XLogFlush will update minRecoveryPoint correctly.
    2004              :      */
    2005      2797191 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    2006      2797191 :     XLogRecoveryCtl->replayEndRecPtr = xlogreader->EndRecPtr;
    2007      2797191 :     XLogRecoveryCtl->replayEndTLI = *replayTLI;
    2008      2797191 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    2009              : 
    2010              :     /*
    2011              :      * If we are attempting to enter Hot Standby mode, process XIDs we see
    2012              :      */
    2013      2797191 :     if (standbyState >= STANDBY_INITIALIZED &&
    2014      2553283 :         TransactionIdIsValid(record->xl_xid))
    2015      2498833 :         RecordKnownAssignedTransactionIds(record->xl_xid);
    2016              : 
    2017              :     /*
    2018              :      * Some XLOG record types that are related to recovery are processed
    2019              :      * directly here, rather than in xlog_redo()
    2020              :      */
    2021      2797191 :     if (record->xl_rmid == RM_XLOG_ID)
    2022        44606 :         xlogrecovery_redo(xlogreader, *replayTLI);
    2023              : 
    2024              :     /* Now apply the WAL record itself */
    2025      2797191 :     GetRmgr(record->xl_rmid).rm_redo(xlogreader);
    2026              : 
    2027              :     /*
    2028              :      * After redo, check whether the backup pages associated with the WAL
    2029              :      * record are consistent with the existing pages. This check is done only
    2030              :      * if consistency check is enabled for this record.
    2031              :      */
    2032      2797189 :     if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
    2033      2211560 :         verifyBackupPageConsistency(xlogreader);
    2034              : 
    2035              :     /* Pop the error context stack */
    2036      2797189 :     error_context_stack = errcallback.previous;
    2037              : 
    2038              :     /*
    2039              :      * Update lastReplayedEndRecPtr after this record has been successfully
    2040              :      * replayed.
    2041              :      */
    2042      2797189 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    2043      2797189 :     XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
    2044      2797189 :     XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
    2045      2797189 :     XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
    2046      2797189 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    2047              : 
    2048              :     /* ------
    2049              :      * Wakeup walsenders:
    2050              :      *
    2051              :      * On the standby, the WAL is flushed first (which will only wake up
    2052              :      * physical walsenders) and then applied, which will only wake up logical
    2053              :      * walsenders.
    2054              :      *
    2055              :      * Indeed, logical walsenders on standby can't decode and send data until
    2056              :      * it's been applied.
    2057              :      *
    2058              :      * Physical walsenders don't need to be woken up during replay unless
    2059              :      * cascading replication is allowed and time line change occurred (so that
    2060              :      * they can notice that they are on a new time line).
    2061              :      *
    2062              :      * That's why the wake up conditions are for:
    2063              :      *
    2064              :      *  - physical walsenders in case of new time line and cascade
    2065              :      *    replication is allowed
    2066              :      *  - logical walsenders in case cascade replication is allowed (could not
    2067              :      *    be created otherwise)
    2068              :      * ------
    2069              :      */
    2070      2797189 :     if (AllowCascadeReplication())
    2071      2607858 :         WalSndWakeup(switchedTLI, true);
    2072              : 
    2073              :     /*
    2074              :      * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
    2075              :      * receiver so that it notices the updated lastReplayedEndRecPtr and sends
    2076              :      * a reply to the primary.
    2077              :      */
    2078      2797189 :     if (doRequestWalReceiverReply)
    2079              :     {
    2080            2 :         doRequestWalReceiverReply = false;
    2081            2 :         WalRcvForceReply();
    2082              :     }
    2083              : 
    2084              :     /* Allow read-only connections if we're consistent now */
    2085      2797189 :     CheckRecoveryConsistency();
    2086              : 
    2087              :     /* Is this a timeline switch? */
    2088      2797189 :     if (switchedTLI)
    2089              :     {
    2090              :         /*
    2091              :          * Before we continue on the new timeline, clean up any (possibly
    2092              :          * bogus) future WAL segments on the old timeline.
    2093              :          */
    2094           12 :         RemoveNonParentXlogFiles(xlogreader->EndRecPtr, *replayTLI);
    2095              : 
    2096              :         /* Reset the prefetcher. */
    2097           12 :         XLogPrefetchReconfigure();
    2098              :     }
    2099      2797189 : }
    2100              : 
    2101              : /*
    2102              :  * Some XLOG RM record types that are directly related to WAL recovery are
    2103              :  * handled here rather than in xlog_redo().
                      :  *
                      :  * 'record' must belong to RM_XLOG_ID (asserted).  Two record types are acted
                      :  * on; every other info value falls through without any effect:
                      :  *
                      :  *  - XLOG_OVERWRITE_CONTRECORD: the overwritten LSN carried in the record
                      :  *    payload must equal record->overwrittenRecPtr, else we elog(FATAL).
                      :  *    On success abortedRecPtr, missingContrecPtr and
                      :  *    record->overwrittenRecPtr are all reset to InvalidXLogRecPtr.
                      :  *
                      :  *  - XLOG_BACKUP_END: if the start point carried in the payload equals
                      :  *    backupStartPoint, backupEndPoint is set to record->EndRecPtr;
                      :  *    otherwise only a DEBUG1 message is emitted.
                      :  *
                      :  * 'replayTLI' is accepted but not referenced in the function body.
    2104              :  */
    2105              : static void
    2106        44606 : xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
    2107              : {
    2108        44606 :     uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    2109        44606 :     XLogRecPtr  lsn = record->EndRecPtr;
    2110              : 
    2111              :     Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
    2112              : 
    2113        44606 :     if (info == XLOG_OVERWRITE_CONTRECORD)
    2114              :     {
    2115              :         /* Verify the payload of an XLOG_OVERWRITE_CONTRECORD record. */
    2116              :         xl_overwrite_contrecord xlrec;
    2117              : 
    2118            1 :         memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
    2119            1 :         if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
    2120            0 :             elog(FATAL, "mismatching overwritten LSN %X/%08X -> %X/%08X",
    2121              :                  LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
    2122              :                  LSN_FORMAT_ARGS(record->overwrittenRecPtr));
    2123              : 
    2124              :         /* We have safely skipped the aborted record. */
    2125            1 :         abortedRecPtr = InvalidXLogRecPtr;
    2126            1 :         missingContrecPtr = InvalidXLogRecPtr;
    2127              : 
    2128            1 :         ereport(LOG,
    2129              :                 errmsg("successfully skipped missing contrecord at %X/%08X, overwritten at %s",
    2130              :                        LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
    2131              :                        timestamptz_to_str(xlrec.overwrite_time)));
    2132              : 
    2133              :         /* Verifying the record should only happen once. */
    2134            1 :         record->overwrittenRecPtr = InvalidXLogRecPtr;
    2135              :     }
    2136        44605 :     else if (info == XLOG_BACKUP_END)
    2137              :     {
    2138              :         XLogRecPtr  startpoint;
    2139              : 
    2140           96 :         memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
    2141              : 
    2142           96 :         if (backupStartPoint == startpoint)
    2143              :         {
    2144              :             /*
    2145              :              * We have reached the end of base backup, the point where
    2146              :              * pg_backup_stop() was done.  The data on disk is now consistent
    2147              :              * (assuming we have also reached minRecoveryPoint).  Set
    2148              :              * backupEndPoint to the current LSN, so that the next call to
    2149              :              * CheckRecoveryConsistency() will notice it and do the
    2150              :              * end-of-backup processing.
    2151              :              */
    2152           79 :             elog(DEBUG1, "end of backup record reached");
    2153              : 
    2154           79 :             backupEndPoint = lsn;
    2155              :         }
    2156              :         else
    2157           17 :             elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%08X, waiting for %X/%08X",
    2158              :                  LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint));
    2159              :     }
    2160        44606 : }
    2161              : 
    2162              : /*
    2163              :  * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
    2164              :  * directories.
    2165              :  *
    2166              :  * Replay of database creation XLOG records for databases that were later
    2167              :  * dropped can create fake directories in pg_tblspc.  By the time consistency
    2168              :  * is reached these directories should have been removed; here we verify
    2169              :  * that this did indeed happen.  This is to be called at the point where
    2170              :  * consistent state is reached.
    2171              :  *
    2172              :  * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
    2173              :  * useful for testing purposes, and also allows for an escape hatch in case
    2174              :  * things go south.
                      :  *
                      :  * Only entries whose names consist entirely of decimal digits are examined;
                      :  * anything else in the directory is skipped.  The directory handle obtained
                      :  * from AllocateDir() is released with FreeDir() once the scan completes, so
                      :  * that it is not left open afterwards.
    2175              :  */
    2176              : static void
    2177          124 : CheckTablespaceDirectory(void)
    2178              : {
    2179              :     DIR        *dir;
    2180              :     struct dirent *de;
    2181              : 
    2182          124 :     dir = AllocateDir(PG_TBLSPC_DIR);
    2183          379 :     while ((de = ReadDir(dir, PG_TBLSPC_DIR)) != NULL)
    2184              :     {
    2185              :         char        path[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
    2186              : 
    2187              :         /* Skip entries of non-oid names */
    2188          255 :         if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
    2189          248 :             continue;
    2190              : 
    2191            7 :         snprintf(path, sizeof(path), "%s/%s", PG_TBLSPC_DIR, de->d_name);
    2192              : 
    2193              :         /* Anything that is not a symbolic link is reported. */
    2193            7 :         if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
    2194            4 :             ereport(allow_in_place_tablespaces ? WARNING : PANIC,
    2195              :                     (errcode(ERRCODE_DATA_CORRUPTED),
    2196              :                      errmsg("unexpected directory entry \"%s\" found in %s",
    2197              :                             de->d_name, PG_TBLSPC_DIR),
    2198              :                      errdetail("All directory entries in %s/ should be symbolic links.",
    2199              :                                PG_TBLSPC_DIR),
    2200              :                      errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
    2201              :     }
                      : 
                      :     /* Done scanning; give back the handle obtained from AllocateDir(). */
                      :     FreeDir(dir);
    2202          124 : }
    2203              : 
    2204              : /*
    2205              :  * Checks if recovery has reached a consistent state. When consistency is
    2206              :  * reached and we have a valid starting standby snapshot, tell postmaster
    2207              :  * that it can start accepting read-only connections.
                      :  *
                      :  * Does nothing while minRecoveryPoint is invalid.  Otherwise, in order:
                      :  *
                      :  *  1. If backupEndPoint is valid and not beyond lastReplayedEndRecPtr,
                      :  *     call ReachedEndOfBackup() and clear backupStartPoint, backupEndPoint
                      :  *     and backupEndRequired.
                      :  *  2. If reachedConsistency is not set yet, no backup end is still
                      :  *     required, and minRecoveryPoint is not beyond lastReplayedEndRecPtr,
                      :  *     run XLogCheckInvalidPages() and CheckTablespaceDirectory(), set
                      :  *     reachedConsistency and send PMSIGNAL_RECOVERY_CONSISTENT.
                      :  *  3. If standbyState is STANDBY_SNAPSHOT_READY, LocalHotStandbyActive is
                      :  *     not set, reachedConsistency is set and IsUnderPostmaster is true,
                      :  *     set SharedHotStandbyActive and LocalHotStandbyActive and send
                      :  *     PMSIGNAL_BEGIN_HOT_STANDBY.
                      :  *
                      :  * Each step changes the state its own guard tests, so calling this again
                      :  * does not repeat a step unless that state is changed elsewhere.
    2208              :  */
    2209              : static void
    2210      2797416 : CheckRecoveryConsistency(void)
    2211              : {
    2212              :     XLogRecPtr  lastReplayedEndRecPtr;
    2213              :     TimeLineID  lastReplayedTLI;
    2214              : 
    2215              :     /*
    2216              :      * During crash recovery, we don't reach a consistent state until we've
    2217              :      * replayed all the WAL.
    2218              :      */
    2219      2797416 :     if (!XLogRecPtrIsValid(minRecoveryPoint))
    2220       258601 :         return;
    2221              : 
    2222              :     Assert(InArchiveRecovery);
    2223              : 
    2224              :     /*
    2225              :      * Assume that we are called in the startup process, and hence don't need
    2226              :      * a lock to read lastReplayedEndRecPtr and lastReplayedTLI.
    2227              :      */
    2228      2538815 :     lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
    2229      2538815 :     lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
    2230              : 
    2231              :     /*
    2232              :      * Have we reached the point where our base backup was completed?
    2233              :      */
    2234      2538815 :     if (XLogRecPtrIsValid(backupEndPoint) &&
    2235          114 :         backupEndPoint <= lastReplayedEndRecPtr)
    2236              :     {
    2237           81 :         XLogRecPtr  saveBackupStartPoint = backupStartPoint;
    2238           81 :         XLogRecPtr  saveBackupEndPoint = backupEndPoint;
    2239              : 
    2240           81 :         elog(DEBUG1, "end of backup reached");
    2241              : 
    2242              :         /*
    2243              :          * We have reached the end of base backup, as indicated by pg_control.
    2244              :          * Update the control file accordingly.  The start and end points were
                      :          * saved above because they are reset here but still needed for the
                      :          * log message below.
    2245              :          */
    2246           81 :         ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
    2247           81 :         backupStartPoint = InvalidXLogRecPtr;
    2248           81 :         backupEndPoint = InvalidXLogRecPtr;
    2249           81 :         backupEndRequired = false;
    2250              : 
    2251           81 :         ereport(LOG,
    2252              :                 errmsg("completed backup recovery with redo LSN %X/%08X and end LSN %X/%08X",
    2253              :                        LSN_FORMAT_ARGS(saveBackupStartPoint),
    2254              :                        LSN_FORMAT_ARGS(saveBackupEndPoint)));
    2255              :     }
    2256              : 
    2257              :     /*
    2258              :      * Have we passed our safe starting point? Note that minRecoveryPoint is
    2259              :      * known to be incorrectly set if recovering from a backup, until the
    2260              :      * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
    2261              :      * All we know prior to that is that we're not consistent yet.
    2262              :      */
    2263      2538815 :     if (!reachedConsistency && !backupEndRequired &&
    2264         7692 :         minRecoveryPoint <= lastReplayedEndRecPtr)
    2265              :     {
    2266              :         /*
    2267              :          * Check to see if the XLOG sequence contained any unresolved
    2268              :          * references to uninitialized pages.
    2269              :          */
    2270          124 :         XLogCheckInvalidPages();
    2271              : 
    2272              :         /*
    2273              :          * Check that pg_tblspc doesn't contain any real directories. Replay
    2274              :          * of Database/CREATE_* records may have created fictitious tablespace
    2275              :          * directories that should have been removed by the time consistency
    2276              :          * was reached.
    2277              :          */
    2278          124 :         CheckTablespaceDirectory();
    2279              : 
    2280          124 :         reachedConsistency = true;
    2281          124 :         SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
    2282          124 :         ereport(LOG,
    2283              :                 errmsg("consistent recovery state reached at %X/%08X",
    2284              :                        LSN_FORMAT_ARGS(lastReplayedEndRecPtr)));
    2285              :     }
    2286              : 
    2287              :     /*
    2288              :      * Have we got a valid starting snapshot that will allow queries to be
    2289              :      * run? If so, we can tell postmaster that the database is consistent now,
    2290              :      * enabling connections.  The shared flag is set under info_lck; the
                      :      * local flag keeps this branch from being taken again.
    2291              :      */
    2292      2538815 :     if (standbyState == STANDBY_SNAPSHOT_READY &&
    2293      2538569 :         !LocalHotStandbyActive &&
    2294          115 :         reachedConsistency &&
    2295              :         IsUnderPostmaster)
    2296              :     {
    2297          115 :         SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    2298          115 :         XLogRecoveryCtl->SharedHotStandbyActive = true;
    2299          115 :         SpinLockRelease(&XLogRecoveryCtl->info_lck);
    2300              : 
    2301          115 :         LocalHotStandbyActive = true;
    2302              : 
    2303          115 :         SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
    2304              :     }
    2305              : }
    2306              : 
    2307              : /*
    2308              :  * Error context callback for errors occurring during rm_redo().
                      :  *
                      :  * 'arg' is the XLogReaderState of the record concerned.  The record
                      :  * description (xlog_outdesc) and its block references (xlog_block_info) are
                      :  * built in a temporary StringInfo, added to the error as a context line
                      :  * together with record->ReadRecPtr, and the buffer is then freed.
    2309              :  */
    2310              : static void
    2311          156 : rm_redo_error_callback(void *arg)
    2312              : {
    2313          156 :     XLogReaderState *record = (XLogReaderState *) arg;
    2314              :     StringInfoData buf;
    2315              : 
    2316          156 :     initStringInfo(&buf);
    2317          156 :     xlog_outdesc(&buf, record);
    2318          156 :     xlog_block_info(&buf, record);
    2319              : 
    2320              :     /* translator: %s is a WAL record description */
    2321          156 :     errcontext("WAL redo at %X/%08X for %s",
    2322          156 :                LSN_FORMAT_ARGS(record->ReadRecPtr),
    2323              :                buf.data);
    2324              : 
    2325          156 :     pfree(buf.data);
    2326          156 : }
    2327              : 
    2328              : /*
    2329              :  * Appends to 'buf' a string describing an XLogRecord: the resource manager's
    2330              :  * name, a slash, the record's identity, a colon and a space, and then
                      :  * whatever further description the resource manager's rm_desc callback
                      :  * adds.
                      :  *
                      :  * If rm_identify returns NULL for the record's info byte, the identity is
                      :  * printed as "UNKNOWN (<hex>)", where <hex> is the info byte with the
                      :  * XLR_INFO_MASK bits cleared.
                      :  *
                      :  * Nothing is returned; 'buf' must already be an initialized StringInfo.
    2331              :  */
    2332              : void
    2333          156 : xlog_outdesc(StringInfo buf, XLogReaderState *record)
    2334              : {
    2335          156 :     RmgrData    rmgr = GetRmgr(XLogRecGetRmid(record));
    2336          156 :     uint8       info = XLogRecGetInfo(record);
    2337              :     const char *id;
    2338              : 
    2339          156 :     appendStringInfoString(buf, rmgr.rm_name);
    2340          156 :     appendStringInfoChar(buf, '/');
    2341              : 
    2342          156 :     id = rmgr.rm_identify(info);
    2343          156 :     if (id == NULL)
    2344            0 :         appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
    2345              :     else
    2346          156 :         appendStringInfo(buf, "%s: ", id);
    2347              : 
    2348          156 :     rmgr.rm_desc(buf, record);
    2349          156 : }
    2350              : 
    2351              : #ifdef WAL_DEBUG
    2352              : /*
                      :  * Appends to 'buf' the record's "prev" pointer, transaction id and data
                      :  * length, followed by its block references (see xlog_block_info).
                      :  *
                      :  * Only compiled when WAL_DEBUG is defined.
                      :  */
    2353              : static void
    2354              : xlog_outrec(StringInfo buf, XLogReaderState *record)
    2355              : {
    2356              :     appendStringInfo(buf, "prev %X/%08X; xid %u",
    2357              :                      LSN_FORMAT_ARGS(XLogRecGetPrev(record)),
    2358              :                      XLogRecGetXid(record));
    2359              : 
    2360              :     appendStringInfo(buf, "; len %u",
    2361              :                      XLogRecGetDataLen(record));
    2362              : 
    2363              :     xlog_block_info(buf, record);
    2364              : }
    2365              : #endif                          /* WAL_DEBUG */
    2366              : 
    2367              : /*
    2368              :  * Appends to 'buf' information about all the blocks referenced by an
    2369              :  * XLogRecord.
                      :  *
                      :  * For every block id from 0 through XLogRecMaxBlockId(record) that has a
                      :  * block tag, one "; blkref #N: rel spc/db/rel, blk B" item is appended.  The
                      :  * fork number is included only when it is not MAIN_FORKNUM, and " FPW" is
                      :  * added when the record has a block image for that id.  Ids for which
                      :  * XLogRecGetBlockTagExtended() returns false are skipped.
    2370              :  */
    2371              : static void
    2372          156 : xlog_block_info(StringInfo buf, XLogReaderState *record)
    2373              : {
    2374              :     int         block_id;
    2375              : 
    2376              :     /* decode block references */
    2377          209 :     for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
    2378              :     {
    2379              :         RelFileLocator rlocator;
    2380              :         ForkNumber  forknum;
    2381              :         BlockNumber blk;
    2382              : 
    2383           53 :         if (!XLogRecGetBlockTagExtended(record, block_id,
    2384              :                                         &rlocator, &forknum, &blk, NULL))
    2385            0 :             continue;           /* no block reference with this id */
    2386              : 
    2387           53 :         if (forknum != MAIN_FORKNUM)
    2388            5 :             appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
    2389              :                              block_id,
    2390              :                              rlocator.spcOid, rlocator.dbOid,
    2391              :                              rlocator.relNumber,
    2392              :                              forknum,
    2393              :                              blk);
    2394              :         else
    2395           48 :             appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
    2396              :                              block_id,
    2397              :                              rlocator.spcOid, rlocator.dbOid,
    2398              :                              rlocator.relNumber,
    2399              :                              blk);
    2400           53 :         if (XLogRecHasBlockImage(record, block_id))
    2401           34 :             appendStringInfoString(buf, " FPW");
    2402              :     }
    2403          156 : }
    2404              : 
    2405              : 
    2406              : /*
    2407              :  * Check that it's OK to switch to new timeline during recovery.
    2408              :  *
    2409              :  * 'lsn' is the address of the shutdown checkpoint record we're about to
    2410              :  * replay. (Currently, timeline can only change at a shutdown checkpoint).
                      :  *
                      :  * 'newTLI' and 'prevTLI' are the new and previous timeline IDs taken from
                      :  * that checkpoint record, and 'replayTLI' is the timeline currently being
                      :  * replayed.
                      :  *
                      :  * Three conditions are verified, and any violation is reported with PANIC:
                      :  * prevTLI must equal replayTLI; newTLI must not be lower than replayTLI and
                      :  * must appear in expectedTLEs; and, if minRecoveryPoint is valid and 'lsn'
                      :  * is still before it, newTLI must not exceed minRecoveryPointTLI.  The
                      :  * function returns normally only when all of them hold.
    2411              :  */
    2412              : static void
    2413           12 : checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI,
    2414              :                     TimeLineID replayTLI)
    2415              : {
    2416              :     /* Check that the record agrees on what the current (old) timeline is */
    2417           12 :     if (prevTLI != replayTLI)
    2418            0 :         ereport(PANIC,
    2419              :                 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
    2420              :                         prevTLI, replayTLI)));
    2421              : 
    2422              :     /*
    2423              :      * The new timeline better be in the list of timelines we expect to see,
    2424              :      * according to the timeline history. It should also not decrease.
    2425              :      */
    2426           12 :     if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
    2427            0 :         ereport(PANIC,
    2428              :                 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
    2429              :                         newTLI, replayTLI)));
    2430              : 
    2431              :     /*
    2432              :      * If we have not yet reached min recovery point, and we're about to
    2433              :      * switch to a timeline greater than the timeline of the min recovery
    2434              :      * point: trouble. After switching to the new timeline, we could not
    2435              :      * possibly visit the min recovery point on the correct timeline anymore.
    2436              :      * This can happen if there is a newer timeline in the archive that
    2437              :      * branched before the timeline the min recovery point is on, and you
    2438              :      * attempt to do PITR to the new timeline.
    2439              :      */
    2440           12 :     if (XLogRecPtrIsValid(minRecoveryPoint) &&
    2441           10 :         lsn < minRecoveryPoint &&
    2442            1 :         newTLI > minRecoveryPointTLI)
    2443            0 :         ereport(PANIC,
    2444              :                 errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%08X on timeline %u",
    2445              :                        newTLI,
    2446              :                        LSN_FORMAT_ARGS(minRecoveryPoint),
    2447              :                        minRecoveryPointTLI));
    2448              : 
    2449              :     /* Looks good */
    2450           12 : }
    2451              : 
    2452              : 
    2453              : /*
    2454              :  * Extract timestamp from WAL record.
    2455              :  *
    2456              :  * If the record contains a timestamp, returns true, and saves the timestamp
    2457              :  * in *recordXtime. If the record type has no timestamp, returns false and
                      :  * leaves *recordXtime untouched.
                      :  *
    2458              :  * Currently, only these record types are recognized as carrying one:
                      :  *
                      :  *  - RM_XLOG_ID / XLOG_RESTORE_POINT (rp_time)
                      :  *  - RM_XACT_ID / XLOG_XACT_COMMIT and XLOG_XACT_COMMIT_PREPARED (xact_time)
                      :  *  - RM_XACT_ID / XLOG_XACT_ABORT and XLOG_XACT_ABORT_PREPARED (xact_time)
                      :  *
                      :  * For RM_XACT_ID records the info byte is additionally masked with
    2459              :  * XLOG_XACT_OPMASK before it is compared against the opcodes above.
    2460              :  */
    2461              : static bool
    2462        44829 : getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
    2463              : {
    2464        44829 :     uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    2465        44829 :     uint8       xact_info = info & XLOG_XACT_OPMASK;
    2466        44829 :     uint8       rmid = XLogRecGetRmid(record);
    2467              : 
    2468        44829 :     if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
    2469              :     {
    2470            2 :         *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
    2471            2 :         return true;
    2472              :     }
    2473        44827 :     if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
    2474              :                                xact_info == XLOG_XACT_COMMIT_PREPARED))
    2475              :     {
    2476        41085 :         *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
    2477        41085 :         return true;
    2478              :     }
    2479         3742 :     if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
    2480              :                                xact_info == XLOG_XACT_ABORT_PREPARED))
    2481              :     {
    2482         3742 :         *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
    2483         3742 :         return true;
    2484              :     }
    2485            0 :     return false;
    2486              : }
    2487              : 
    2488              : /*
    2489              :  * Checks whether the current buffer page and backup page stored in the
    2490              :  * WAL record are consistent or not. Before comparing the two pages, a
    2491              :  * masking can be applied to the pages to ignore certain areas like hint bits,
    2492              :  * unused space between pd_lower and pd_upper among other things. This
    2493              :  * function should be called once WAL replay has been completed for a
    2494              :  * given record.
                      :  *
                      :  * Returns immediately if the record has no block references.  Otherwise
                      :  * each block id is examined in turn, and a block is skipped when:
                      :  *
                      :  *  - the record has no block reference with that id;
                      :  *  - XLogRecBlockImageApply() is true for it, i.e. the image was already
                      :  *    applied during redo;
                      :  *  - XLogReadBufferExtended() does not return a valid buffer; or
                      :  *  - the LSN of the replayed page is greater than record->EndRecPtr.
                      :  *
                      :  * For the remaining blocks the replayed page is copied into
                      :  * replay_image_masked, the image from the record is restored into
                      :  * primary_image_masked, both are passed through the resource manager's
                      :  * rm_mask callback if it has one, and the two BLCKSZ-sized images are
                      :  * compared with memcmp().  A failure to restore the image raises ERROR; a
                      :  * mismatch between the images raises FATAL.
    2495              :  */
    2496              : static void
    2497      2211560 : verifyBackupPageConsistency(XLogReaderState *record)
    2498              : {
    2499      2211560 :     RmgrData    rmgr = GetRmgr(XLogRecGetRmid(record));
    2500              :     RelFileLocator rlocator;
    2501              :     ForkNumber  forknum;
    2502              :     BlockNumber blkno;
    2503              :     int         block_id;
    2504              : 
    2505              :     /* Records with no block references have no need for consistency checks. */
    2506      2211560 :     if (!XLogRecHasAnyBlockRefs(record))
    2507           79 :         return;
    2508              : 
    2509              :     Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
    2510              : 
    2511      4593138 :     for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
    2512              :     {
    2513              :         Buffer      buf;
    2514              :         Page        page;
    2515              : 
    2516      2381657 :         if (!XLogRecGetBlockTagExtended(record, block_id,
    2517              :                                         &rlocator, &forknum, &blkno, NULL))
    2518              :         {
    2519              :             /*
    2520              :              * WAL record doesn't contain a block reference with the given id.
    2521              :              * Do nothing.
    2522              :              */
    2523         2102 :             continue;
    2524              :         }
    2525              : 
    2526              :         Assert(XLogRecHasBlockImage(record, block_id));
    2527              : 
    2528      2379555 :         if (XLogRecBlockImageApply(record, block_id))
    2529              :         {
    2530              :             /*
    2531              :              * WAL record has already applied the page, so bypass the
    2532              :              * consistency check as that would result in comparing the full
    2533              :              * page stored in the record with itself.
    2534              :              */
    2535        27512 :             continue;
    2536              :         }
    2537              : 
    2538              :         /*
    2539              :          * Read the contents from the current buffer and store it in a
    2540              :          * temporary page.
    2541              :          */
    2542      2352043 :         buf = XLogReadBufferExtended(rlocator, forknum, blkno,
    2543              :                                      RBM_NORMAL_NO_LOG,
    2544              :                                      InvalidBuffer);
    2545      2352043 :         if (!BufferIsValid(buf))
    2546            0 :             continue;           /* no page to compare against; skip */
    2547              : 
    2548      2352043 :         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
    2549      2352043 :         page = BufferGetPage(buf);
    2550              : 
    2551              :         /*
    2552              :          * Take a copy of the local page where WAL has been applied to have a
    2553              :          * comparison base before masking it...
    2554              :          */
    2555      2352043 :         memcpy(replay_image_masked, page, BLCKSZ);
    2556              : 
    2557              :         /* No need for this page anymore now that a copy is in. */
    2558      2352043 :         UnlockReleaseBuffer(buf);
    2559              : 
    2560              :         /*
    2561              :          * If the block LSN is already ahead of this WAL record, we can't
    2562              :          * expect contents to match.  This can happen if recovery is
    2563              :          * restarted.
    2564              :          */
    2565      2352043 :         if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
    2566            0 :             continue;
    2567              : 
    2568              :         /*
    2569              :          * Read the contents from the backup copy, stored in WAL record and
    2570              :          * store it in a temporary page. There is no need to allocate a new
    2571              :          * page here, a local buffer is fine to hold its contents and a mask
    2572              :          * can be directly applied on it.
    2573              :          */
    2574      2352043 :         if (!RestoreBlockImage(record, block_id, primary_image_masked))
    2575            0 :             ereport(ERROR,
    2576              :                     (errcode(ERRCODE_INTERNAL_ERROR),
    2577              :                      errmsg_internal("%s", record->errormsg_buf)));
    2578              : 
    2579              :         /*
    2580              :          * If a masking function is defined, mask both the primary and replay
    2581              :          * images.
    2582              :          */
    2583      2352043 :         if (rmgr.rm_mask != NULL)
    2584              :         {
    2585      2352043 :             rmgr.rm_mask(replay_image_masked, blkno);
    2586      2352043 :             rmgr.rm_mask(primary_image_masked, blkno);
    2587              :         }
    2588              : 
    2589              :         /* Time to compare the primary and replay images. */
    2590      2352043 :         if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
    2591              :         {
    2592            0 :             elog(FATAL,
    2593              :                  "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
    2594              :                  rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
    2595              :                  forknum, blkno);
    2596              :         }
    2597              :     }
    2598              : }
    2599              : 
    2600              : /*
    2601              :  * For point-in-time recovery, this function decides whether we want to
    2602              :  * stop applying the XLOG before the current record.
    2603              :  *
    2604              :  * Returns true if we are stopping, false otherwise. If stopping, some
    2605              :  * information is saved in recoveryStopXid et al for use in annotating the
    2606              :  * new timeline's history file.
    2607              :  */
    2608              : static bool
    2609      2797193 : recoveryStopsBefore(XLogReaderState *record)
    2610              : {
    2611      2797193 :     bool        stopsHere = false;
    2612              :     uint8       xact_info;
    2613              :     bool        isCommit;
    2614      2797193 :     TimestampTz recordXtime = 0;
    2615              :     TransactionId recordXid;
    2616              : 
    2617              :     /*
    2618              :      * Ignore recovery target settings when not in archive recovery (meaning
    2619              :      * we are in crash recovery).
    2620              :      */
    2621      2797193 :     if (!ArchiveRecoveryRequested)
    2622       243894 :         return false;
    2623              : 
    2624              :     /* Check if we should stop as soon as reaching consistency */
    2625      2553299 :     if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
    2626              :     {
    2627            0 :         ereport(LOG,
    2628              :                 (errmsg("recovery stopping after reaching consistency")));
    2629              : 
    2630            0 :         recoveryStopAfter = false;
    2631            0 :         recoveryStopXid = InvalidTransactionId;
    2632            0 :         recoveryStopLSN = InvalidXLogRecPtr;
    2633            0 :         recoveryStopTime = 0;
    2634            0 :         recoveryStopName[0] = '\0';
    2635            0 :         return true;
    2636              :     }
    2637              : 
    2638              :     /* Check if target LSN has been reached */
    2639      2553299 :     if (recoveryTarget == RECOVERY_TARGET_LSN &&
    2640         8512 :         !recoveryTargetInclusive &&
    2641          482 :         record->ReadRecPtr >= recoveryTargetLSN)
    2642              :     {
    2643            2 :         recoveryStopAfter = false;
    2644            2 :         recoveryStopXid = InvalidTransactionId;
    2645            2 :         recoveryStopLSN = record->ReadRecPtr;
    2646            2 :         recoveryStopTime = 0;
    2647            2 :         recoveryStopName[0] = '\0';
    2648            2 :         ereport(LOG,
    2649              :                 errmsg("recovery stopping before WAL location (LSN) \"%X/%08X\"",
    2650              :                        LSN_FORMAT_ARGS(recoveryStopLSN)));
    2651            2 :         return true;
    2652              :     }
    2653              : 
    2654              :     /* Otherwise we only consider stopping before COMMIT or ABORT records. */
    2655      2553297 :     if (XLogRecGetRmid(record) != RM_XACT_ID)
    2656      2530594 :         return false;
    2657              : 
    2658        22703 :     xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
    2659              : 
    2660        22703 :     if (xact_info == XLOG_XACT_COMMIT)
    2661              :     {
    2662        20500 :         isCommit = true;
    2663        20500 :         recordXid = XLogRecGetXid(record);
    2664              :     }
    2665         2203 :     else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
    2666              :     {
    2667           29 :         xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
    2668              :         xl_xact_parsed_commit parsed;
    2669              : 
    2670           29 :         isCommit = true;
    2671           29 :         ParseCommitRecord(XLogRecGetInfo(record),
    2672              :                           xlrec,
    2673              :                           &parsed);
    2674           29 :         recordXid = parsed.twophase_xid;
    2675              :     }
    2676         2174 :     else if (xact_info == XLOG_XACT_ABORT)
    2677              :     {
    2678         1858 :         isCommit = false;
    2679         1858 :         recordXid = XLogRecGetXid(record);
    2680              :     }
    2681          316 :     else if (xact_info == XLOG_XACT_ABORT_PREPARED)
    2682              :     {
    2683           13 :         xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
    2684              :         xl_xact_parsed_abort parsed;
    2685              : 
    2686           13 :         isCommit = false;
    2687           13 :         ParseAbortRecord(XLogRecGetInfo(record),
    2688              :                          xlrec,
    2689              :                          &parsed);
    2690           13 :         recordXid = parsed.twophase_xid;
    2691              :     }
    2692              :     else
    2693          303 :         return false;
    2694              : 
    2695        22400 :     if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
    2696              :     {
    2697              :         /*
    2698              :          * There can be only one transaction end record with this exact
    2699              :          * transaction ID.
    2700              :          *
    2701              :          * When testing for an XID, we MUST test for equality only, since
    2702              :          * transactions are numbered in the order they start, not the order
    2703              :          * they complete.  A higher-numbered XID will complete before a
    2704              :          * lower-numbered one about 50% of the time.
    2705              :          */
    2706            0 :         stopsHere = (recordXid == recoveryTargetXid);
    2707              :     }
    2708              : 
    2709              :     /*
    2710              :      * Note: we must fetch recordXtime regardless of recoveryTarget setting.
    2711              :      * We don't expect getRecordTimestamp ever to fail, since we already know
    2712              :      * this is a commit or abort record; but test its result anyway.
    2713              :      */
    2714        22400 :     if (getRecordTimestamp(record, &recordXtime) &&
    2715        22400 :         recoveryTarget == RECOVERY_TARGET_TIME)
    2716              :     {
    2717              :         /*
    2718              :          * There can be many transactions that share the same commit time, so
    2719              :          * we stop after the last one if we are inclusive, or before the first
    2720              :          * one if we are exclusive.
    2721              :          */
    2722            0 :         if (recoveryTargetInclusive)
    2723            0 :             stopsHere = (recordXtime > recoveryTargetTime);
    2724              :         else
    2725            0 :             stopsHere = (recordXtime >= recoveryTargetTime);
    2726              :     }
    2727              : 
    2728        22400 :     if (stopsHere)
    2729              :     {
    2730            0 :         recoveryStopAfter = false;
    2731            0 :         recoveryStopXid = recordXid;
    2732            0 :         recoveryStopTime = recordXtime;
    2733            0 :         recoveryStopLSN = InvalidXLogRecPtr;
    2734            0 :         recoveryStopName[0] = '\0';
    2735              : 
    2736            0 :         if (isCommit)
    2737              :         {
    2738            0 :             ereport(LOG,
    2739              :                     (errmsg("recovery stopping before commit of transaction %u, time %s",
    2740              :                             recoveryStopXid,
    2741              :                             timestamptz_to_str(recoveryStopTime))));
    2742              :         }
    2743              :         else
    2744              :         {
    2745            0 :             ereport(LOG,
    2746              :                     (errmsg("recovery stopping before abort of transaction %u, time %s",
    2747              :                             recoveryStopXid,
    2748              :                             timestamptz_to_str(recoveryStopTime))));
    2749              :         }
    2750              :     }
    2751              : 
    2752        22400 :     return stopsHere;
    2753              : }
    2754              : 
    2755              : /*
    2756              :  * Same as recoveryStopsBefore, but called after applying the record.
    2757              :  *
    2758              :  * We also track the timestamp of the latest applied COMMIT/ABORT
    2759              :  * record in XLogRecoveryCtl->recoveryLastXTime.
    2760              :  */
    2761              : static bool
    2762      2797189 : recoveryStopsAfter(XLogReaderState *record)
    2763              : {
    2764              :     uint8       info;
    2765              :     uint8       xact_info;
    2766              :     uint8       rmid;
    2767      2797189 :     TimestampTz recordXtime = 0;
    2768              : 
    2769              :     /*
    2770              :      * Ignore recovery target settings when not in archive recovery (meaning
    2771              :      * we are in crash recovery).
    2772              :      */
    2773      2797189 :     if (!ArchiveRecoveryRequested)
    2774       243894 :         return false;
    2775              : 
    2776      2553295 :     info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    2777      2553295 :     rmid = XLogRecGetRmid(record);
    2778              : 
    2779              :     /*
    2780              :      * There can be many restore points that share the same name; we stop at
    2781              :      * the first one.
    2782              :      */
    2783      2553295 :     if (recoveryTarget == RECOVERY_TARGET_NAME &&
    2784           20 :         rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
    2785              :     {
    2786              :         xl_restore_point *recordRestorePointData;
    2787              : 
    2788            3 :         recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
    2789              : 
    2790            3 :         if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
    2791              :         {
    2792            2 :             recoveryStopAfter = true;
    2793            2 :             recoveryStopXid = InvalidTransactionId;
    2794            2 :             recoveryStopLSN = InvalidXLogRecPtr;
    2795            2 :             (void) getRecordTimestamp(record, &recoveryStopTime);
    2796            2 :             strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
    2797              : 
    2798            2 :             ereport(LOG,
    2799              :                     (errmsg("recovery stopping at restore point \"%s\", time %s",
    2800              :                             recoveryStopName,
    2801              :                             timestamptz_to_str(recoveryStopTime))));
    2802            2 :             return true;
    2803              :         }
    2804              :     }
    2805              : 
    2806              :     /* Check if the target LSN has been reached */
    2807      2553293 :     if (recoveryTarget == RECOVERY_TARGET_LSN &&
    2808         8030 :         recoveryTargetInclusive &&
    2809         8030 :         record->ReadRecPtr >= recoveryTargetLSN)
    2810              :     {
    2811            3 :         recoveryStopAfter = true;
    2812            3 :         recoveryStopXid = InvalidTransactionId;
    2813            3 :         recoveryStopLSN = record->ReadRecPtr;
    2814            3 :         recoveryStopTime = 0;
    2815            3 :         recoveryStopName[0] = '\0';
    2816            3 :         ereport(LOG,
    2817              :                 errmsg("recovery stopping after WAL location (LSN) \"%X/%08X\"",
    2818              :                        LSN_FORMAT_ARGS(recoveryStopLSN)));
    2819            3 :         return true;
    2820              :     }
    2821              : 
    2822      2553290 :     if (rmid != RM_XACT_ID)
    2823      2530589 :         return false;
    2824              : 
    2825        22701 :     xact_info = info & XLOG_XACT_OPMASK;
    2826              : 
    2827        22701 :     if (xact_info == XLOG_XACT_COMMIT ||
    2828         2174 :         xact_info == XLOG_XACT_COMMIT_PREPARED ||
    2829          316 :         xact_info == XLOG_XACT_ABORT ||
    2830              :         xact_info == XLOG_XACT_ABORT_PREPARED)
    2831              :     {
    2832              :         TransactionId recordXid;
    2833              : 
    2834              :         /* Update the last applied transaction timestamp */
    2835        22398 :         if (getRecordTimestamp(record, &recordXtime))
    2836        22398 :             SetLatestXTime(recordXtime);
    2837              : 
    2838              :         /* Extract the XID of the committed/aborted transaction */
    2839        22398 :         if (xact_info == XLOG_XACT_COMMIT_PREPARED)
    2840              :         {
    2841           29 :             xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
    2842              :             xl_xact_parsed_commit parsed;
    2843              : 
    2844           29 :             ParseCommitRecord(XLogRecGetInfo(record),
    2845              :                               xlrec,
    2846              :                               &parsed);
    2847           29 :             recordXid = parsed.twophase_xid;
    2848              :         }
    2849        22369 :         else if (xact_info == XLOG_XACT_ABORT_PREPARED)
    2850              :         {
    2851           13 :             xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
    2852              :             xl_xact_parsed_abort parsed;
    2853              : 
    2854           13 :             ParseAbortRecord(XLogRecGetInfo(record),
    2855              :                              xlrec,
    2856              :                              &parsed);
    2857           13 :             recordXid = parsed.twophase_xid;
    2858              :         }
    2859              :         else
    2860        22356 :             recordXid = XLogRecGetXid(record);
    2861              : 
    2862              :         /*
    2863              :          * There can be only one transaction end record with this exact
    2864              :          * transaction ID.
    2865              :          *
    2866              :          * When testing for an XID, we MUST test for equality only, since
    2867              :          * transactions are numbered in the order they start, not the order
    2868              :          * they complete.  A higher-numbered XID will complete before a
    2869              :          * lower-numbered one about 50% of the time.
    2870              :          */
    2871        22398 :         if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
    2872            0 :             recordXid == recoveryTargetXid)
    2873              :         {
    2874            0 :             recoveryStopAfter = true;
    2875            0 :             recoveryStopXid = recordXid;
    2876            0 :             recoveryStopTime = recordXtime;
    2877            0 :             recoveryStopLSN = InvalidXLogRecPtr;
    2878            0 :             recoveryStopName[0] = '\0';
    2879              : 
    2880            0 :             if (xact_info == XLOG_XACT_COMMIT ||
    2881              :                 xact_info == XLOG_XACT_COMMIT_PREPARED)
    2882              :             {
    2883            0 :                 ereport(LOG,
    2884              :                         (errmsg("recovery stopping after commit of transaction %u, time %s",
    2885              :                                 recoveryStopXid,
    2886              :                                 timestamptz_to_str(recoveryStopTime))));
    2887              :             }
    2888            0 :             else if (xact_info == XLOG_XACT_ABORT ||
    2889              :                      xact_info == XLOG_XACT_ABORT_PREPARED)
    2890              :             {
    2891            0 :                 ereport(LOG,
    2892              :                         (errmsg("recovery stopping after abort of transaction %u, time %s",
    2893              :                                 recoveryStopXid,
    2894              :                                 timestamptz_to_str(recoveryStopTime))));
    2895              :             }
    2896            0 :             return true;
    2897              :         }
    2898              :     }
    2899              : 
    2900              :     /* Check if we should stop as soon as reaching consistency */
    2901        22701 :     if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
    2902              :     {
    2903            0 :         ereport(LOG,
    2904              :                 (errmsg("recovery stopping after reaching consistency")));
    2905              : 
    2906            0 :         recoveryStopAfter = true;
    2907            0 :         recoveryStopXid = InvalidTransactionId;
    2908            0 :         recoveryStopTime = 0;
    2909            0 :         recoveryStopLSN = InvalidXLogRecPtr;
    2910            0 :         recoveryStopName[0] = '\0';
    2911            0 :         return true;
    2912              :     }
    2913              : 
    2914        22701 :     return false;
    2915              : }
    2916              : 
    2917              : /*
    2918              :  * Create a comment for the history file to explain why and where
    2919              :  * timeline changed.
    2920              :  */
    2921              : static char *
    2922          941 : getRecoveryStopReason(void)
    2923              : {
    2924              :     char        reason[200];
    2925              : 
    2926          941 :     if (recoveryTarget == RECOVERY_TARGET_XID)
    2927            0 :         snprintf(reason, sizeof(reason),
    2928              :                  "%s transaction %u",
    2929            0 :                  recoveryStopAfter ? "after" : "before",
    2930              :                  recoveryStopXid);
    2931          941 :     else if (recoveryTarget == RECOVERY_TARGET_TIME)
    2932            0 :         snprintf(reason, sizeof(reason),
    2933              :                  "%s %s\n",
    2934            0 :                  recoveryStopAfter ? "after" : "before",
    2935              :                  timestamptz_to_str(recoveryStopTime));
    2936          941 :     else if (recoveryTarget == RECOVERY_TARGET_LSN)
    2937            6 :         snprintf(reason, sizeof(reason),
    2938              :                  "%s LSN %X/%08X\n",
    2939            6 :                  recoveryStopAfter ? "after" : "before",
    2940            6 :                  LSN_FORMAT_ARGS(recoveryStopLSN));
    2941          935 :     else if (recoveryTarget == RECOVERY_TARGET_NAME)
    2942            3 :         snprintf(reason, sizeof(reason),
    2943              :                  "at restore point \"%s\"",
    2944              :                  recoveryStopName);
    2945          932 :     else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
    2946            0 :         snprintf(reason, sizeof(reason), "reached consistency");
    2947              :     else
    2948          932 :         snprintf(reason, sizeof(reason), "no recovery target specified");
    2949              : 
    2950          941 :     return pstrdup(reason);
    2951              : }
    2952              : 
    2953              : /*
    2954              :  * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
    2955              :  *
    2956              :  * endOfRecovery is true if the recovery target is reached and
    2957              :  * the paused state starts at the end of recovery because of
    2958              :  * recovery_target_action=pause, and false otherwise.
    2959              :  */
    2960              : static void
    2961            4 : recoveryPausesHere(bool endOfRecovery)
    2962              : {
    2963              :     /* Don't pause unless users can connect! */
    2964            4 :     if (!LocalHotStandbyActive)
    2965            0 :         return;
    2966              : 
    2967              :     /* Don't pause after standby promotion has been triggered */
    2968            4 :     if (LocalPromoteIsTriggered)
    2969            0 :         return;
    2970              : 
    2971            4 :     if (endOfRecovery)
    2972            1 :         ereport(LOG,
    2973              :                 (errmsg("pausing at the end of recovery"),
    2974              :                  errhint("Execute pg_wal_replay_resume() to promote.")));
    2975              :     else
    2976            3 :         ereport(LOG,
    2977              :                 (errmsg("recovery has paused"),
    2978              :                  errhint("Execute pg_wal_replay_resume() to continue.")));
    2979              : 
    2980              :     /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
    2981           12 :     while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
    2982              :     {
    2983           10 :         ProcessStartupProcInterrupts();
    2984           10 :         if (CheckForStandbyTrigger())
    2985            2 :             return;
    2986              : 
    2987              :         /*
    2988              :          * If recovery pause is requested then set it paused.  While we are in
    2989              :          * the loop, user might resume and pause again so set this every time.
    2990              :          */
    2991            8 :         ConfirmRecoveryPaused();
    2992              : 
    2993              :         /*
    2994              :          * We wait on a condition variable that will wake us as soon as the
    2995              :          * pause ends, but we use a timeout so we can check the above exit
    2996              :          * condition periodically too.
    2997              :          */
    2998            8 :         ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
    2999              :                                     WAIT_EVENT_RECOVERY_PAUSE);
    3000              :     }
    3001            2 :     ConditionVariableCancelSleep();
    3002              : }
    3003              : 
    3004              : /*
    3005              :  * When recovery_min_apply_delay is set, we wait long enough to make sure
    3006              :  * certain record types are applied at least that interval behind the primary.
    3007              :  *
    3008              :  * Returns true if we waited.
    3009              :  *
    3010              :  * Note that the delay is calculated between the WAL record log time and
    3011              :  * the current time on standby. We would prefer to keep track of when this
    3012              :  * standby received each WAL record, which would allow a more consistent
    3013              :  * approach and one not affected by time synchronisation issues, but that
    3014              :  * is significantly more effort and complexity for little actual gain in
    3015              :  * usability.
    3016              :  */
    3017              : static bool
    3018      2797191 : recoveryApplyDelay(XLogReaderState *record)
    3019              : {
    3020              :     uint8       xact_info;
    3021              :     TimestampTz xtime;
    3022              :     TimestampTz delayUntil;
    3023              :     long        msecs;
    3024              : 
    3025              :     /* nothing to do if no delay configured */
    3026      2797191 :     if (recovery_min_apply_delay <= 0)
    3027      2797049 :         return false;
    3028              : 
    3029              :     /* no delay is applied on a database not yet consistent */
    3030          142 :     if (!reachedConsistency)
    3031            4 :         return false;
    3032              : 
    3033              :     /* nothing to do if crash recovery is requested */
    3034          138 :     if (!ArchiveRecoveryRequested)
    3035            0 :         return false;
    3036              : 
    3037              :     /*
    3038              :      * Is it a COMMIT record?
    3039              :      *
    3040              :      * We deliberately choose not to delay aborts since they have no effect on
    3041              :      * MVCC. We already allow replay of records that don't have a timestamp,
    3042              :      * so there is already opportunity for issues caused by early conflicts on
    3043              :      * standbys.
    3044              :      */
    3045          138 :     if (XLogRecGetRmid(record) != RM_XACT_ID)
    3046          109 :         return false;
    3047              : 
    3048           29 :     xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
    3049              : 
    3050           29 :     if (xact_info != XLOG_XACT_COMMIT &&
    3051              :         xact_info != XLOG_XACT_COMMIT_PREPARED)
    3052            0 :         return false;
    3053              : 
    3054           29 :     if (!getRecordTimestamp(record, &xtime))
    3055            0 :         return false;
    3056              : 
    3057           29 :     delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
    3058              : 
    3059              :     /*
    3060              :      * Exit without arming the latch if it's already past time to apply this
    3061              :      * record
    3062              :      */
    3063           29 :     msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil);
    3064           29 :     if (msecs <= 0)
    3065            1 :         return false;
    3066              : 
    3067              :     while (true)
    3068              :     {
    3069           75 :         ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
    3070              : 
    3071              :         /* This might change recovery_min_apply_delay. */
    3072           75 :         ProcessStartupProcInterrupts();
    3073              : 
    3074           75 :         if (CheckForStandbyTrigger())
    3075            0 :             break;
    3076              : 
    3077              :         /*
    3078              :          * Recalculate delayUntil as recovery_min_apply_delay could have
    3079              :          * changed while waiting in this loop.
    3080              :          */
    3081           75 :         delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
    3082              : 
    3083              :         /*
    3084              :          * Wait for difference between GetCurrentTimestamp() and delayUntil.
    3085              :          */
    3086           75 :         msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
    3087              :                                                 delayUntil);
    3088              : 
    3089           75 :         if (msecs <= 0)
    3090           28 :             break;
    3091              : 
    3092           47 :         elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
    3093              : 
    3094           47 :         (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
    3095              :                          WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
    3096              :                          msecs,
    3097              :                          WAIT_EVENT_RECOVERY_APPLY_DELAY);
    3098              :     }
    3099           28 :     return true;
    3100              : }
    3101              : 
    3102              : /*
    3103              :  * Get the current state of the recovery pause request.
    3104              :  */
    3105              : RecoveryPauseState
    3106           18 : GetRecoveryPauseState(void)
    3107              : {
    3108              :     RecoveryPauseState state;
    3109              : 
    3110           18 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    3111           18 :     state = XLogRecoveryCtl->recoveryPauseState;
    3112           18 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    3113              : 
    3114           18 :     return state;
    3115              : }
    3116              : 
    3117              : /*
    3118              :  * Set the recovery pause state.
    3119              :  *
    3120              :  * If a recovery pause is requested, sets the recovery pause state to
    3121              :  * 'pause requested' if it is not already 'paused'.  Otherwise, sets it
    3122              :  * to 'not paused' to resume the recovery.  The recovery pause will be
    3123              :  * confirmed by ConfirmRecoveryPaused().
    3124              :  */
    3125              : void
    3126           56 : SetRecoveryPause(bool recoveryPause)
    3127              : {
    3128           56 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    3129              : 
    3130           56 :     if (!recoveryPause)
    3131           51 :         XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
    3132            5 :     else if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_NOT_PAUSED)
    3133            5 :         XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED;
    3134              : 
    3135           56 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    3136              : 
    3137           56 :     if (!recoveryPause)
    3138           51 :         ConditionVariableBroadcast(&XLogRecoveryCtl->recoveryNotPausedCV);
    3139           56 : }
    3140              : 
    3141              : /*
    3142              :  * Confirm the recovery pause by setting the recovery pause state to
    3143              :  * RECOVERY_PAUSED.
    3144              :  */
    3145              : static void
    3146            8 : ConfirmRecoveryPaused(void)
    3147              : {
    3148              :     /* If recovery pause is requested then set it paused */
    3149            8 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    3150            8 :     if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED)
    3151            4 :         XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSED;
    3152            8 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    3153            8 : }
    3154              : 
    3155              : 
    3156              : /*
    3157              :  * Attempt to read the next XLOG record.
    3158              :  *
    3159              :  * Before first call, the reader needs to be positioned to the first record
    3160              :  * by calling XLogPrefetcherBeginRead().
    3161              :  *
    3162              :  * If no valid record is available, returns NULL, or fails if emode is PANIC.
    3163              :  * (emode must be either PANIC or LOG.)  In standby mode, retries until a
    3164              :  * valid record is available.
    3165              :  */
    3166              : static XLogRecord *
    3167      2799481 : ReadRecord(XLogPrefetcher *xlogprefetcher, int emode,
    3168              :            bool fetching_ckpt, TimeLineID replayTLI)
    3169              : {
    3170              :     XLogRecord *record;
    3171      2799481 :     XLogReaderState *xlogreader = XLogPrefetcherGetReader(xlogprefetcher);
    3172      2799481 :     XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
    3173              : 
    3174              :     Assert(AmStartupProcess() || !IsUnderPostmaster);
    3175              : 
    3176              :     /* Pass through parameters to XLogPageRead */
    3177      2799481 :     private->fetching_ckpt = fetching_ckpt;
    3178      2799481 :     private->emode = emode;
    3179      2799481 :     private->randAccess = !XLogRecPtrIsValid(xlogreader->ReadRecPtr);
    3180      2799481 :     private->replayTLI = replayTLI;
    3181              : 
    3182              :     /* This is the first attempt to read this page. */
    3183      2799481 :     lastSourceFailed = false;
    3184              : 
    3185              :     for (;;)
    3186          138 :     {
    3187              :         char       *errormsg;
    3188              : 
    3189      2799619 :         record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
    3190      2799560 :         if (record == NULL)
    3191              :         {
    3192              :             /*
    3193              :              * When we find that WAL ends in an incomplete record, keep track
    3194              :              * of that record.  After recovery is done, we'll write a record
    3195              :              * to indicate to downstream WAL readers that that portion is to
    3196              :              * be ignored.
    3197              :              *
    3198              :              * However, when ArchiveRecoveryRequested = true, we're going to
    3199              :              * switch to a new timeline at the end of recovery. We will only
    3200              :              * copy WAL over to the new timeline up to the end of the last
    3201              :              * complete record, so if we did this, we would later create an
    3202              :              * overwrite contrecord in the wrong place, breaking everything.
    3203              :              */
    3204          296 :             if (!ArchiveRecoveryRequested &&
    3205          109 :                 XLogRecPtrIsValid(xlogreader->abortedRecPtr))
    3206              :             {
    3207           11 :                 abortedRecPtr = xlogreader->abortedRecPtr;
    3208           11 :                 missingContrecPtr = xlogreader->missingContrecPtr;
    3209              :             }
    3210              : 
    3211          296 :             if (readFile >= 0)
    3212              :             {
    3213          271 :                 close(readFile);
    3214          271 :                 readFile = -1;
    3215              :             }
    3216              : 
    3217              :             /*
    3218              :              * We only end up here without a message when XLogPageRead()
    3219              :              * failed - in that case we already logged something. In
    3220              :              * StandbyMode that only happens if we have been triggered, so we
    3221              :              * shouldn't loop anymore in that case.
    3222              :              */
    3223          296 :             if (errormsg)
    3224          271 :                 ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
    3225              :                         (errmsg_internal("%s", errormsg) /* already translated */ ));
    3226              :         }
    3227              : 
    3228              :         /*
    3229              :          * Check page TLI is one of the expected values.
    3230              :          */
    3231      2799264 :         else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
    3232              :         {
    3233              :             char        fname[MAXFNAMELEN];
    3234              :             XLogSegNo   segno;
    3235              :             int32       offset;
    3236              : 
    3237            0 :             XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
    3238            0 :             offset = XLogSegmentOffset(xlogreader->latestPagePtr,
    3239              :                                        wal_segment_size);
    3240            0 :             XLogFileName(fname, xlogreader->seg.ws_tli, segno,
    3241              :                          wal_segment_size);
    3242            0 :             ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
    3243              :                     errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%08X, offset %u",
    3244              :                            xlogreader->latestPageTLI,
    3245              :                            fname,
    3246              :                            LSN_FORMAT_ARGS(xlogreader->latestPagePtr),
    3247              :                            offset));
    3248            0 :             record = NULL;
    3249              :         }
    3250              : 
    3251      2799560 :         if (record)
    3252              :         {
    3253              :             /* Great, got a record */
    3254      2799422 :             return record;
    3255              :         }
    3256              :         else
    3257              :         {
    3258              :             /* No valid record available from this source */
    3259          296 :             lastSourceFailed = true;
    3260              : 
    3261              :             /*
    3262              :              * If archive recovery was requested, but we were still doing
    3263              :              * crash recovery, switch to archive recovery and retry using the
    3264              :              * offline archive. We have now replayed all the valid WAL in
    3265              :              * pg_wal, so we are presumably now consistent.
    3266              :              *
    3267              :              * We require that there's at least some valid WAL present in
    3268              :              * pg_wal, however (!fetching_ckpt).  We could recover using the
    3269              :              * WAL from the archive, even if pg_wal is completely empty, but
    3270              :              * we'd have no idea how far we'd have to replay to reach
    3271              :              * consistency.  So err on the safe side and give up.
    3272              :              */
    3273          296 :             if (!InArchiveRecovery && ArchiveRecoveryRequested &&
    3274            2 :                 !fetching_ckpt)
    3275              :             {
    3276            2 :                 ereport(DEBUG1,
    3277              :                         (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
    3278            2 :                 InArchiveRecovery = true;
    3279            2 :                 if (StandbyModeRequested)
    3280            2 :                     EnableStandbyMode();
    3281              : 
    3282            2 :                 SwitchIntoArchiveRecovery(xlogreader->EndRecPtr, replayTLI);
    3283            2 :                 minRecoveryPoint = xlogreader->EndRecPtr;
    3284            2 :                 minRecoveryPointTLI = replayTLI;
    3285              : 
    3286            2 :                 CheckRecoveryConsistency();
    3287              : 
    3288              :                 /*
    3289              :                  * Before we retry, reset lastSourceFailed and currentSource
    3290              :                  * so that we will check the archive next.
    3291              :                  */
    3292            2 :                 lastSourceFailed = false;
    3293            2 :                 currentSource = XLOG_FROM_ANY;
    3294              : 
    3295          138 :                 continue;
    3296              :             }
    3297              : 
    3298              :             /* In standby mode, loop back to retry. Otherwise, give up. */
    3299          294 :             if (StandbyMode && !CheckForStandbyTrigger())
    3300          136 :                 continue;
    3301              :             else
    3302          158 :                 return NULL;
    3303              :         }
    3304              :     }
    3305              : }
    3306              : 
    3307              : /*
    3308              :  * Read the XLOG page containing targetPagePtr into readBuf (if not read
    3309              :  * already).  Returns number of bytes read, if the page is read successfully,
    3310              :  * or XLREAD_FAIL in case of errors.  When errors occur, they are ereport'ed,
    3311              :  * but only if they have not been previously reported.
    3312              :  *
    3313              :  * See XLogReaderRoutine.page_read for more details.
    3314              :  *
    3315              :  * While prefetching, xlogreader->nonblocking may be set.  In that case,
    3316              :  * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
    3317              :  *
    3318              :  * This is responsible for restoring files from archive as needed, as well
    3319              :  * as for waiting for the requested WAL record to arrive in standby mode.
    3320              :  *
    3321              :  * xlogreader->private_data->emode specifies the log level used for reporting
    3322              :  * "file not found" or "end of WAL" situations in archive recovery, or in
    3323              :  * standby mode when promotion is triggered. If set to WARNING or below,
    3324              :  * XLogPageRead() returns XLREAD_FAIL in those situations; on higher log
    3325              :  * levels the ereport() won't return.
    3326              :  *
    3327              :  * In standby mode, if after a successful return of XLogPageRead() the
    3328              :  * caller finds the record it's interested in to be broken, it should
    3329              :  * ereport the error with the level determined by
    3330              :  * emode_for_corrupt_record(), and then set lastSourceFailed
    3331              :  * and call XLogPageRead() again with the same arguments. This lets
    3332              :  * XLogPageRead() try fetching the record from another source, or
    3333              :  * sleep and retry.
    3334              :  */
    3335              : static int
    3336      1454158 : XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
    3337              :              XLogRecPtr targetRecPtr, char *readBuf)
    3338              : {
    3339      1454158 :     XLogPageReadPrivate *private =
    3340              :         (XLogPageReadPrivate *) xlogreader->private_data;
    3341      1454158 :     int         emode = private->emode;
    3342              :     uint32      targetPageOff;
    3343              :     XLogSegNo   targetSegNo PG_USED_FOR_ASSERTS_ONLY;
    3344              :     int         r;
    3345              :     instr_time  io_start;
    3346              : 
    3347              :     Assert(AmStartupProcess() || !IsUnderPostmaster);
    3348              : 
    3349      1454158 :     XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
    3350      1454158 :     targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
    3351              : 
    3352              :     /*
    3353              :      * See if we need to switch to a new segment because the requested record
    3354              :      * is not in the currently open one.
    3355              :      */
    3356      1454158 :     if (readFile >= 0 &&
    3357      1452254 :         !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
    3358              :     {
    3359              :         /*
    3360              :          * Request a restartpoint if we've replayed too much xlog since the
    3361              :          * last one.
    3362              :          */
    3363         1634 :         if (ArchiveRecoveryRequested && IsUnderPostmaster)
    3364              :         {
    3365         1617 :             if (XLogCheckpointNeeded(readSegNo))
    3366              :             {
    3367         1495 :                 (void) GetRedoRecPtr();
    3368         1495 :                 if (XLogCheckpointNeeded(readSegNo))
    3369         1488 :                     RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
    3370              :             }
    3371              :         }
    3372              : 
    3373         1634 :         close(readFile);
    3374         1634 :         readFile = -1;
    3375         1634 :         readSource = XLOG_FROM_ANY;
    3376              :     }
    3377              : 
    3378      1454158 :     XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
    3379              : 
    3380      1454163 : retry:
    3381              :     /* Need more data if no file is open, or the stream hasn't reached the request */
    3382      1454163 :     if (readFile < 0 ||
    3383      1450620 :         (readSource == XLOG_FROM_STREAM &&
    3384      1438532 :          flushedUpto < targetPagePtr + reqLen))
    3385              :     {
    3386        13935 :         if (readFile >= 0 &&
    3387        10392 :             xlogreader->nonblocking &&
    3388         5104 :             readSource == XLOG_FROM_STREAM &&
    3389         5104 :             flushedUpto < targetPagePtr + reqLen)
    3390         5104 :             return XLREAD_WOULDBLOCK;
    3391              : 
    3392         8772 :         switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
    3393         8831 :                                             private->randAccess,
    3394         8831 :                                             private->fetching_ckpt,
    3395              :                                             targetRecPtr,
    3396              :                                             private->replayTLI,
    3397              :                                             xlogreader->EndRecPtr,
    3398         8831 :                                             xlogreader->nonblocking))
    3399              :         {
    3400          581 :             case XLREAD_WOULDBLOCK:
    3401          581 :                 return XLREAD_WOULDBLOCK;
    3402           48 :             case XLREAD_FAIL:
    3403           48 :                 if (readFile >= 0)
    3404            0 :                     close(readFile);
    3405           48 :                 readFile = -1;
    3406           48 :                 readLen = 0;
    3407           48 :                 readSource = XLOG_FROM_ANY;
    3408           48 :                 return XLREAD_FAIL;
    3409         8143 :             case XLREAD_SUCCESS:
    3410         8143 :                 break;
    3411              :         }
    3412              :     }
    3413              : 
    3414              :     /*
    3415              :      * At this point, we have the right segment open and if we're streaming we
    3416              :      * know the requested record is in it.
    3417              :      */
    3418              :     Assert(readFile != -1);
    3419              : 
    3420              :     /*
    3421              :      * If the current segment is being streamed from the primary, calculate
    3422              :      * how much of the current page we have received already. We know the
    3423              :      * requested record has been received, but this is for the benefit of
    3424              :      * future calls, to allow quick exit at the top of this function.
    3425              :      */
    3426      1448371 :     if (readSource == XLOG_FROM_STREAM)
    3427              :     {
    3428      1434630 :         if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
    3429      1431695 :             readLen = XLOG_BLCKSZ;
    3430              :         else
    3431         2935 :             readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
    3432              :                 targetPageOff;
    3433              :     }
    3434              :     else
    3435        13741 :         readLen = XLOG_BLCKSZ;
    3436              : 
    3437              :     /* Read the requested page */
    3438      1448371 :     readOff = targetPageOff;
    3439              : 
    3440              :     /* Measure I/O timing when reading segment */
    3441      1448371 :     io_start = pgstat_prepare_io_time(track_wal_io_timing);
    3442              : 
    3443      1448371 :     pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
    3444      1448371 :     r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (pgoff_t) readOff);
    3445      1448371 :     if (r != XLOG_BLCKSZ)
    3446              :     {
    3447              :         char        fname[MAXFNAMELEN];
    3448            0 :         int         save_errno = errno;
    3449              : 
    3450            0 :         pgstat_report_wait_end();
    3451              : 
    3452            0 :         pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
    3453              :                                 io_start, 1, r);
    3454              : 
    3455            0 :         XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
    3456            0 :         if (r < 0)
    3457              :         {
    3458            0 :             errno = save_errno;
    3459            0 :             ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
    3460              :                     (errcode_for_file_access(),
    3461              :                      errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: %m",
    3462              :                             fname, LSN_FORMAT_ARGS(targetPagePtr),
    3463              :                             readOff)));
    3464              :         }
    3465              :         else
    3466            0 :             ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
    3467              :                     (errcode(ERRCODE_DATA_CORRUPTED),
    3468              :                      errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: read %d of %zu",
    3469              :                             fname, LSN_FORMAT_ARGS(targetPagePtr),
    3470              :                             readOff, r, (Size) XLOG_BLCKSZ)));
    3471            0 :         goto next_record_is_invalid;
    3472              :     }
    3473      1448371 :     pgstat_report_wait_end();
    3474              : 
    3475      1448371 :     pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
    3476              :                             io_start, 1, r);
    3477              : 
    3478              :     Assert(targetSegNo == readSegNo);
    3479              :     Assert(targetPageOff == readOff);
    3480              :     Assert(reqLen <= readLen);
    3481              : 
    3482      1448371 :     xlogreader->seg.ws_tli = curFileTLI;
    3483              : 
    3484              :     /*
    3485              :      * Check the page header immediately, so that we can retry immediately if
    3486              :      * it's not valid. This may seem unnecessary, because ReadPageInternal()
    3487              :      * validates the page header anyway, and would propagate the failure up to
    3488              :      * ReadRecord(), which would retry. However, there's a corner case with
    3489              :      * continuation records, if a record is split across two pages such that
    3490              :      * we would need to read the two pages from different sources across two
    3491              :      * WAL segments.
    3492              :      *
    3493              :      * The first page is only available locally, in pg_wal, because it's
    3494              :      * already been recycled on the primary. The second page, however, is not
    3495              :      * present in pg_wal, and we should stream it from the primary. There is a
    3496              :      * recycled WAL segment present in pg_wal, with garbage contents, however.
    3497              :      * We would read the first page from the local WAL segment, but when
    3498              :      * reading the second page, we would read the bogus, recycled, WAL
    3499              :      * segment. If we didn't catch that case here, we would never recover,
    3500              :      * because ReadRecord() would retry reading the whole record from the
    3501              :      * beginning.
    3502              :      *
    3503              :      * Of course, this only catches errors in the page header, which is what
    3504              :      * happens in the case of a recycled WAL segment. Other kinds of errors or
    3505              :      * corruption still have the same problem. But this at least fixes the
    3506              :      * common case, which can happen as part of normal operation.
    3507              :      *
    3508              :      * Validating the page header is cheap enough that doing it twice
    3509              :      * shouldn't be a big deal from a performance point of view.
    3510              :      *
    3511              :      * When not in standby mode, an invalid page header should cause recovery
    3512              :      * to end, not retry reading the page, so we don't need to validate the
    3513              :      * page header here for the retry. Instead, ReadPageInternal() is
    3514              :      * responsible for the validation.
    3515              :      */
    3516      1448371 :     if (StandbyMode &&
    3517      1438382 :         (targetPagePtr % wal_segment_size) == 0 &&
    3518         1414 :         !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
    3519              :     {
    3520              :         /*
    3521              :          * Emit this error right now then retry this page immediately. Use
    3522              :          * errmsg_internal() because the message was already translated.
    3523              :          */
    3524            6 :         if (xlogreader->errormsg_buf[0])
    3525            6 :             ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
    3526              :                     (errmsg_internal("%s", xlogreader->errormsg_buf)));
    3527              : 
    3528              :         /* reset any error XLogReaderValidatePageHeader() might have set */
    3529            6 :         XLogReaderResetError(xlogreader);
    3530            6 :         goto next_record_is_invalid;
    3531              :     }
    3532              : 
    3533      1448365 :     return readLen;
    3534              : 
    3535            6 : next_record_is_invalid:
    3536              : 
    3537              :     /*
    3538              :      * If we're reading ahead, give up fast.  Retries and error reporting will
    3539              :      * be handled by a later read when recovery catches up to this point.
    3540              :      */
    3541            6 :     if (xlogreader->nonblocking)
    3542            1 :         return XLREAD_WOULDBLOCK;
    3543              : 
    3544            5 :     lastSourceFailed = true;
    3545              : 
    3546            5 :     if (readFile >= 0)
    3547            5 :         close(readFile);
    3548            5 :     readFile = -1;
    3549            5 :     readLen = 0;
    3550            5 :     readSource = XLOG_FROM_ANY;
    3551              : 
    3552              :     /* In standby mode, keep trying */
    3553            5 :     if (StandbyMode)
    3554            5 :         goto retry;
    3555              :     else
    3556            0 :         return XLREAD_FAIL;
    3557              : }
    3558              : 
    3559              : /*
    3560              :  * Open the WAL segment containing WAL location 'RecPtr'.
    3561              :  *
    3562              :  * The segment can be fetched via restore_command, or via walreceiver having
    3563              :  * streamed the record, or it can already be present in pg_wal. Checking
    3564              :  * pg_wal is mainly for crash recovery, but it will be polled in standby mode
    3565              :  * too, in case someone copies a new segment directly to pg_wal. That is not
    3566              :  * documented or recommended, though.
    3567              :  *
    3568              :  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
    3569              :  * prepare to read WAL starting from RedoStartLSN after this.
    3570              :  *
    3571              :  * 'RecPtr' might not point to the beginning of the record we're interested
    3572              :  * in, it might also point to the page or segment header. In that case,
    3573              :  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
    3574              :  * used to decide which timeline to stream the requested WAL from.
    3575              :  *
    3576              :  * 'replayLSN' is the current replay LSN, so that if we scan for new
    3577              :  * timelines, we can reject a switch to a timeline that branched off before
    3578              :  * this point.
    3579              :  *
    3580              :  * If the record is not immediately available, the function returns XLREAD_FAIL
    3581              :  * if we're not in standby mode. In standby mode, the function waits for it to
    3582              :  * become available.
    3583              :  *
    3584              :  * When the requested record becomes available, the function opens the file
    3585              :  * containing it (if not open already), and returns XLREAD_SUCCESS. When end
    3586              :  * of standby mode is triggered by the user, and there is no more WAL
    3587              :  * available, returns XLREAD_FAIL.
    3588              :  *
    3589              :  * If nonblocking is true, then give up immediately if we can't satisfy the
    3590              :  * request, returning XLREAD_WOULDBLOCK instead of waiting.
    3591              :  */
    3592              : static XLogPageReadResult
    3593         8831 : WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
    3594              :                             bool fetching_ckpt, XLogRecPtr tliRecPtr,
    3595              :                             TimeLineID replayTLI, XLogRecPtr replayLSN,
    3596              :                             bool nonblocking)
    3597              : {
    3598              :     static TimestampTz last_fail_time = 0;
    3599              :     TimestampTz now;
    3600         8831 :     bool        streaming_reply_sent = false;
    3601              : 
    3602              :     /*-------
    3603              :      * Standby mode is implemented by a state machine:
    3604              :      *
    3605              :      * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
    3606              :      *    pg_wal (XLOG_FROM_PG_WAL)
    3607              :      * 2. Check for promotion trigger request
    3608              :      * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
    3609              :      * 4. Rescan timelines
    3610              :      * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
    3611              :      *
    3612              :      * Failure to read from the current source advances the state machine to
    3613              :      * the next state.
    3614              :      *
    3615              :      * 'currentSource' indicates the current state. There are no currentSource
    3616              :      * values for "check trigger", "rescan timelines", and "sleep" states,
    3617              :      * those actions are taken when reading from the previous source fails, as
    3618              :      * part of advancing to the next state.
    3619              :      *
    3620              :      * If standby mode is turned off while reading WAL from stream, we move
    3621              :      * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
    3622              :      * the files (which would be required at end of recovery, e.g., timeline
    3623              :      * history file) from archive or pg_wal. We don't need to kill WAL receiver
    3624              :      * here because it's already stopped when standby mode is turned off at
    3625              :      * the end of recovery.
    3626              :      *-------
    3627              :      */
    3628         8831 :     if (!InArchiveRecovery)
    3629         1003 :         currentSource = XLOG_FROM_PG_WAL;
    3630         7828 :     else if (currentSource == XLOG_FROM_ANY ||
    3631         7703 :              (!StandbyMode && currentSource == XLOG_FROM_STREAM))
    3632              :     {
    3633          125 :         lastSourceFailed = false;
    3634          125 :         currentSource = XLOG_FROM_ARCHIVE;
    3635              :     }
    3636              : 
    3637              :     for (;;)
    3638         7263 :     {
    3639        16094 :         XLogSource  oldSource = currentSource;
    3640        16094 :         bool        startWalReceiver = false;
    3641              : 
    3642              :         /*
    3643              :          * First check if we failed to read from the current source, and
    3644              :          * advance the state machine if so. The failure to read might've
    3645              :          * happened outside this function, e.g. when a CRC check fails on a
    3646              :          * record, or within this loop.
    3647              :          */
    3648        16094 :         if (lastSourceFailed)
    3649              :         {
    3650              :             /*
    3651              :              * Don't allow any retry loops to occur during nonblocking
    3652              :              * readahead.  Let the caller process everything that has been
    3653              :              * decoded already first.
    3654              :              */
    3655          532 :             if (nonblocking)
    3656           80 :                 return XLREAD_WOULDBLOCK;
    3657              : 
    3658          452 :             switch (currentSource)
    3659              :             {
    3660          273 :                 case XLOG_FROM_ARCHIVE:
    3661              :                 case XLOG_FROM_PG_WAL:
    3662              : 
    3663              :                     /*
    3664              :                      * Check to see if promotion is requested. Note that we do
    3665              :                      * this only after failure, so when you promote, we still
    3666              :                      * finish replaying as much as we can from archive and
    3667              :                      * pg_wal before failover.
    3668              :                      */
    3669          273 :                     if (StandbyMode && CheckForStandbyTrigger())
    3670              :                     {
    3671           22 :                         XLogShutdownWalRcv();
    3672           22 :                         return XLREAD_FAIL;
    3673              :                     }
    3674              : 
    3675              :                     /*
    3676              :                      * Not in standby mode, and we've now tried the archive
    3677              :                      * and pg_wal.
    3678              :                      */
    3679          251 :                     if (!StandbyMode)
    3680           26 :                         return XLREAD_FAIL;
    3681              : 
    3682              :                     /*
    3683              :                      * Move to XLOG_FROM_STREAM state, and set to start a
    3684              :                      * walreceiver if necessary.
    3685              :                      */
    3686          225 :                     currentSource = XLOG_FROM_STREAM;
    3687          225 :                     startWalReceiver = true;
    3688          225 :                     break;
    3689              : 
    3690          179 :                 case XLOG_FROM_STREAM:
    3691              : 
    3692              :                     /*
    3693              :                      * Failure while streaming. Most likely, we got here
    3694              :                      * because streaming replication was terminated, or
    3695              :                      * promotion was triggered. But we also get here if we
    3696              :                      * find an invalid record in the WAL streamed from the
    3697              :                      * primary, in which case something is seriously wrong.
    3698              :                      * There's little chance that the problem will just go
    3699              :                      * away, but PANIC is not good for availability either,
    3700              :                      * especially in hot standby mode. So, we treat that the
    3701              :                      * same as disconnection, and retry from archive/pg_wal
    3702              :                      * again. The WAL in the archive should be identical to
    3703              :                      * what was streamed, so it's unlikely that it helps, but
    3704              :                      * one can hope...
    3705              :                      */
    3706              : 
    3707              :                     /*
    3708              :                      * We should be able to move to XLOG_FROM_STREAM only in
    3709              :                      * standby mode.
    3710              :                      */
    3711              :                     Assert(StandbyMode);
    3712              : 
    3713              :                     /*
    3714              :                      * Before we leave XLOG_FROM_STREAM state, make sure that
    3715              :                      * walreceiver is not active, so that it won't overwrite
    3716              :                      * WAL that we restore from archive.
    3717              :                      *
    3718              :                      * If walreceiver is actively streaming (or attempting to
    3719              :                      * connect), we must shut it down. However, if it's
    3720              :                      * already in WAITING state (e.g., due to timeline
    3721              :                      * divergence), we only need to reset the install flag to
    3722              :                      * allow archive restoration.
    3723              :                      */
    3724          179 :                     if (WalRcvStreaming())
    3725           34 :                         XLogShutdownWalRcv();
    3726              :                     else
    3727              :                     {
    3728              :                         /*
    3729              :                          * WALRCV_STOPPING state is a transient state while
    3730              :                          * the startup process is in ShutdownWalRcv().  It
    3731              :                          * should never appear here since we would be waiting
    3732              :                          * for the walreceiver to reach WALRCV_STOPPED in that
    3733              :                          * case.
    3734              :                          */
    3735              :                         Assert(WalRcvGetState() != WALRCV_STOPPING);
    3736          145 :                         ResetInstallXLogFileSegmentActive();
    3737              :                     }
    3738              : 
    3739              :                     /*
    3740              :                      * Before we sleep, re-scan for possible new timelines if
    3741              :                      * we were requested to recover to the latest timeline.
    3742              :                      */
    3743          179 :                     if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
    3744              :                     {
    3745          179 :                         if (rescanLatestTimeLine(replayTLI, replayLSN))
    3746              :                         {
    3747            7 :                             currentSource = XLOG_FROM_ARCHIVE;
    3748            7 :                             break;
    3749              :                         }
    3750              :                     }
    3751              : 
    3752              :                     /*
    3753              :                      * XLOG_FROM_STREAM is the last state in our state
    3754              :                      * machine, so we've exhausted all the options for
    3755              :                      * obtaining the requested WAL. We're going to loop back
    3756              :                      * and retry from the archive, but if it hasn't been long
    3757              :                      * since last attempt, sleep wal_retrieve_retry_interval
    3758              :                      * milliseconds to avoid busy-waiting.
    3759              :                      */
    3760          172 :                     now = GetCurrentTimestamp();
    3761          172 :                     if (!TimestampDifferenceExceeds(last_fail_time, now,
    3762              :                                                     wal_retrieve_retry_interval))
    3763              :                     {
    3764              :                         long        wait_time;
    3765              : 
    3766          184 :                         wait_time = wal_retrieve_retry_interval -
    3767           92 :                             TimestampDifferenceMilliseconds(last_fail_time, now);
    3768              : 
    3769           92 :                         elog(LOG, "waiting for WAL to become available at %X/%08X",
    3770              :                              LSN_FORMAT_ARGS(RecPtr));
    3771              : 
    3772              :                         /* Do background tasks that might benefit us later. */
    3773           92 :                         KnownAssignedTransactionIdsIdleMaintenance();
    3774              : 
    3775           92 :                         (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
    3776              :                                          WL_LATCH_SET | WL_TIMEOUT |
    3777              :                                          WL_EXIT_ON_PM_DEATH,
    3778              :                                          wait_time,
    3779              :                                          WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
    3780           92 :                         ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
    3781           92 :                         now = GetCurrentTimestamp();
    3782              : 
    3783              :                         /* Handle interrupt signals of startup process */
    3784           92 :                         ProcessStartupProcInterrupts();
    3785              :                     }
    3786          158 :                     last_fail_time = now;
    3787          158 :                     currentSource = XLOG_FROM_ARCHIVE;
    3788          158 :                     break;
    3789              : 
    3790            0 :                 default:
    3791            0 :                     elog(ERROR, "unexpected WAL source %d", currentSource);
    3792              :             }
    3793              :         }
    3794        15562 :         else if (currentSource == XLOG_FROM_PG_WAL)
    3795              :         {
    3796              :             /*
    3797              :              * We just successfully read a file in pg_wal. We prefer files in
    3798              :              * the archive over ones in pg_wal, so try the next file again
    3799              :              * from the archive first.
    3800              :              */
    3801          999 :             if (InArchiveRecovery)
    3802            0 :                 currentSource = XLOG_FROM_ARCHIVE;
    3803              :         }
    3804              : 
    3805        15952 :         if (currentSource != oldSource)
    3806          390 :             elog(DEBUG2, "switched WAL source from %s to %s after %s",
    3807              :                  xlogSourceNames[oldSource], xlogSourceNames[currentSource],
    3808              :                  lastSourceFailed ? "failure" : "success");
    3809              : 
    3810              :         /*
    3811              :          * We've now handled possible failure. Try to read from the chosen
    3812              :          * source.
    3813              :          */
    3814        15952 :         lastSourceFailed = false;
    3815              : 
    3816        15952 :         switch (currentSource)
    3817              :         {
    3818         1843 :             case XLOG_FROM_ARCHIVE:
    3819              :             case XLOG_FROM_PG_WAL:
    3820              : 
    3821              :                 /*
    3822              :                  * WAL receiver must not be running when reading WAL from
    3823              :                  * archive or pg_wal.
    3824              :                  */
    3825              :                 Assert(!WalRcvStreaming());
    3826              : 
    3827              :                 /* Close any old file we might have open. */
    3828         1843 :                 if (readFile >= 0)
    3829              :                 {
    3830           87 :                     close(readFile);
    3831           87 :                     readFile = -1;
    3832              :                 }
    3833              :                 /* Reset curFileTLI if random fetch. */
    3834         1843 :                 if (randAccess)
    3835         1166 :                     curFileTLI = 0;
    3836              : 
    3837              :                 /*
    3838              :                  * Try to restore the file from archive, or read an existing
    3839              :                  * file from pg_wal.
    3840              :                  */
    3841         1843 :                 readFile = XLogFileReadAnyTLI(readSegNo,
    3842         1843 :                                               currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
    3843              :                                               currentSource);
    3844         1842 :                 if (readFile >= 0)
    3845         1653 :                     return XLREAD_SUCCESS;  /* success! */
    3846              : 
    3847              :                 /*
    3848              :                  * Nope, not found in archive or pg_wal.
    3849              :                  */
    3850          189 :                 lastSourceFailed = true;
    3851          189 :                 break;
    3852              : 
    3853        14109 :             case XLOG_FROM_STREAM:
    3854              :                 {
    3855              :                     bool        havedata;
    3856              : 
    3857              :                     /*
    3858              :                      * We should be able to move to XLOG_FROM_STREAM only in
    3859              :                      * standby mode.
    3860              :                      */
    3861              :                     Assert(StandbyMode);
    3862              : 
    3863              :                     /*
    3864              :                      * First, shutdown walreceiver if its restart has been
    3865              :                      * requested -- but no point if we're already slated for
    3866              :                      * starting it.
    3867              :                      */
    3868        14109 :                     if (pendingWalRcvRestart && !startWalReceiver)
    3869              :                     {
    3870            7 :                         XLogShutdownWalRcv();
    3871              : 
    3872              :                         /*
    3873              :                          * Re-scan for possible new timelines if we were
    3874              :                          * requested to recover to the latest timeline.
    3875              :                          */
    3876            7 :                         if (recoveryTargetTimeLineGoal ==
    3877              :                             RECOVERY_TARGET_TIMELINE_LATEST)
    3878            7 :                             rescanLatestTimeLine(replayTLI, replayLSN);
    3879              : 
    3880            7 :                         startWalReceiver = true;
    3881              :                     }
    3882        14109 :                     pendingWalRcvRestart = false;
    3883              : 
    3884              :                     /*
    3885              :                      * Launch walreceiver if needed.
    3886              :                      *
    3887              :                      * If fetching_ckpt is true, RecPtr points to the initial
    3888              :                      * checkpoint location. In that case, we use RedoStartLSN
    3889              :                      * as the streaming start position instead of RecPtr, so
    3890              :                      * that when we later jump backwards to start redo at
    3891              :                      * RedoStartLSN, we will have the logs streamed already.
    3892              :                      */
    3893        14109 :                     if (startWalReceiver &&
    3894          232 :                         PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
    3895              :                     {
    3896              :                         XLogRecPtr  ptr;
    3897              :                         TimeLineID  tli;
    3898              : 
    3899          194 :                         if (fetching_ckpt)
    3900              :                         {
    3901            0 :                             ptr = RedoStartLSN;
    3902            0 :                             tli = RedoStartTLI;
    3903              :                         }
    3904              :                         else
    3905              :                         {
    3906          194 :                             ptr = RecPtr;
    3907              : 
    3908              :                             /*
    3909              :                              * Use the record begin position to determine the
    3910              :                              * TLI, rather than the position we're reading.
    3911              :                              */
    3912          194 :                             tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
    3913              : 
    3914          194 :                             if (curFileTLI > 0 && tli < curFileTLI)
    3915            0 :                                 elog(ERROR, "according to history file, WAL location %X/%08X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
    3916              :                                      LSN_FORMAT_ARGS(tliRecPtr),
    3917              :                                      tli, curFileTLI);
    3918              :                         }
    3919          194 :                         curFileTLI = tli;
    3920          194 :                         SetInstallXLogFileSegmentActive();
    3921          194 :                         RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
    3922              :                                              PrimarySlotName,
    3923              :                                              wal_receiver_create_temp_slot);
    3924          194 :                         flushedUpto = InvalidXLogRecPtr;
    3925              :                     }
    3926              : 
    3927              :                     /*
    3928              :                      * Check if WAL receiver is active or wait to start up.
    3929              :                      */
    3930        14109 :                     if (!WalRcvStreaming())
    3931              :                     {
    3932          145 :                         lastSourceFailed = true;
    3933          145 :                         break;
    3934              :                     }
    3935              : 
    3936              :                     /*
    3937              :                      * Walreceiver is active, so see if new data has arrived.
    3938              :                      *
    3939              :                      * We only advance XLogReceiptTime when we obtain fresh
    3940              :                      * WAL from walreceiver and observe that we had already
    3941              :                      * processed everything before the most recent "chunk"
    3942              :                      * that it flushed to disk.  In steady state where we are
    3943              :                      * keeping up with the incoming data, XLogReceiptTime will
    3944              :                      * be updated on each cycle. When we are behind,
    3945              :                      * XLogReceiptTime will not advance, so the grace time
    3946              :                      * allotted to conflicting queries will decrease.
    3947              :                      */
    3948        13964 :                     if (RecPtr < flushedUpto)
    3949         1922 :                         havedata = true;
    3950              :                     else
    3951              :                     {
    3952              :                         XLogRecPtr  latestChunkStart;
    3953              : 
    3954        12042 :                         flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
    3955        12042 :                         if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
    3956              :                         {
    3957         5890 :                             havedata = true;
    3958         5890 :                             if (latestChunkStart <= RecPtr)
    3959              :                             {
    3960         3632 :                                 XLogReceiptTime = GetCurrentTimestamp();
    3961         3632 :                                 SetCurrentChunkStartTime(XLogReceiptTime);
    3962              :                             }
    3963              :                         }
    3964              :                         else
    3965         6152 :                             havedata = false;
    3966              :                     }
    3967        13964 :                     if (havedata)
    3968              :                     {
    3969              :                         /*
    3970              :                          * Great, streamed far enough.  Open the file if it's
    3971              :                          * not open already.  Also read the timeline history
    3972              :                          * file if we haven't initialized timeline history
    3973              :                          * yet; it should be streamed over and present in
    3974              :                          * pg_wal by now.  Use XLOG_FROM_STREAM so that source
    3975              :                          * info is set correctly and XLogReceiptTime isn't
    3976              :                          * changed.
    3977              :                          *
    3978              :                          * NB: We must set readTimeLineHistory based on
    3979              :                          * recoveryTargetTLI, not receiveTLI. Normally they'll
    3980              :                          * be the same, but if recovery_target_timeline is
    3981              :                          * 'latest' and archiving is configured, then it's
    3982              :                          * possible that we managed to retrieve one or more
    3983              :                          * new timeline history files from the archive,
    3984              :                          * updating recoveryTargetTLI.
    3985              :                          */
    3986         7812 :                         if (readFile < 0)
    3987              :                         {
    3988         1322 :                             if (!expectedTLEs)
    3989            0 :                                 expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
    3990         1322 :                             readFile = XLogFileRead(readSegNo, receiveTLI,
    3991              :                                                     XLOG_FROM_STREAM, false);
    3992              :                             Assert(readFile >= 0);
    3993              :                         }
    3994              :                         else
    3995              :                         {
    3996              :                             /* just make sure source info is correct... */
    3997         6490 :                             readSource = XLOG_FROM_STREAM;
    3998         6490 :                             XLogReceiptSource = XLOG_FROM_STREAM;
    3999         6490 :                             return XLREAD_SUCCESS;
    4000              :                         }
    4001         1322 :                         break;
    4002              :                     }
    4003              : 
    4004              :                     /* In nonblocking mode, return rather than sleeping. */
    4005         6152 :                     if (nonblocking)
    4006          501 :                         return XLREAD_WOULDBLOCK;
    4007              : 
    4008              :                     /*
    4009              :                      * Data not here yet. Check for trigger, then wait for
    4010              :                      * walreceiver to wake us up when new WAL arrives.
    4011              :                      */
    4012         5651 :                     if (CheckForStandbyTrigger())
    4013              :                     {
    4014              :                         /*
    4015              :                          * Note that we don't return XLREAD_FAIL immediately
    4016              :                          * here. After being triggered, we still want to
    4017              :                          * replay all the WAL that was already streamed. It's
    4018              :                          * in pg_wal now, so we just treat this as a failure,
    4019              :                          * and the state machine will move on to replay the
    4020              :                          * streamed WAL from pg_wal, and then recheck the
    4021              :                          * trigger and exit replay.
    4022              :                          */
    4023           34 :                         lastSourceFailed = true;
    4024           34 :                         break;
    4025              :                     }
    4026              : 
    4027              :                     /*
    4028              :                      * Since we have replayed everything we have received so
    4029              :                      * far and are about to start waiting for more WAL, let's
    4030              :                      * tell the upstream server our replay location now so
    4031              :                      * that pg_stat_replication doesn't show stale
    4032              :                      * information.
    4033              :                      */
    4034         5617 :                     if (!streaming_reply_sent)
    4035              :                     {
    4036         4336 :                         WalRcvForceReply();
    4037         4336 :                         streaming_reply_sent = true;
    4038              :                     }
    4039              : 
    4040              :                     /* Do any background tasks that might benefit us later. */
    4041         5617 :                     KnownAssignedTransactionIdsIdleMaintenance();
    4042              : 
    4043              :                     /* Update pg_stat_recovery_prefetch before sleeping. */
    4044         5617 :                     XLogPrefetcherComputeStats(xlogprefetcher);
    4045              : 
    4046              :                     /*
    4047              :                      * Wait for more WAL to arrive, when we will be woken
    4048              :                      * immediately by the WAL receiver.
    4049              :                      */
    4050         5617 :                     (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
    4051              :                                      WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
    4052              :                                      -1L,
    4053              :                                      WAIT_EVENT_RECOVERY_WAL_STREAM);
    4054         5617 :                     ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
    4055         5617 :                     break;
    4056              :                 }
    4057              : 
    4058            0 :             default:
    4059            0 :                 elog(ERROR, "unexpected WAL source %d", currentSource);
    4060              :         }
    4061              : 
    4062              :         /*
    4063              :          * Check for recovery pause here so that we can confirm more quickly
    4064              :          * that a requested pause has actually taken effect.
    4065              :          */
    4066         7307 :         if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
    4067              :             RECOVERY_NOT_PAUSED)
    4068            3 :             recoveryPausesHere(false);
    4069              : 
    4070              :         /*
    4071              :          * This possibly-long loop needs to handle interrupts of startup
    4072              :          * process.
    4073              :          */
    4074         7307 :         ProcessStartupProcInterrupts();
    4075              :     }
    4076              : 
    4077              :     return XLREAD_FAIL;         /* not reached */
    4078              : }
    4079              : 
    4080              : 
    4081              : /*
    4082              :  * Determine what log level should be used to report a corrupt WAL record
    4083              :  * in the current WAL page, previously read by XLogPageRead().
    4084              :  *
    4085              :  * 'emode' is the error mode that would be used to report a file-not-found
    4086              :  * or legitimate end-of-WAL situation.  Generally, we use it as-is, but if
    4087              :  * we're retrying the exact same record that we've tried previously, only
    4088              :  * complain the first time to keep the noise down.  However, we only do this
    4089              :  * when reading from pg_wal, because we don't expect any invalid records in
    4090              :  * archive or in records streamed from the primary.  Files in the archive
    4091              :  * should be complete, and we should never hit the end of WAL because we stop
    4092              :  * and wait for more WAL to arrive before replaying it.
    4093              :  *
    4094              :  * NOTE: When reading from pg_wal with emode LOG, this function remembers the
    4095              :  * RecPtr it was called with and returns DEBUG1 for repeats of that value.
    4096              :  * Only call this when you are about to ereport(), or you might cause a later
    4097              :  * message to be erroneously suppressed.
    4098              :  */
    4099              : static int
    4100          277 : emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
    4101              : {
    4102              :     static XLogRecPtr lastComplaint = InvalidXLogRecPtr;    /* kept across calls */
    4103              : 
    4104          277 :     if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
    4105              :     {
    4106          273 :         if (RecPtr == lastComplaint)
    4107           68 :             emode = DEBUG1; /* same record as last time: demote */
    4108              :         else
    4109          205 :             lastComplaint = RecPtr; /* new record: keep emode, remember it */
    4110              :     }
    4111          277 :     return emode;
    4112              : }
    4113              : 
    4114              : 
    4115              : /*
    4116              :  * Subroutine to try to fetch and validate a prior checkpoint record.
                      :  *
                      :  * Rejects 'RecPtr' up front if XRecOffIsValid() fails.  Otherwise begins a
                      :  * read at 'RecPtr' on 'xlogprefetcher', fetches one record with
                      :  * ReadRecord() (passing 'replayTLI' through), and checks that it is an
                      :  * RM_XLOG_ID record whose info bits are XLOG_CHECKPOINT_SHUTDOWN or
                      :  * XLOG_CHECKPOINT_ONLINE and whose total length is exactly a record
                      :  * header plus a short data header plus a CheckPoint struct.
                      :  *
                      :  * Returns the record if every check passes.  On any failed check a LOG
                      :  * message naming the problem is emitted and NULL is returned.
                      :  *
                      :  * NOTE(review): the 'true' passed to ReadRecord() is presumably its
                      :  * fetching_ckpt flag -- confirm against ReadRecord()'s definition.
    4117              :  */
    4118              : static XLogRecord *
    4119         1006 : ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
    4120              :                      TimeLineID replayTLI)
    4121              : {
    4122              :     XLogRecord *record;
    4123              :     uint8       info;
    4124              : 
    4125              :     Assert(xlogreader != NULL);
    4126              : 
    4127         1006 :     if (!XRecOffIsValid(RecPtr))
    4128              :     {
    4129            0 :         ereport(LOG,
    4130              :                 (errmsg("invalid checkpoint location")));
    4131            0 :         return NULL;
    4132              :     }
    4133              : 
    4134         1006 :     XLogPrefetcherBeginRead(xlogprefetcher, RecPtr);
    4135         1006 :     record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
    4136              : 
    4137         1006 :     if (record == NULL)
    4138              :     {
    4139            0 :         ereport(LOG,
    4140              :                 (errmsg("invalid checkpoint record")));
    4141            0 :         return NULL;
    4142              :     }
    4143         1006 :     if (record->xl_rmid != RM_XLOG_ID)
    4144              :     {
    4145            0 :         ereport(LOG,
    4146              :                 (errmsg("invalid resource manager ID in checkpoint record")));
    4147            0 :         return NULL;
    4148              :     }
    4149         1006 :     info = record->xl_info & ~XLR_INFO_MASK;    /* drop the XLR_INFO_MASK bits */
    4150         1006 :     if (info != XLOG_CHECKPOINT_SHUTDOWN &&
    4151              :         info != XLOG_CHECKPOINT_ONLINE)
    4152              :     {
    4153            0 :         ereport(LOG,
    4154              :                 (errmsg("invalid xl_info in checkpoint record")));
    4155            0 :         return NULL;
    4156              :     }
    4157         1006 :     if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
    4158              :     {
    4159            0 :         ereport(LOG,
    4160              :                 (errmsg("invalid length of checkpoint record")));
    4161            0 :         return NULL;
    4162              :     }
    4163         1006 :     return record;
    4164              : }
    4165              : 
    4166              : /*
    4167              :  * Scan for new timelines that might have appeared in the archive since we
    4168              :  * started recovery.
    4169              :  *
    4170              :  * If there are any, the function changes recovery target TLI to the latest
    4171              :  * one and returns 'true'.
    4172              :  */
    4173              : static bool
    4174          186 : rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
    4175              : {
    4176              :     List       *newExpectedTLEs;
    4177              :     bool        found;
    4178              :     ListCell   *cell;
    4179              :     TimeLineID  newtarget;
    4180          186 :     TimeLineID  oldtarget = recoveryTargetTLI;
    4181          186 :     TimeLineHistoryEntry *currentTle = NULL;
    4182              : 
    4183          186 :     newtarget = findNewestTimeLine(recoveryTargetTLI);
    4184          186 :     if (newtarget == recoveryTargetTLI)
    4185              :     {
    4186              :         /* No new timelines found */
    4187          179 :         return false;
    4188              :     }
    4189              : 
    4190              :     /*
    4191              :      * Determine the list of expected TLIs for the new TLI
    4192              :      */
    4193              : 
    4194            7 :     newExpectedTLEs = readTimeLineHistory(newtarget);
    4195              : 
    4196              :     /*
    4197              :      * If the current timeline is not part of the history of the new timeline,
    4198              :      * we cannot proceed to it.
    4199              :      */
    4200            7 :     found = false;
    4201           14 :     foreach(cell, newExpectedTLEs)
    4202              :     {
    4203           14 :         currentTle = (TimeLineHistoryEntry *) lfirst(cell);
    4204              : 
    4205           14 :         if (currentTle->tli == recoveryTargetTLI)
    4206              :         {
    4207            7 :             found = true;
    4208            7 :             break;
    4209              :         }
    4210              :     }
    4211            7 :     if (!found)
    4212              :     {
    4213            0 :         ereport(LOG,
    4214              :                 (errmsg("new timeline %u is not a child of database system timeline %u",
    4215              :                         newtarget,
    4216              :                         replayTLI)));
    4217            0 :         return false;
    4218              :     }
    4219              : 
    4220              :     /*
    4221              :      * The current timeline was found in the history file, but check that the
    4222              :      * next timeline was forked off from it *after* the current recovery
    4223              :      * location.
    4224              :      */
    4225            7 :     if (currentTle->end < replayLSN)
    4226              :     {
    4227            0 :         ereport(LOG,
    4228              :                 errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%08X",
    4229              :                        newtarget,
    4230              :                        replayTLI,
    4231              :                        LSN_FORMAT_ARGS(replayLSN)));
    4232            0 :         return false;
    4233              :     }
    4234              : 
    4235              :     /* The new timeline history seems valid. Switch target */
    4236            7 :     recoveryTargetTLI = newtarget;
    4237            7 :     list_free_deep(expectedTLEs);
    4238            7 :     expectedTLEs = newExpectedTLEs;
    4239              : 
    4240              :     /*
    4241              :      * As in StartupXLOG(), try to ensure we have all the history files
    4242              :      * between the old target and new target in pg_wal.
    4243              :      */
    4244            7 :     restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
    4245              : 
    4246            7 :     ereport(LOG,
    4247              :             (errmsg("new target timeline is %u",
    4248              :                     recoveryTargetTLI)));
    4249              : 
    4250            7 :     return true;
    4251              : }
    4252              : 
    4253              : 
    4254              : /*
    4255              :  * Open a logfile segment for reading (during recovery).
    4256              :  *
    4257              :  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
    4258              :  * Otherwise, it's assumed to be already available in pg_wal.
    4259              :  */
    4260              : static int
    4261         3666 : XLogFileRead(XLogSegNo segno, TimeLineID tli,
    4262              :              XLogSource source, bool notfoundOk)
    4263              : {
    4264              :     char        xlogfname[MAXFNAMELEN];
    4265              :     char        activitymsg[MAXFNAMELEN + 16];
    4266              :     char        path[MAXPGPATH];
    4267              :     int         fd;
    4268              : 
    4269         3666 :     XLogFileName(xlogfname, tli, segno, wal_segment_size);
    4270              : 
    4271         3666 :     switch (source)
    4272              :     {
    4273          855 :         case XLOG_FROM_ARCHIVE:
    4274              :             /* Report recovery progress in PS display */
    4275          855 :             snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
    4276              :                      xlogfname);
    4277          855 :             set_ps_display(activitymsg);
    4278              : 
    4279          855 :             if (!RestoreArchivedFile(path, xlogfname,
    4280              :                                      "RECOVERYXLOG",
    4281              :                                      wal_segment_size,
    4282              :                                      InRedo))
    4283          490 :                 return -1;
    4284          364 :             break;
    4285              : 
    4286         2811 :         case XLOG_FROM_PG_WAL:
    4287              :         case XLOG_FROM_STREAM:
    4288         2811 :             XLogFilePath(path, tli, segno, wal_segment_size);
    4289         2811 :             break;
    4290              : 
    4291            0 :         default:
    4292            0 :             elog(ERROR, "invalid XLogFileRead source %d", source);
    4293              :     }
    4294              : 
    4295              :     /*
    4296              :      * If the segment was fetched from archival storage, replace the existing
    4297              :      * xlog segment (if any) with the archival version.
    4298              :      */
    4299         3175 :     if (source == XLOG_FROM_ARCHIVE)
    4300              :     {
    4301              :         Assert(!IsInstallXLogFileSegmentActive());
    4302          364 :         KeepFileRestoredFromArchive(path, xlogfname);
    4303              : 
    4304              :         /*
    4305              :          * Set path to point at the new file in pg_wal.
    4306              :          */
    4307          364 :         snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
    4308              :     }
    4309              : 
    4310         3175 :     fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
    4311         3175 :     if (fd >= 0)
    4312              :     {
    4313              :         /* Success! */
    4314         2975 :         curFileTLI = tli;
    4315              : 
    4316              :         /* Report recovery progress in PS display */
    4317         2975 :         snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
    4318              :                  xlogfname);
    4319         2975 :         set_ps_display(activitymsg);
    4320              : 
    4321              :         /* Track source of data in assorted state variables */
    4322         2975 :         readSource = source;
    4323         2975 :         XLogReceiptSource = source;
    4324              :         /* In FROM_STREAM case, caller tracks receipt time, not me */
    4325         2975 :         if (source != XLOG_FROM_STREAM)
    4326         1653 :             XLogReceiptTime = GetCurrentTimestamp();
    4327              : 
    4328         2975 :         return fd;
    4329              :     }
    4330          200 :     if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
    4331            0 :         ereport(PANIC,
    4332              :                 (errcode_for_file_access(),
    4333              :                  errmsg("could not open file \"%s\": %m", path)));
    4334          200 :     return -1;
    4335              : }
    4336              : 
    4337              : /*
    4338              :  * Open a logfile segment for reading (during recovery).
    4339              :  *
    4340              :  * This version searches for the segment with any TLI listed in expectedTLEs.
    4341              :  */
    4342              : static int
    4343         1843 : XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
    4344              : {
    4345              :     char        path[MAXPGPATH];
    4346              :     ListCell   *cell;
    4347              :     int         fd;
    4348              :     List       *tles;
    4349              : 
    4350              :     /*
    4351              :      * Loop looking for a suitable timeline ID: we might need to read any of
    4352              :      * the timelines listed in expectedTLEs.
    4353              :      *
    4354              :      * We expect curFileTLI on entry to be the TLI of the preceding file in
    4355              :      * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
    4356              :      * to go backwards; this prevents us from picking up the wrong file when a
    4357              :      * parent timeline extends to higher segment numbers than the child we
    4358              :      * want to read.
    4359              :      *
    4360              :      * If we haven't read the timeline history file yet, read it now, so that
    4361              :      * we know which TLIs to scan.  We don't save the list in expectedTLEs,
    4362              :      * however, unless we actually find a valid segment.  That way if there is
    4363              :      * neither a timeline history file nor a WAL segment in the archive, and
    4364              :      * streaming replication is set up, we'll read the timeline history file
    4365              :      * streamed from the primary when we start streaming, instead of
    4366              :      * recovering with a dummy history generated here.
    4367              :      */
    4368         1843 :     if (expectedTLEs)
    4369          837 :         tles = expectedTLEs;
    4370              :     else
    4371         1006 :         tles = readTimeLineHistory(recoveryTargetTLI);
    4372              : 
    4373         2050 :     foreach(cell, tles)
    4374              :     {
    4375         1867 :         TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
    4376         1867 :         TimeLineID  tli = hent->tli;
    4377              : 
    4378         1867 :         if (tli < curFileTLI)
    4379            6 :             break;              /* don't bother looking at too-old TLIs */
    4380              : 
    4381              :         /*
    4382              :          * Skip scanning the timeline ID that the logfile segment to read
    4383              :          * doesn't belong to
    4384              :          */
    4385         1861 :         if (XLogRecPtrIsValid(hent->begin))
    4386              :         {
    4387           77 :             XLogSegNo   beginseg = 0;
    4388              : 
    4389           77 :             XLByteToSeg(hent->begin, beginseg, wal_segment_size);
    4390              : 
    4391              :             /*
    4392              :              * The logfile segment that doesn't belong to the timeline is
    4393              :              * older or newer than the segment that the timeline started or
    4394              :              * ended at, respectively. It's sufficient to check only the
    4395              :              * starting segment of the timeline here. Since the timelines are
    4396              :              * scanned in descending order in this loop, any segments newer
    4397              :              * than the ending segment should belong to newer timeline and
    4398              :              * have already been read before. So it's not necessary to check
    4399              :              * the ending segment of the timeline here.
    4400              :              */
    4401           77 :             if (segno < beginseg)
    4402            7 :                 continue;
    4403              :         }
    4404              : 
    4405         1854 :         if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
    4406              :         {
    4407          855 :             fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true);
    4408          854 :             if (fd != -1)
    4409              :             {
    4410          364 :                 elog(DEBUG1, "got WAL segment from archive");
    4411          364 :                 if (!expectedTLEs)
    4412           19 :                     expectedTLEs = tles;
    4413         1653 :                 return fd;
    4414              :             }
    4415              :         }
    4416              : 
    4417         1489 :         if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
    4418              :         {
    4419         1489 :             fd = XLogFileRead(segno, tli, XLOG_FROM_PG_WAL, true);
    4420         1489 :             if (fd != -1)
    4421              :             {
    4422         1289 :                 if (!expectedTLEs)
    4423          987 :                     expectedTLEs = tles;
    4424         1289 :                 return fd;
    4425              :             }
    4426              :         }
    4427              :     }
    4428              : 
    4429              :     /* Couldn't find it.  For simplicity, complain about front timeline */
    4430          189 :     XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
    4431          189 :     errno = ENOENT;
    4432          189 :     ereport(DEBUG2,
    4433              :             (errcode_for_file_access(),
    4434              :              errmsg("could not open file \"%s\": %m", path)));
    4435          189 :     return -1;
    4436              : }
    4437              : 
    4438              : /*
    4439              :  * Set flag to signal the walreceiver to restart.  (The startup process calls
    4440              :  * this on noticing a relevant configuration change.)
    4441              :  */
    4442              : void
    4443           11 : StartupRequestWalReceiverRestart(void)
    4444              : {
    4445           11 :     if (currentSource == XLOG_FROM_STREAM && WalRcvRunning())
    4446              :     {
    4447            7 :         ereport(LOG,
    4448              :                 (errmsg("WAL receiver process shutdown requested")));
    4449              : 
    4450            7 :         pendingWalRcvRestart = true;
    4451              :     }
    4452           11 : }
    4453              : 
    4454              : 
    4455              : /*
    4456              :  * Has a standby promotion already been triggered?
    4457              :  *
    4458              :  * Unlike CheckForStandbyTrigger(), this works in any process
    4459              :  * that's connected to shared memory.
    4460              :  */
    4461              : bool
    4462           72 : PromoteIsTriggered(void)
    4463              : {
    4464              :     /*
    4465              :      * We check shared state each time only until a standby promotion is
    4466              :      * triggered. We can't trigger a promotion again, so there's no need to
    4467              :      * keep checking after the shared variable has once been seen true.
    4468              :      */
    4469           72 :     if (LocalPromoteIsTriggered)
    4470           51 :         return true;
    4471              : 
    4472           21 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    4473           21 :     LocalPromoteIsTriggered = XLogRecoveryCtl->SharedPromoteIsTriggered;
    4474           21 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    4475              : 
    4476           21 :     return LocalPromoteIsTriggered;
    4477              : }
    4478              : 
    4479              : static void
    4480           48 : SetPromoteIsTriggered(void)
    4481              : {
    4482           48 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    4483           48 :     XLogRecoveryCtl->SharedPromoteIsTriggered = true;
    4484           48 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    4485              : 
    4486              :     /*
    4487              :      * Mark the recovery pause state as 'not paused' because the paused state
    4488              :      * ends and promotion continues if a promotion is triggered while recovery
    4489              :      * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
    4490              :      * return 'paused' while a promotion is ongoing.
    4491              :      */
    4492           48 :     SetRecoveryPause(false);
    4493              : 
    4494           48 :     LocalPromoteIsTriggered = true;
    4495           48 : }
    4496              : 
    4497              : /*
    4498              :  * Check whether a promote request has arrived.
    4499              :  */
    4500              : static bool
    4501         6166 : CheckForStandbyTrigger(void)
    4502              : {
    4503         6166 :     if (LocalPromoteIsTriggered)
    4504           57 :         return true;
    4505              : 
    4506         6109 :     if (IsPromoteSignaled() && CheckPromoteSignal())
    4507              :     {
    4508           48 :         ereport(LOG, (errmsg("received promote request")));
    4509           48 :         RemovePromoteSignalFiles();
    4510           48 :         ResetPromoteSignaled();
    4511           48 :         SetPromoteIsTriggered();
    4512           48 :         return true;
    4513              :     }
    4514              : 
    4515         6061 :     return false;
    4516              : }
    4517              : 
    4518              : /*
    4519              :  * Remove the files signaling a standby promotion request.
    4520              :  */
    4521              : void
    4522          970 : RemovePromoteSignalFiles(void)
    4523              : {
    4524          970 :     unlink(PROMOTE_SIGNAL_FILE);
    4525          970 : }
    4526              : 
    4527              : /*
    4528              :  * Check to see if a promote request has arrived.
    4529              :  */
    4530              : bool
    4531          747 : CheckPromoteSignal(void)
    4532              : {
    4533              :     struct stat stat_buf;
    4534              : 
    4535          747 :     if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
    4536           96 :         return true;
    4537              : 
    4538          651 :     return false;
    4539              : }
    4540              : 
    4541              : /*
    4542              :  * Wake up startup process to replay newly arrived WAL, or to notice that
    4543              :  * failover has been requested.
    4544              :  */
    4545              : void
    4546        24435 : WakeupRecovery(void)
    4547              : {
    4548        24435 :     SetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
    4549        24435 : }
    4550              : 
    4551              : /*
    4552              :  * Schedule a walreceiver wakeup in the main recovery loop.
    4553              :  */
    4554              : void
    4555            2 : XLogRequestWalReceiverReply(void)
    4556              : {
    4557            2 :     doRequestWalReceiverReply = true;
    4558            2 : }
    4559              : 
    4560              : /*
    4561              :  * Is HotStandby active yet? This is only important in special backends
    4562              :  * since normal backends won't ever be able to connect until this returns
    4563              :  * true. Postmaster knows this by way of signal, not via shared memory.
    4564              :  *
    4565              :  * Unlike testing standbyState, this works in any process that's connected to
    4566              :  * shared memory.  (And note that standbyState alone doesn't tell the truth
    4567              :  * anyway.)
    4568              :  */
    4569              : bool
    4570          166 : HotStandbyActive(void)
    4571              : {
    4572              :     /*
    4573              :      * We check shared state each time only until Hot Standby is active. We
    4574              :      * can't de-activate Hot Standby, so there's no need to keep checking
    4575              :      * after the shared variable has once been seen true.
    4576              :      */
    4577          166 :     if (LocalHotStandbyActive)
    4578           23 :         return true;
    4579              :     else
    4580              :     {
    4581              :         /* spinlock is essential on machines with weak memory ordering! */
    4582          143 :         SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    4583          143 :         LocalHotStandbyActive = XLogRecoveryCtl->SharedHotStandbyActive;
    4584          143 :         SpinLockRelease(&XLogRecoveryCtl->info_lck);
    4585              : 
    4586          143 :         return LocalHotStandbyActive;
    4587              :     }
    4588              : }
    4589              : 
    4590              : /*
    4591              :  * Like HotStandbyActive(), but to be used only in WAL replay code,
    4592              :  * where we don't need to ask any other process what the state is.
    4593              :  */
    4594              : static bool
    4595            0 : HotStandbyActiveInReplay(void)
    4596              : {
    4597              :     Assert(AmStartupProcess() || !IsPostmasterEnvironment);
    4598            0 :     return LocalHotStandbyActive;
    4599              : }
    4600              : 
    4601              : /*
    4602              :  * Get latest redo apply position.
    4603              :  *
    4604              :  * Exported to allow WALReceiver to read the pointer directly.
    4605              :  */
    4606              : XLogRecPtr
    4607        62212 : GetXLogReplayRecPtr(TimeLineID *replayTLI)
    4608              : {
    4609              :     XLogRecPtr  recptr;
    4610              :     TimeLineID  tli;
    4611              : 
    4612        62212 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    4613        62212 :     recptr = XLogRecoveryCtl->lastReplayedEndRecPtr;
    4614        62212 :     tli = XLogRecoveryCtl->lastReplayedTLI;
    4615        62212 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    4616              : 
    4617        62212 :     if (replayTLI)
    4618         2134 :         *replayTLI = tli;
    4619        62212 :     return recptr;
    4620              : }
    4621              : 
    4622              : 
    4623              : /*
    4624              :  * Get position of last applied, or the record being applied.
    4625              :  *
    4626              :  * This is different from GetXLogReplayRecPtr() in that if a WAL
    4627              :  * record is currently being applied, this includes that record.
    4628              :  */
    4629              : XLogRecPtr
    4630         6388 : GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
    4631              : {
    4632              :     XLogRecPtr  recptr;
    4633              :     TimeLineID  tli;
    4634              : 
    4635         6388 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    4636         6388 :     recptr = XLogRecoveryCtl->replayEndRecPtr;
    4637         6388 :     tli = XLogRecoveryCtl->replayEndTLI;
    4638         6388 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    4639              : 
    4640         6388 :     if (replayEndTLI)
    4641         6388 :         *replayEndTLI = tli;
    4642         6388 :     return recptr;
    4643              : }
    4644              : 
    4645              : /*
    4646              :  * Save timestamp of latest processed commit/abort record.
    4647              :  *
    4648              :  * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
    4649              :  * seen by processes other than the startup process.  Note in particular
    4650              :  * that CreateRestartPoint is executed in the checkpointer.
    4651              :  */
    4652              : static void
    4653        22398 : SetLatestXTime(TimestampTz xtime)
    4654              : {
    4655        22398 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    4656        22398 :     XLogRecoveryCtl->recoveryLastXTime = xtime;
    4657        22398 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    4658        22398 : }
    4659              : 
    4660              : /*
    4661              :  * Fetch timestamp of latest processed commit/abort record.
    4662              :  */
    4663              : TimestampTz
    4664          349 : GetLatestXTime(void)
    4665              : {
    4666              :     TimestampTz xtime;
    4667              : 
    4668          349 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    4669          349 :     xtime = XLogRecoveryCtl->recoveryLastXTime;
    4670          349 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    4671              : 
    4672          349 :     return xtime;
    4673              : }
    4674              : 
    4675              : /*
    4676              :  * Save timestamp of the next chunk of WAL records to apply.
    4677              :  *
    4678              :  * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
    4679              :  * seen by all backends.
    4680              :  */
    4681              : static void
    4682         3632 : SetCurrentChunkStartTime(TimestampTz xtime)
    4683              : {
    4684         3632 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    4685         3632 :     XLogRecoveryCtl->currentChunkStartTime = xtime;
    4686         3632 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    4687         3632 : }
    4688              : 
    4689              : /*
    4690              :  * Fetch timestamp of latest processed commit/abort record.
    4691              :  * Startup process maintains an accurate local copy in XLogReceiptTime
    4692              :  */
    4693              : TimestampTz
    4694          246 : GetCurrentChunkReplayStartTime(void)
    4695              : {
    4696              :     TimestampTz xtime;
    4697              : 
    4698          246 :     SpinLockAcquire(&XLogRecoveryCtl->info_lck);
    4699          246 :     xtime = XLogRecoveryCtl->currentChunkStartTime;
    4700          246 :     SpinLockRelease(&XLogRecoveryCtl->info_lck);
    4701              : 
    4702          246 :     return xtime;
    4703              : }
    4704              : 
    4705              : /*
    4706              :  * Returns time of receipt of current chunk of XLOG data, as well as
    4707              :  * whether it was received from streaming replication or from archives.
    4708              :  */
    4709              : void
    4710           29 : GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
    4711              : {
    4712              :     /*
    4713              :      * This must be executed in the startup process, since we don't export the
    4714              :      * relevant state to shared memory.
    4715              :      */
    4716              :     Assert(InRecovery);
    4717              : 
    4718           29 :     *rtime = XLogReceiptTime;
    4719           29 :     *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
    4720           29 : }
    4721              : 
    4722              : /*
    4723              :  * Note that text field supplied is a parameter name and does not require
    4724              :  * translation
    4725              :  */
    4726              : void
    4727          685 : RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
    4728              : {
    4729          685 :     if (currValue < minValue)
    4730              :     {
    4731            0 :         if (HotStandbyActiveInReplay())
    4732              :         {
    4733            0 :             bool        warned_for_promote = false;
    4734              : 
    4735            0 :             ereport(WARNING,
    4736              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4737              :                      errmsg("hot standby is not possible because of insufficient parameter settings"),
    4738              :                      errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
    4739              :                                param_name,
    4740              :                                currValue,
    4741              :                                minValue)));
    4742              : 
    4743            0 :             SetRecoveryPause(true);
    4744              : 
    4745            0 :             ereport(LOG,
    4746              :                     (errmsg("recovery has paused"),
    4747              :                      errdetail("If recovery is unpaused, the server will shut down."),
    4748              :                      errhint("You can then restart the server after making the necessary configuration changes.")));
    4749              : 
    4750            0 :             while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
    4751              :             {
    4752            0 :                 ProcessStartupProcInterrupts();
    4753              : 
    4754            0 :                 if (CheckForStandbyTrigger())
    4755              :                 {
    4756            0 :                     if (!warned_for_promote)
    4757            0 :                         ereport(WARNING,
    4758              :                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4759              :                                  errmsg("promotion is not possible because of insufficient parameter settings"),
    4760              : 
    4761              :                         /*
    4762              :                          * Repeat the detail from above so it's easy to find
    4763              :                          * in the log.
    4764              :                          */
    4765              :                                  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
    4766              :                                            param_name,
    4767              :                                            currValue,
    4768              :                                            minValue),
    4769              :                                  errhint("Restart the server after making the necessary configuration changes.")));
    4770            0 :                     warned_for_promote = true;
    4771              :                 }
    4772              : 
    4773              :                 /*
    4774              :                  * If recovery pause is requested then set it paused.  While
    4775              :                  * we are in the loop, user might resume and pause again so
    4776              :                  * set this every time.
    4777              :                  */
    4778            0 :                 ConfirmRecoveryPaused();
    4779              : 
    4780              :                 /*
    4781              :                  * We wait on a condition variable that will wake us as soon
    4782              :                  * as the pause ends, but we use a timeout so we can check the
    4783              :                  * above conditions periodically too.
    4784              :                  */
    4785            0 :                 ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
    4786              :                                             WAIT_EVENT_RECOVERY_PAUSE);
    4787              :             }
    4788            0 :             ConditionVariableCancelSleep();
    4789              :         }
    4790              : 
    4791            0 :         ereport(FATAL,
    4792              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4793              :                  errmsg("recovery aborted because of insufficient parameter settings"),
    4794              :         /* Repeat the detail from above so it's easy to find in the log. */
    4795              :                  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
    4796              :                            param_name,
    4797              :                            currValue,
    4798              :                            minValue),
    4799              :                  errhint("You can restart the server after making the necessary configuration changes.")));
    4800              :     }
    4801          685 : }
    4802              : 
    4803              : 
    4804              : /*
    4805              :  * GUC check_hook for primary_slot_name
    4806              :  */
    4807              : bool
    4808         1393 : check_primary_slot_name(char **newval, void **extra, GucSource source)
    4809              : {
    4810              :     int         err_code;
    4811         1393 :     char       *err_msg = NULL;
    4812         1393 :     char       *err_hint = NULL;
    4813              : 
    4814         1393 :     if (*newval && strcmp(*newval, "") != 0 &&
    4815          203 :         !ReplicationSlotValidateNameInternal(*newval, false, &err_code,
    4816              :                                              &err_msg, &err_hint))
    4817              :     {
    4818            0 :         GUC_check_errcode(err_code);
    4819            0 :         GUC_check_errdetail("%s", err_msg);
    4820            0 :         if (err_hint != NULL)
    4821            0 :             GUC_check_errhint("%s", err_hint);
    4822            0 :         return false;
    4823              :     }
    4824              : 
    4825         1393 :     return true;
    4826              : }
    4827              : 
    4828              : /*
    4829              :  * Recovery target settings: Only one of the several recovery_target* settings
    4830              :  * may be set.  Setting a second one results in an error.  The global variable
    4831              :  * recoveryTarget tracks which kind of recovery target was chosen.  Other
    4832              :  * variables store the actual target value (for example a string or a xid).
    4833              :  * The assign functions of the parameters check whether a competing parameter
    4834              :  * was already set.  But we want to allow setting the same parameter multiple
    4835              :  * times.  We also want to allow unsetting a parameter and setting a different
    4836              :  * one, so we unset recoveryTarget when the parameter is set to an empty
    4837              :  * string.
    4838              :  *
    4839              :  * XXX this code is broken by design.  Throwing an error from a GUC assign
    4840              :  * hook breaks fundamental assumptions of guc.c.  So long as all the variables
    4841              :  * for which this can happen are PGC_POSTMASTER, the consequences are limited,
    4842              :  * since we'd just abort postmaster startup anyway.  Nonetheless it's likely
    4843              :  * that we have odd behaviors such as unexpected GUC ordering dependencies.
    4844              :  */
    4845              : 
    4846              : pg_noreturn static void
    4847            1 : error_multiple_recovery_targets(void)
    4848              : {
    4849            1 :     ereport(ERROR,
    4850              :             (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4851              :              errmsg("multiple recovery targets specified"),
    4852              :              errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
    4853              : }
    4854              : 
    4855              : /*
    4856              :  * GUC check_hook for recovery_target
    4857              :  */
    4858              : bool
    4859         1190 : check_recovery_target(char **newval, void **extra, GucSource source)
    4860              : {
    4861         1190 :     if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
    4862              :     {
    4863            0 :         GUC_check_errdetail("The only allowed value is \"immediate\".");
    4864            0 :         return false;
    4865              :     }
    4866         1190 :     return true;
    4867              : }
    4868              : 
    4869              : /*
    4870              :  * GUC assign_hook for recovery_target
    4871              :  */
    4872              : void
    4873         1190 : assign_recovery_target(const char *newval, void *extra)
    4874              : {
    4875         1190 :     if (recoveryTarget != RECOVERY_TARGET_UNSET &&
    4876            0 :         recoveryTarget != RECOVERY_TARGET_IMMEDIATE)
    4877            0 :         error_multiple_recovery_targets();
    4878              : 
    4879         1190 :     if (newval && strcmp(newval, "") != 0)
    4880            1 :         recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
    4881              :     else
    4882         1189 :         recoveryTarget = RECOVERY_TARGET_UNSET;
    4883         1190 : }
    4884              : 
    4885              : /*
    4886              :  * GUC check_hook for recovery_target_lsn
    4887              :  */
    4888              : bool
    4889         1196 : check_recovery_target_lsn(char **newval, void **extra, GucSource source)
    4890              : {
    4891         1196 :     if (strcmp(*newval, "") != 0)
    4892              :     {
    4893              :         XLogRecPtr  lsn;
    4894              :         XLogRecPtr *myextra;
    4895            8 :         ErrorSaveContext escontext = {T_ErrorSaveContext};
    4896              : 
    4897            8 :         lsn = pg_lsn_in_safe(*newval, (Node *) &escontext);
    4898            8 :         if (escontext.error_occurred)
    4899            0 :             return false;
    4900              : 
    4901            8 :         myextra = (XLogRecPtr *) guc_malloc(LOG, sizeof(XLogRecPtr));
    4902            8 :         if (!myextra)
    4903            0 :             return false;
    4904            8 :         *myextra = lsn;
    4905            8 :         *extra = myextra;
    4906              :     }
    4907         1196 :     return true;
    4908              : }
    4909              : 
    4910              : /*
    4911              :  * GUC assign_hook for recovery_target_lsn
    4912              :  */
    4913              : void
    4914         1196 : assign_recovery_target_lsn(const char *newval, void *extra)
    4915              : {
    4916         1196 :     if (recoveryTarget != RECOVERY_TARGET_UNSET &&
    4917            0 :         recoveryTarget != RECOVERY_TARGET_LSN)
    4918            0 :         error_multiple_recovery_targets();
    4919              : 
    4920         1196 :     if (newval && strcmp(newval, "") != 0)
    4921              :     {
    4922            8 :         recoveryTarget = RECOVERY_TARGET_LSN;
    4923            8 :         recoveryTargetLSN = *((XLogRecPtr *) extra);
    4924              :     }
    4925              :     else
    4926         1188 :         recoveryTarget = RECOVERY_TARGET_UNSET;
    4927         1196 : }
    4928              : 
    4929              : /*
    4930              :  * GUC check_hook for recovery_target_name
    4931              :  */
    4932              : bool
    4933         1196 : check_recovery_target_name(char **newval, void **extra, GucSource source)
    4934              : {
    4935              :     /* Use the value of newval directly */
    4936         1196 :     if (strlen(*newval) >= MAXFNAMELEN)
    4937              :     {
    4938            0 :         GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
    4939              :                             "recovery_target_name", MAXFNAMELEN - 1);
    4940            0 :         return false;
    4941              :     }
    4942         1196 :     return true;
    4943              : }
    4944              : 
    4945              : /*
    4946              :  * GUC assign_hook for recovery_target_name
    4947              :  */
    4948              : void
    4949         1196 : assign_recovery_target_name(const char *newval, void *extra)
    4950              : {
    4951         1196 :     if (recoveryTarget != RECOVERY_TARGET_UNSET &&
    4952            0 :         recoveryTarget != RECOVERY_TARGET_NAME)
    4953            0 :         error_multiple_recovery_targets();
    4954              : 
    4955         1196 :     if (newval && strcmp(newval, "") != 0)
    4956              :     {
    4957            6 :         recoveryTarget = RECOVERY_TARGET_NAME;
    4958            6 :         recoveryTargetName = newval;
    4959              :     }
    4960              :     else
    4961         1190 :         recoveryTarget = RECOVERY_TARGET_UNSET;
    4962         1196 : }
    4963              : 
    4964              : /*
    4965              :  * GUC check_hook for recovery_target_time
    4966              :  *
    4967              :  * The interpretation of the recovery_target_time string can depend on the
    4968              :  * time zone setting, so we need to wait until after all GUC processing is
    4969              :  * done before we can do the final parsing of the string.  This check function
    4970              :  * only does a parsing pass to catch syntax errors, but we store the string
    4971              :  * and parse it again when we need to use it.
    4972              :  */
    4973              : bool
    4974         1192 : check_recovery_target_time(char **newval, void **extra, GucSource source)
    4975              : {
    4976         1192 :     if (strcmp(*newval, "") != 0)
    4977              :     {
    4978              :         /* reject some special values */
    4979            3 :         if (strcmp(*newval, "now") == 0 ||
    4980            3 :             strcmp(*newval, "today") == 0 ||
    4981            3 :             strcmp(*newval, "tomorrow") == 0 ||
    4982            3 :             strcmp(*newval, "yesterday") == 0)
    4983              :         {
    4984            0 :             return false;
    4985              :         }
    4986              : 
    4987              :         /*
    4988              :          * parse timestamp value (see also timestamptz_in())
    4989              :          */
    4990              :         {
    4991            3 :             char       *str = *newval;
    4992              :             fsec_t      fsec;
    4993              :             struct pg_tm tt,
    4994            3 :                        *tm = &tt;
    4995              :             int         tz;
    4996              :             int         dtype;
    4997              :             int         nf;
    4998              :             int         dterr;
    4999              :             char       *field[MAXDATEFIELDS];
    5000              :             int         ftype[MAXDATEFIELDS];
    5001              :             char        workbuf[MAXDATELEN + MAXDATEFIELDS];
    5002              :             DateTimeErrorExtra dtextra;
    5003              :             TimestampTz timestamp;
    5004              : 
    5005            3 :             dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
    5006              :                                   field, ftype, MAXDATEFIELDS, &nf);
    5007            3 :             if (dterr == 0)
    5008            3 :                 dterr = DecodeDateTime(field, ftype, nf,
    5009              :                                        &dtype, tm, &fsec, &tz, &dtextra);
    5010            3 :             if (dterr != 0)
    5011            0 :                 return false;
    5012            3 :             if (dtype != DTK_DATE)
    5013            0 :                 return false;
    5014              : 
    5015            3 :             if (tm2timestamp(tm, fsec, &tz, &timestamp) != 0)
    5016              :             {
    5017            0 :                 GUC_check_errdetail("Timestamp out of range: \"%s\".", str);
    5018            0 :                 return false;
    5019              :             }
    5020              :         }
    5021              :     }
    5022         1192 :     return true;
    5023              : }
    5024              : 
    5025              : /*
    5026              :  * GUC assign_hook for recovery_target_time
    5027              :  */
    5028              : void
    5029         1192 : assign_recovery_target_time(const char *newval, void *extra)
    5030              : {
    5031         1192 :     if (recoveryTarget != RECOVERY_TARGET_UNSET &&
    5032            1 :         recoveryTarget != RECOVERY_TARGET_TIME)
    5033            1 :         error_multiple_recovery_targets();
    5034              : 
    5035         1191 :     if (newval && strcmp(newval, "") != 0)
    5036            2 :         recoveryTarget = RECOVERY_TARGET_TIME;
    5037              :     else
    5038         1189 :         recoveryTarget = RECOVERY_TARGET_UNSET;
    5039         1191 : }
    5040              : 
    5041              : /*
    5042              :  * GUC check_hook for recovery_target_timeline
    5043              :  */
    5044              : bool
    5045         1193 : check_recovery_target_timeline(char **newval, void **extra, GucSource source)
    5046              : {
    5047              :     RecoveryTargetTimeLineGoal rttg;
    5048              :     RecoveryTargetTimeLineGoal *myextra;
    5049              : 
    5050         1193 :     if (strcmp(*newval, "current") == 0)
    5051            0 :         rttg = RECOVERY_TARGET_TIMELINE_CONTROLFILE;
    5052         1193 :     else if (strcmp(*newval, "latest") == 0)
    5053         1190 :         rttg = RECOVERY_TARGET_TIMELINE_LATEST;
    5054              :     else
    5055              :     {
    5056              :         char       *endp;
    5057              :         uint64      timeline;
    5058              : 
    5059            3 :         rttg = RECOVERY_TARGET_TIMELINE_NUMERIC;
    5060              : 
    5061            3 :         errno = 0;
    5062            3 :         timeline = strtou64(*newval, &endp, 0);
    5063              : 
    5064            3 :         if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
    5065              :         {
    5066            1 :             GUC_check_errdetail("\"%s\" is not a valid number.",
    5067              :                                 "recovery_target_timeline");
    5068            3 :             return false;
    5069              :         }
    5070              : 
    5071            2 :         if (timeline < 1 || timeline > PG_UINT32_MAX)
    5072              :         {
    5073            2 :             GUC_check_errdetail("\"%s\" must be between %u and %u.",
    5074              :                                 "recovery_target_timeline", 1, PG_UINT32_MAX);
    5075            2 :             return false;
    5076              :         }
    5077              :     }
    5078              : 
    5079         1190 :     myextra = (RecoveryTargetTimeLineGoal *) guc_malloc(LOG, sizeof(RecoveryTargetTimeLineGoal));
    5080         1190 :     if (!myextra)
    5081            0 :         return false;
    5082         1190 :     *myextra = rttg;
    5083         1190 :     *extra = myextra;
    5084              : 
    5085         1190 :     return true;
    5086              : }
    5087              : 
    5088              : /*
    5089              :  * GUC assign_hook for recovery_target_timeline
    5090              :  */
    5091              : void
    5092         1190 : assign_recovery_target_timeline(const char *newval, void *extra)
    5093              : {
    5094         1190 :     recoveryTargetTimeLineGoal = *((RecoveryTargetTimeLineGoal *) extra);
    5095         1190 :     if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
    5096            0 :         recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
    5097              :     else
    5098         1190 :         recoveryTargetTLIRequested = 0;
    5099         1190 : }
    5100              : 
    5101              : /*
    5102              :  * GUC check_hook for recovery_target_xid
    5103              :  */
    5104              : bool
    5105         1190 : check_recovery_target_xid(char **newval, void **extra, GucSource source)
    5106              : {
    5107         1190 :     if (strcmp(*newval, "") != 0)
    5108              :     {
    5109              :         TransactionId xid;
    5110              :         TransactionId *myextra;
    5111              : 
    5112            1 :         errno = 0;
    5113            1 :         xid = (TransactionId) strtou64(*newval, NULL, 0);
    5114            1 :         if (errno == EINVAL || errno == ERANGE)
    5115            0 :             return false;
    5116              : 
    5117            1 :         myextra = (TransactionId *) guc_malloc(LOG, sizeof(TransactionId));
    5118            1 :         if (!myextra)
    5119            0 :             return false;
    5120            1 :         *myextra = xid;
    5121            1 :         *extra = myextra;
    5122              :     }
    5123         1190 :     return true;
    5124              : }
    5125              : 
    5126              : /*
    5127              :  * GUC assign_hook for recovery_target_xid
    5128              :  */
    5129              : void
    5130         1190 : assign_recovery_target_xid(const char *newval, void *extra)
    5131              : {
    5132         1190 :     if (recoveryTarget != RECOVERY_TARGET_UNSET &&
    5133            0 :         recoveryTarget != RECOVERY_TARGET_XID)
    5134            0 :         error_multiple_recovery_targets();
    5135              : 
    5136         1190 :     if (newval && strcmp(newval, "") != 0)
    5137              :     {
    5138            1 :         recoveryTarget = RECOVERY_TARGET_XID;
    5139            1 :         recoveryTargetXid = *((TransactionId *) extra);
    5140              :     }
    5141              :     else
    5142         1189 :         recoveryTarget = RECOVERY_TARGET_UNSET;
    5143         1190 : }
        

Generated by: LCOV version 2.0-1