LCOV - code coverage report
Current view: top level - src/backend/access/transam - xlog.c (source / functions) Hit Total Coverage
Test: PostgreSQL 13beta1 Lines: 2765 3450 80.1 %
Date: 2020-05-25 05:06:35 Functions: 128 144 88.9 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * xlog.c
       4             :  *      PostgreSQL write-ahead log manager
       5             :  *
       6             :  *
       7             :  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
       8             :  * Portions Copyright (c) 1994, Regents of the University of California
       9             :  *
      10             :  * src/backend/access/transam/xlog.c
      11             :  *
      12             :  *-------------------------------------------------------------------------
      13             :  */
      14             : 
      15             : #include "postgres.h"
      16             : 
      17             : #include <ctype.h>
      18             : #include <math.h>
      19             : #include <time.h>
      20             : #include <fcntl.h>
      21             : #include <sys/stat.h>
      22             : #include <sys/time.h>
      23             : #include <unistd.h>
      24             : 
      25             : #include "access/clog.h"
      26             : #include "access/commit_ts.h"
      27             : #include "access/heaptoast.h"
      28             : #include "access/multixact.h"
      29             : #include "access/rewriteheap.h"
      30             : #include "access/subtrans.h"
      31             : #include "access/timeline.h"
      32             : #include "access/transam.h"
      33             : #include "access/twophase.h"
      34             : #include "access/xact.h"
      35             : #include "access/xlog_internal.h"
      36             : #include "access/xlogarchive.h"
      37             : #include "access/xloginsert.h"
      38             : #include "access/xlogreader.h"
      39             : #include "access/xlogutils.h"
      40             : #include "catalog/catversion.h"
      41             : #include "catalog/pg_control.h"
      42             : #include "catalog/pg_database.h"
      43             : #include "commands/progress.h"
      44             : #include "commands/tablespace.h"
      45             : #include "common/controldata_utils.h"
      46             : #include "executor/instrument.h"
      47             : #include "miscadmin.h"
      48             : #include "pg_trace.h"
      49             : #include "pgstat.h"
      50             : #include "port/atomics.h"
      51             : #include "postmaster/bgwriter.h"
      52             : #include "postmaster/startup.h"
      53             : #include "postmaster/walwriter.h"
      54             : #include "replication/basebackup.h"
      55             : #include "replication/logical.h"
      56             : #include "replication/origin.h"
      57             : #include "replication/slot.h"
      58             : #include "replication/snapbuild.h"
      59             : #include "replication/walreceiver.h"
      60             : #include "replication/walsender.h"
      61             : #include "storage/bufmgr.h"
      62             : #include "storage/fd.h"
      63             : #include "storage/ipc.h"
      64             : #include "storage/large_object.h"
      65             : #include "storage/latch.h"
      66             : #include "storage/pmsignal.h"
      67             : #include "storage/predicate.h"
      68             : #include "storage/proc.h"
      69             : #include "storage/procarray.h"
      70             : #include "storage/reinit.h"
      71             : #include "storage/smgr.h"
      72             : #include "storage/spin.h"
      73             : #include "storage/sync.h"
      74             : #include "utils/builtins.h"
      75             : #include "utils/guc.h"
      76             : #include "utils/memutils.h"
      77             : #include "utils/ps_status.h"
      78             : #include "utils/relmapper.h"
      79             : #include "utils/snapmgr.h"
      80             : #include "utils/timestamp.h"
      81             : 
      82             : extern uint32 bootstrap_data_checksum_version;
      83             : 
      84             : /* Unsupported old recovery command file names (relative to $PGDATA) */
      85             : #define RECOVERY_COMMAND_FILE   "recovery.conf"
      86             : #define RECOVERY_COMMAND_DONE   "recovery.done"
      87             : 
      88             : /* User-settable parameters */
      89             : int         max_wal_size_mb = 1024; /* 1 GB */
      90             : int         min_wal_size_mb = 80;   /* 80 MB */
      91             : int         wal_keep_segments = 0;
      92             : int         XLOGbuffers = -1;
      93             : int         XLogArchiveTimeout = 0;
      94             : int         XLogArchiveMode = ARCHIVE_MODE_OFF;
      95             : char       *XLogArchiveCommand = NULL;
      96             : bool        EnableHotStandby = false;
      97             : bool        fullPageWrites = true;
      98             : bool        wal_log_hints = false;
      99             : bool        wal_compression = false;
     100             : char       *wal_consistency_checking_string = NULL;
     101             : bool       *wal_consistency_checking = NULL;
     102             : bool        wal_init_zero = true;
     103             : bool        wal_recycle = true;
     104             : bool        log_checkpoints = false;
     105             : int         sync_method = DEFAULT_SYNC_METHOD;
     106             : int         wal_level = WAL_LEVEL_MINIMAL;
     107             : int         CommitDelay = 0;    /* precommit delay in microseconds */
     108             : int         CommitSiblings = 5; /* # concurrent xacts needed to sleep */
     109             : int         wal_retrieve_retry_interval = 5000;
     110             : int         max_slot_wal_keep_size_mb = -1;
     111             : 
     112             : #ifdef WAL_DEBUG
     113             : bool        XLOG_DEBUG = false;
     114             : #endif
     115             : 
     116             : int         wal_segment_size = DEFAULT_XLOG_SEG_SIZE;
     117             : 
     118             : /*
     119             :  * Number of WAL insertion locks to use. A higher value allows more insertions
     120             :  * to happen concurrently, but adds some CPU overhead to flushing the WAL,
     121             :  * which needs to iterate all the locks.
     122             :  */
     123             : #define NUM_XLOGINSERT_LOCKS  8
     124             : 
     125             : /*
     126             :  * Max distance from last checkpoint, before triggering a new xlog-based
     127             :  * checkpoint.
     128             :  */
     129             : int         CheckPointSegments;
     130             : 
     131             : /* Estimated distance between checkpoints, in bytes */
     132             : static double CheckPointDistanceEstimate = 0;
     133             : static double PrevCheckPointDistance = 0;
     134             : 
     135             : /*
     136             :  * GUC support
     137             :  */
     138             : const struct config_enum_entry sync_method_options[] = {
     139             :     {"fsync", SYNC_METHOD_FSYNC, false},
     140             : #ifdef HAVE_FSYNC_WRITETHROUGH
     141             :     {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
     142             : #endif
     143             : #ifdef HAVE_FDATASYNC
     144             :     {"fdatasync", SYNC_METHOD_FDATASYNC, false},
     145             : #endif
     146             : #ifdef OPEN_SYNC_FLAG
     147             :     {"open_sync", SYNC_METHOD_OPEN, false},
     148             : #endif
     149             : #ifdef OPEN_DATASYNC_FLAG
     150             :     {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
     151             : #endif
     152             :     {NULL, 0, false}
     153             : };
     154             : 
     155             : 
     156             : /*
     157             :  * Although only "on", "off", and "always" are documented,
     158             :  * we accept all the likely variants of "on" and "off".
     159             :  */
     160             : const struct config_enum_entry archive_mode_options[] = {
     161             :     {"always", ARCHIVE_MODE_ALWAYS, false},
     162             :     {"on", ARCHIVE_MODE_ON, false},
     163             :     {"off", ARCHIVE_MODE_OFF, false},
     164             :     {"true", ARCHIVE_MODE_ON, true},    /* undocumented variant of "on" */
     165             :     {"false", ARCHIVE_MODE_OFF, true},  /* undocumented variant of "off" */
     166             :     {"yes", ARCHIVE_MODE_ON, true},     /* undocumented variant of "on" */
     167             :     {"no", ARCHIVE_MODE_OFF, true},     /* undocumented variant of "off" */
     168             :     {"1", ARCHIVE_MODE_ON, true},       /* undocumented variant of "on" */
     169             :     {"0", ARCHIVE_MODE_OFF, true},      /* undocumented variant of "off" */
     170             :     {NULL, 0, false}
     171             : };
     172             : 
     173             : const struct config_enum_entry recovery_target_action_options[] = {
     174             :     {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
     175             :     {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
     176             :     {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
     177             :     {NULL, 0, false}
     178             : };
     179             : 
     180             : /*
     181             :  * Statistics for current checkpoint are collected in this global struct.
     182             :  * Because only the checkpointer or a stand-alone backend can perform
     183             :  * checkpoints, this will be unused in normal backends.
     184             :  */
     185             : CheckpointStatsData CheckpointStats;
     186             : 
     187             : /*
     188             :  * ThisTimeLineID will be same in all backends --- it identifies current
     189             :  * WAL timeline for the database system.
     190             :  */
     191             : TimeLineID  ThisTimeLineID = 0;
     192             : 
     193             : /*
     194             :  * Are we doing recovery from XLOG?
     195             :  *
     196             :  * This is only ever true in the startup process; it should be read as meaning
     197             :  * "this process is replaying WAL records", rather than "the system is in
     198             :  * recovery mode".  It should be examined primarily by functions that need
     199             :  * to act differently when called from a WAL redo function (e.g., to skip WAL
     200             :  * logging).  To check whether the system is in recovery regardless of which
     201             :  * process you're running in, use RecoveryInProgress() but only after shared
     202             :  * memory startup and lock initialization.
     203             :  */
     204             : bool        InRecovery = false;
     205             : 
     206             : /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
     207             : HotStandbyState standbyState = STANDBY_DISABLED;
     208             : 
     209             : static XLogRecPtr LastRec;
     210             : 
     211             : /* Local copy of WalRcv->flushedUpto */
     212             : static XLogRecPtr flushedUpto = 0;
     213             : static TimeLineID receiveTLI = 0;
     214             : 
     215             : /*
     216             :  * During recovery, lastFullPageWrites keeps track of full_page_writes that
     217             :  * the replayed WAL records indicate. It's initialized with full_page_writes
     218             :  * that the recovery starting checkpoint record indicates, and then updated
     219             :  * each time XLOG_FPW_CHANGE record is replayed.
     220             :  */
     221             : static bool lastFullPageWrites;
     222             : 
     223             : /*
     224             :  * Local copy of the state tracked by SharedRecoveryState in shared memory.
     225             :  * It is false if SharedRecoveryState is RECOVERY_STATE_DONE.  True actually
     226             :  * means "not known, need to check the shared state".
     227             :  */
     228             : static bool LocalRecoveryInProgress = true;
     229             : 
     230             : /*
     231             :  * Local copy of SharedHotStandbyActive variable. False actually means "not
     232             :  * known, need to check the shared state".
     233             :  */
     234             : static bool LocalHotStandbyActive = false;
     235             : 
     236             : /*
     237             :  * Local copy of SharedPromoteIsTriggered variable. False actually means "not
     238             :  * known, need to check the shared state".
     239             :  */
     240             : static bool LocalPromoteIsTriggered = false;
     241             : 
     242             : /*
     243             :  * Local state for XLogInsertAllowed():
     244             :  *      1: unconditionally allowed to insert XLOG
     245             :  *      0: unconditionally not allowed to insert XLOG
     246             :  *      -1: must check RecoveryInProgress(); disallow until it is false
     247             :  * Most processes start with -1 and transition to 1 after seeing that recovery
     248             :  * is not in progress.  But we can also force the value for special cases.
     249             :  * The coding in XLogInsertAllowed() depends on the first two of these states
     250             :  * being numerically the same as bool true and false.
     251             :  */
     252             : static int  LocalXLogInsertAllowed = -1;
     253             : 
     254             : /*
     255             :  * When ArchiveRecoveryRequested is set, archive recovery was requested,
     256             :  * ie. signal files were present. When InArchiveRecovery is set, we are
     257             :  * currently recovering using offline XLOG archives. These variables are only
     258             :  * valid in the startup process.
     259             :  *
     260             :  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
     261             :  * currently performing crash recovery using only XLOG files in pg_wal, but
     262             :  * will switch to using offline XLOG archives as soon as we reach the end of
     263             :  * WAL in pg_wal.
     264             : */
     265             : bool        ArchiveRecoveryRequested = false;
     266             : bool        InArchiveRecovery = false;
     267             : 
     268             : static bool standby_signal_file_found = false;
     269             : static bool recovery_signal_file_found = false;
     270             : 
     271             : /* Was the last xlog file restored from archive, or local? */
     272             : static bool restoredFromArchive = false;
     273             : 
     274             : /* Buffers dedicated to consistency checks of size BLCKSZ */
     275             : static char *replay_image_masked = NULL;
     276             : static char *master_image_masked = NULL;
     277             : 
     278             : /* options formerly taken from recovery.conf for archive recovery */
     279             : char       *recoveryRestoreCommand = NULL;
     280             : char       *recoveryEndCommand = NULL;
     281             : char       *archiveCleanupCommand = NULL;
     282             : RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
     283             : bool        recoveryTargetInclusive = true;
     284             : int         recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
     285             : TransactionId recoveryTargetXid;
     286             : char       *recovery_target_time_string;
     287             : static TimestampTz recoveryTargetTime;
     288             : const char *recoveryTargetName;
     289             : XLogRecPtr  recoveryTargetLSN;
     290             : int         recovery_min_apply_delay = 0;
     291             : 
     292             : /* options formerly taken from recovery.conf for XLOG streaming */
     293             : bool        StandbyModeRequested = false;
     294             : char       *PrimaryConnInfo = NULL;
     295             : char       *PrimarySlotName = NULL;
     296             : char       *PromoteTriggerFile = NULL;
     297             : bool        wal_receiver_create_temp_slot = false;
     298             : 
     299             : /* are we currently in standby mode? */
     300             : bool        StandbyMode = false;
     301             : 
     302             : /* whether request for fast promotion has been made yet */
     303             : static bool fast_promote = false;
     304             : 
     305             : /*
     306             :  * if recoveryStopsBefore/After returns true, it saves information of the stop
     307             :  * point here
     308             :  */
     309             : static TransactionId recoveryStopXid;
     310             : static TimestampTz recoveryStopTime;
     311             : static XLogRecPtr recoveryStopLSN;
     312             : static char recoveryStopName[MAXFNAMELEN];
     313             : static bool recoveryStopAfter;
     314             : 
     315             : /*
     316             :  * During normal operation, the only timeline we care about is ThisTimeLineID.
     317             :  * During recovery, however, things are more complicated.  To simplify life
     318             :  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
     319             :  * scan through the WAL history (that is, it is the line that was active when
     320             :  * the currently-scanned WAL record was generated).  We also need these
     321             :  * timeline values:
     322             :  *
     323             :  * recoveryTargetTimeLineGoal: what the user requested, if any
     324             :  *
     325             :  * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
     326             :  *
     327             :  * recoveryTargetTLI: the currently understood target timeline; changes
     328             :  *
     329             :  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
     330             :  * its known parents, newest first (so recoveryTargetTLI is always the
     331             :  * first list member).  Only these TLIs are expected to be seen in the WAL
     332             :  * segments we read, and indeed only these TLIs will be considered as
     333             :  * candidate WAL files to open at all.
     334             :  *
     335             :  * curFileTLI: the TLI appearing in the name of the current input WAL file.
     336             :  * (This is not necessarily the same as ThisTimeLineID, because we could
     337             :  * be scanning data that was copied from an ancestor timeline when the current
     338             :  * file was created.)  During a sequential scan we do not allow this value
     339             :  * to decrease.
     340             :  */
     341             : RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
     342             : TimeLineID  recoveryTargetTLIRequested = 0;
     343             : TimeLineID  recoveryTargetTLI = 0;
     344             : static List *expectedTLEs;
     345             : static TimeLineID curFileTLI;
     346             : 
     347             : /*
     348             :  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
     349             :  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
     350             :  * end+1 of the last record, and is reset when we end a top-level transaction,
     351             :  * or start a new one; so it can be used to tell if the current transaction has
     352             :  * created any XLOG records.
     353             :  *
     354             :  * While in parallel mode, this may not be fully up to date.  When committing,
     355             :  * a transaction can assume this covers all xlog records written either by the
     356             :  * user backend or by any parallel worker which was present at any point during
     357             :  * the transaction.  But when aborting, or when still in parallel mode, other
     358             :  * parallel backends may have written WAL records at later LSNs than the value
     359             :  * stored here.  The parallel leader advances its own copy, when necessary,
     360             :  * in WaitForParallelWorkersToFinish.
     361             :  */
     362             : XLogRecPtr  ProcLastRecPtr = InvalidXLogRecPtr;
     363             : XLogRecPtr  XactLastRecEnd = InvalidXLogRecPtr;
     364             : XLogRecPtr  XactLastCommitEnd = InvalidXLogRecPtr;
     365             : 
     366             : /*
     367             :  * RedoRecPtr is this backend's local copy of the REDO record pointer
     368             :  * (which is almost but not quite the same as a pointer to the most recent
     369             :  * CHECKPOINT record).  We update this from the shared-memory copy,
     370             :  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
     371             :  * hold an insertion lock).  See XLogInsertRecord for details.  We are also
     372             :  * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
     373             :  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
     374             :  * InitXLOGAccess.
     375             :  */
     376             : static XLogRecPtr RedoRecPtr;
     377             : 
     378             : /*
     379             :  * doPageWrites is this backend's local copy of (forcePageWrites ||
     380             :  * fullPageWrites).  It is used together with RedoRecPtr to decide whether
     381             :  * a full-page image of a page needs to be taken.
     382             :  */
     383             : static bool doPageWrites;
     384             : 
     385             : /* Has the recovery code requested a walreceiver wakeup? */
     386             : static bool doRequestWalReceiverReply;
     387             : 
     388             : /*
     389             :  * RedoStartLSN points to the checkpoint's REDO location which is specified
     390             :  * in a backup label file, backup history file or control file. In standby
     391             :  * mode, XLOG streaming usually starts from the position where an invalid
     392             :  * record was found. But if we fail to read even the initial checkpoint
     393             :  * record, we use the REDO location instead of the checkpoint location as
     394             :  * the start position of XLOG streaming. Otherwise we would have to jump
     395             :  * backwards to the REDO location after reading the checkpoint record,
     396             :  * because the REDO record can precede the checkpoint record.
     397             :  */
     398             : static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
     399             : 
     400             : /*----------
     401             :  * Shared-memory data structures for XLOG control
     402             :  *
     403             :  * LogwrtRqst indicates a byte position that we need to write and/or fsync
     404             :  * the log up to (all records before that point must be written or fsynced).
     405             :  * LogwrtResult indicates the byte positions we have already written/fsynced.
     406             :  * These structs are identical but are declared separately to indicate their
     407             :  * slightly different functions.
     408             :  *
     409             :  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
     410             :  * WALWriteLock.  To update it, you need to hold both locks.  The point of
     411             :  * this arrangement is that the value can be examined by code that already
     412             :  * holds WALWriteLock without needing to grab info_lck as well.  In addition
     413             :  * to the shared variable, each backend has a private copy of LogwrtResult,
     414             :  * which is updated when convenient.
     415             :  *
     416             :  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
     417             :  * (protected by info_lck), but we don't need to cache any copies of it.
     418             :  *
     419             :  * info_lck is only held long enough to read/update the protected variables,
     420             :  * so it's a plain spinlock.  The other locks are held longer (potentially
     421             :  * over I/O operations), so we use LWLocks for them.  These locks are:
     422             :  *
     423             :  * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
     424             :  * It is only held while initializing and changing the mapping.  If the
     425             :  * contents of the buffer being replaced haven't been written yet, the mapping
     426             :  * lock is released while the write is done, and reacquired afterwards.
     427             :  *
     428             :  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
     429             :  * XLogFlush).
     430             :  *
     431             :  * ControlFileLock: must be held to read/update control file or create
     432             :  * new log file.
     433             :  *
     434             :  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
     435             :  * only one checkpointer at a time; currently, with all checkpoints done by
     436             :  * the checkpointer, this is just pro forma).
     437             :  *
     438             :  *----------
     439             :  */
     440             : 
     441             : typedef struct XLogwrtRqst      /* positions we need to write/fsync up to */
     442             : {
     443             :     XLogRecPtr  Write;          /* last byte + 1 to write out */
     444             :     XLogRecPtr  Flush;          /* last byte + 1 to flush */
     445             : } XLogwrtRqst;
     446             : 
     447             : typedef struct XLogwrtResult    /* positions already written/fsynced */
     448             : {
     449             :     XLogRecPtr  Write;          /* last byte + 1 written out */
     450             :     XLogRecPtr  Flush;          /* last byte + 1 flushed */
     451             : } XLogwrtResult;
     452             : 
     453             : /*
     454             :  * Inserting to WAL is protected by a small fixed number of WAL insertion
     455             :  * locks. To insert to the WAL, you must hold one of the locks - it doesn't
     456             :  * matter which one. To lock out other concurrent insertions, you must hold
     457             :  * all of them. Each WAL insertion lock consists of a lightweight lock, plus an
     458             :  * indicator of how far the insertion has progressed (insertingAt).
     459             :  *
     460             :  * The insertingAt values are read when a process wants to flush WAL from
     461             :  * the in-memory buffers to disk, to check that all the insertions to the
     462             :  * region the process is about to write out have finished. You could simply
     463             :  * wait for all currently in-progress insertions to finish, but the
     464             :  * insertingAt indicator allows you to ignore insertions to later in the WAL,
     465             :  * so that you only wait for the insertions that are modifying the buffers
     466             :  * you're about to write out.
     467             :  *
     468             :  * This isn't just an optimization. If all the WAL buffers are dirty, an
     469             :  * inserter that's holding a WAL insert lock might need to evict an old WAL
     470             :  * buffer, which requires flushing the WAL. If it's possible for an inserter
     471             :  * to block on another inserter unnecessarily, deadlock can arise when two
     472             :  * inserters holding a WAL insert lock wait for each other to finish their
     473             :  * insertion.
     474             :  *
     475             :  * Small WAL records that don't cross a page boundary never update the value,
     476             :  * the WAL record is just copied to the page and the lock is released. But
     477             :  * to avoid the deadlock-scenario explained above, the indicator is always
     478             :  * updated before sleeping while holding an insertion lock.
     479             :  *
     480             :  * lastImportantAt contains the LSN of the last important WAL record inserted
     481             :  * using a given lock. This value is used to detect if there has been
     482             :  * important WAL activity since the last time some action, like a checkpoint,
     483             :  * was performed, so the action can be skipped if there was none. The LSN is
     484             :  * updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
     485             :  * set. lastImportantAt is never cleared, only overwritten by the LSN of newer
     486             :  * records.  Tracking the WAL activity directly in WALInsertLock has the
     487             :  * advantage of not needing any additional locks to update the value.
     488             :  */
     489             : typedef struct
     490             : {
     491             :     LWLock      lock;           /* the lightweight lock itself */
     492             :     XLogRecPtr  insertingAt;    /* how far the insertion has progressed */
     493             :     XLogRecPtr  lastImportantAt;    /* LSN of last important record */
     494             : } WALInsertLock;
     495             : 
     496             : /*
     497             :  * All the WAL insertion locks are allocated as an array in shared memory. We
     498             :  * force the array stride to be a power of 2, which saves a few cycles in
     499             :  * indexing, but more importantly also ensures that individual slots don't
     500             :  * cross cache line boundaries. (Of course, we have to also ensure that the
     501             :  * array start address is suitably aligned.)
     502             :  */
     503             : typedef union WALInsertLockPadded
     504             : {
     505             :     WALInsertLock l;            /* the actual insertion lock slot */
     506             :     char        pad[PG_CACHE_LINE_SIZE];    /* makes the union at least this big */
     507             : } WALInsertLockPadded;
     508             : 
     509             : /*
     510             :  * State of an exclusive backup, necessary to control concurrent activities
     511             :  * across sessions when working on exclusive backups.
     512             :  *
     513             :  * EXCLUSIVE_BACKUP_NONE means that there is no exclusive backup actually
     514             :  * running, to be more precise pg_start_backup() is not being executed for
     515             :  * an exclusive backup and there is no exclusive backup in progress.
     516             :  * EXCLUSIVE_BACKUP_STARTING means that pg_start_backup() is starting an
     517             :  * exclusive backup.
     518             :  * EXCLUSIVE_BACKUP_IN_PROGRESS means that pg_start_backup() has finished
     519             :  * running and an exclusive backup is in progress. pg_stop_backup() is
     520             :  * needed to finish it.
     521             :  * EXCLUSIVE_BACKUP_STOPPING means that pg_stop_backup() is stopping an
     522             :  * exclusive backup.
     523             :  */
     524             : typedef enum ExclusiveBackupState
     525             : {
     526             :     EXCLUSIVE_BACKUP_NONE = 0,  /* no exclusive backup starting or in progress */
     527             :     EXCLUSIVE_BACKUP_STARTING,  /* pg_start_backup() is starting one */
     528             :     EXCLUSIVE_BACKUP_IN_PROGRESS,   /* started; pg_stop_backup() is needed to finish it */
     529             :     EXCLUSIVE_BACKUP_STOPPING   /* pg_stop_backup() is stopping it */
     530             : } ExclusiveBackupState;
     531             : 
     532             : /*
     533             :  * Backup status of this session, used for sanity checks in the SQL-callable
     534             :  * functions that start and stop backups.  Starts out as SESSION_BACKUP_NONE.
     535             :  */
     536             : static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
     537             : 
     538             : /*
     539             :  * Shared state data for WAL insertion.
     540             :  */
     541             : typedef struct XLogCtlInsert
     542             : {
     543             :     slock_t     insertpos_lck;  /* protects CurrBytePos and PrevBytePos */
     544             : 
     545             :     /*
     546             :      * CurrBytePos is the end of reserved WAL. The next record will be
     547             :      * inserted at that position. PrevBytePos is the start position of the
     548             :      * previously inserted (or rather, reserved) record - it is copied to the
     549             :      * prev-link of the next record. These are stored as "usable byte
     550             :      * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
     551             :      */
     552             :     uint64      CurrBytePos;
     553             :     uint64      PrevBytePos;
     554             : 
     555             :     /*
     556             :      * Make sure the above heavily-contended spinlock and byte positions are
     557             :      * on their own cache line. In particular, the RedoRecPtr and full page
     558             :      * write variables below should be on a different cache line. They are
     559             :      * read on every WAL insertion, but updated rarely, and we don't want
     560             :      * those reads to steal the cache line containing Curr/PrevBytePos.
     561             :      */
     562             :     char        pad[PG_CACHE_LINE_SIZE];    /* spacer only; never read or written */
     563             : 
     564             :     /*
     565             :      * fullPageWrites is the master copy used by all backends to determine
     566             :      * whether to write full-page images to WAL, instead of using process-local one.
     567             :      * This is required because, when full_page_writes is changed by SIGHUP,
     568             :      * we must WAL-log it before it actually affects WAL-logging by backends.
     569             :      * Checkpointer sets at startup or after SIGHUP.
     570             :      *
     571             :      * To read these fields, you must hold an insertion lock. To modify them,
     572             :      * you must hold ALL the locks.
     573             :      */
     574             :     XLogRecPtr  RedoRecPtr;     /* current redo point for insertions */
     575             :     bool        forcePageWrites;    /* forcing full-page writes for PITR? */
     576             :     bool        fullPageWrites; /* master copy of full_page_writes, see above */
     577             : 
     578             :     /*
     579             :      * exclusiveBackupState indicates the state of an exclusive backup (see
     580             :      * comments of ExclusiveBackupState for more details). nonExclusiveBackups
     581             :      * is a counter indicating the number of streaming base backups currently
     582             :      * in progress. forcePageWrites is set to true when either of these is
     583             :      * non-zero. lastBackupStart is the latest checkpoint redo location used
     584             :      * as a starting point for an online backup.
     585             :      */
     586             :     ExclusiveBackupState exclusiveBackupState;
     587             :     int         nonExclusiveBackups;
     588             :     XLogRecPtr  lastBackupStart;
     589             : 
     590             :     /*
     591             :      * WAL insertion locks.
     592             :      */
     593             :     WALInsertLockPadded *WALInsertLocks;    /* array of cache-line-padded locks in shared memory */
     594             : } XLogCtlInsert;
     595             : 
     596             : /*
     597             :  * Total shared-memory state for XLOG.
     598             :  */
     599             : typedef struct XLogCtlData
     600             : {
     601             :     XLogCtlInsert Insert;       /* shared state data for WAL insertion */
     602             : 
     603             :     /* Protected by info_lck: */
     604             :     XLogwrtRqst LogwrtRqst;
     605             :     XLogRecPtr  RedoRecPtr;     /* a recent copy of Insert->RedoRecPtr */
     606             :     FullTransactionId ckptFullXid;  /* nextFullXid of latest checkpoint */
     607             :     XLogRecPtr  asyncXactLSN;   /* LSN of newest async commit/abort */
     608             :     XLogRecPtr  replicationSlotMinLSN;  /* oldest LSN needed by any slot */
     609             : 
     610             :     XLogSegNo   lastRemovedSegNo;   /* latest removed/recycled XLOG segment */
     611             : 
     612             :     /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
     613             :     XLogRecPtr  unloggedLSN;
     614             :     slock_t     ulsn_lck;
     615             : 
     616             :     /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
     617             :     pg_time_t   lastSegSwitchTime;
     618             :     XLogRecPtr  lastSegSwitchLSN;
     619             : 
     620             :     /*
     621             :      * Protected by info_lck and WALWriteLock (you must hold either lock to
     622             :      * read it, but both to update)
     623             :      */
     624             :     XLogwrtResult LogwrtResult;
     625             : 
     626             :     /*
     627             :      * Latest initialized page in the cache (last byte position + 1).
     628             :      *
     629             :      * To change the identity of a buffer (and InitializedUpTo), you need to
     630             :      * hold WALBufMappingLock.  To change the identity of a buffer that's
     631             :      * still dirty, the old page needs to be written out first, and for that
     632             :      * you need WALWriteLock, and you need to ensure that there are no
     633             :      * in-progress insertions to the page by calling
     634             :      * WaitXLogInsertionsToFinish().
     635             :      */
     636             :     XLogRecPtr  InitializedUpTo;
     637             : 
     638             :     /*
     639             :      * These values do not change after startup, although the pointed-to pages
     640             :      * and xlblocks values certainly do.  xlblocks values are protected by
     641             :      * WALBufMappingLock.
     642             :      */
     643             :     char       *pages;          /* buffers for unwritten XLOG pages */
     644             :     XLogRecPtr *xlblocks;       /* 1st byte ptr-s + XLOG_BLCKSZ */
     645             :     int         XLogCacheBlck;  /* highest allocated xlog buffer index */
     646             : 
     647             :     /*
     648             :      * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
     649             :      * If we created a new timeline when the system was started up,
     650             :      * PrevTimeLineID is the old timeline's ID that we forked off from.
     651             :      * Otherwise it's equal to ThisTimeLineID.
     652             :      */
     653             :     TimeLineID  ThisTimeLineID;
     654             :     TimeLineID  PrevTimeLineID;
     655             : 
     656             :     /*
     657             :      * SharedRecoveryState indicates if we're still in crash or archive
     658             :      * recovery.  Protected by info_lck.
     659             :      */
     660             :     RecoveryState SharedRecoveryState;
     661             : 
     662             :     /*
     663             :      * SharedHotStandbyActive indicates if we allow hot standby queries to be
     664             :      * run.  Protected by info_lck.
     665             :      */
     666             :     bool        SharedHotStandbyActive;
     667             : 
     668             :     /*
     669             :      * SharedPromoteIsTriggered indicates if a standby promotion has been
     670             :      * triggered.  Protected by info_lck.
     671             :      */
     672             :     bool        SharedPromoteIsTriggered;
     673             : 
     674             :     /*
     675             :      * WalWriterSleeping indicates whether the WAL writer is currently in
     676             :      * low-power mode (and hence should be nudged if an async commit occurs).
     677             :      * Protected by info_lck.
     678             :      */
     679             :     bool        WalWriterSleeping;
     680             : 
     681             :     /*
     682             :      * recoveryWakeupLatch is used to wake up the startup process to continue
     683             :      * WAL replay, if it is waiting for WAL to arrive or failover trigger file
     684             :      * to appear.
     685             :      */
     686             :     Latch       recoveryWakeupLatch;
     687             : 
     688             :     /*
     689             :      * During recovery, we keep a copy of the latest checkpoint record here.
     690             :      * lastCheckPointRecPtr points to start of checkpoint record and
     691             :      * lastCheckPointEndPtr points to end+1 of checkpoint record.  Used by the
     692             :      * checkpointer when it wants to create a restartpoint.
     693             :      *
     694             :      * Protected by info_lck.
     695             :      */
     696             :     XLogRecPtr  lastCheckPointRecPtr;
     697             :     XLogRecPtr  lastCheckPointEndPtr;
     698             :     CheckPoint  lastCheckPoint; /* the copied checkpoint record itself */
     699             : 
     700             :     /*
     701             :      * lastReplayedEndRecPtr points to end+1 of the last record successfully
     702             :      * replayed. When we're currently replaying a record, ie. in a redo
     703             :      * function, replayEndRecPtr points to the end+1 of the record being
     704             :      * replayed, otherwise it's equal to lastReplayedEndRecPtr.
     705             :      */
     706             :     XLogRecPtr  lastReplayedEndRecPtr;
     707             :     TimeLineID  lastReplayedTLI;
     708             :     XLogRecPtr  replayEndRecPtr;
     709             :     TimeLineID  replayEndTLI;
     710             :     /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
     711             :     TimestampTz recoveryLastXTime;
     712             : 
     713             :     /*
     714             :      * timestamp of when we started replaying the current chunk of WAL data,
     715             :      * only relevant for replication or archive recovery
     716             :      */
     717             :     TimestampTz currentChunkStartTime;
     718             :     /* Are we requested to pause recovery? */
     719             :     bool        recoveryPause;
     720             : 
     721             :     /*
     722             :      * lastFpwDisableRecPtr points to the start of the last replayed
     723             :      * XLOG_FPW_CHANGE record indicating that full_page_writes is disabled.
     724             :      */
     725             :     XLogRecPtr  lastFpwDisableRecPtr;
     726             : 
     727             :     slock_t     info_lck;       /* locks shared variables shown above */
     728             : } XLogCtlData;
     729             : 
     730             : static XLogCtlData *XLogCtl = NULL; /* total shared-memory XLOG state; starts out NULL */
     731             : 
     732             : /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
     733             : static WALInsertLockPadded *WALInsertLocks = NULL;
     734             : 
     735             : /*
     736             :  * We maintain an image of pg_control in shared memory.
     737             :  */
     738             : static ControlFileData *ControlFile = NULL; /* points at that image; starts out NULL */
     739             : 
     740             : /*
     741             :  * Calculate the amount of space left on the page after 'endptr'; the result
     742             :  * is 0 when 'endptr' is exactly on a page boundary. Beware multiple evaluation!
     743             :  */
     744             : #define INSERT_FREESPACE(endptr)    \
     745             :     (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
     746             : 
     747             : /* Macro to advance to next buffer index, wrapping to 0 after XLogCacheBlck. */
     748             : #define NextBufIdx(idx)     \
     749             :         (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
     750             : 
     751             : /*
     752             :  * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or would
     753             :  * hold if it was in cache, the page containing 'recptr' (page number modulo buffer count).
     754             :  */
     755             : #define XLogRecPtrToBufIdx(recptr)  \
     756             :     (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
     757             : 
     758             : /*
     759             :  * Number of bytes in a WAL page usable for WAL data (XLOG_BLCKSZ less SizeOfXLogShortPHD).
     760             :  */
     761             : #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
     762             : 
     763             : /* Convert a GUC value measured in megabytes to the equivalent segment count; 'segsize' is in bytes. */
     764             : #define ConvertToXSegs(x, segsize)  \
     765             :     ((x) / ((segsize) / (1024 * 1024)))
     766             : 
     767             : /* The number of bytes in a WAL segment usable for WAL data. */
     768             : static int  UsableBytesInSegment;   /* no initializer here; NOTE(review): presumably computed once the segment size is known -- confirm */
     769             : 
     770             : /*
     771             :  * Private, possibly out-of-date copy of shared LogwrtResult.
     772             :  * See discussion above.
     773             :  */
     774             : static XLogwrtResult LogwrtResult = {0, 0};
     775             : 
     776             : /*
     777             :  * Codes indicating where we got a WAL file from during recovery, or where
     778             :  * to attempt to get one.
     779             :  */
     780             : typedef enum
     781             : {
     782             :     XLOG_FROM_ANY = 0,          /* request to read WAL from any source */
     783             :     XLOG_FROM_ARCHIVE,          /* restored using restore_command */
     784             :     XLOG_FROM_PG_WAL,           /* existing file in pg_wal */
     785             :     XLOG_FROM_STREAM            /* streamed from master */
     786             : } XLogSource;
     787             : 
     788             : /* human-readable names for XLogSources, for debugging output; entries must stay in enum order */
     789             : static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
     790             : 
     791             : /*
     792             :  * openLogFile is -1 or a kernel FD for an open log file segment.
     793             :  * openLogSegNo identifies the segment.  These variables are only used to
     794             :  * write the XLOG, and so will normally refer to the active segment.
     795             :  * Note: call Reserve/ReleaseExternalFD to track consumption of this FD.
     796             :  */
     797             : static int  openLogFile = -1;
     798             : static XLogSegNo openLogSegNo = 0;
     799             : 
     800             : /*
     801             :  * These variables are used similarly to the ones above, but for reading
     802             :  * the XLOG.  Note, however, that readOff generally represents the offset
     803             :  * of the page just read, not the seek position of the FD itself, which
     804             :  * will be just past that page. readLen indicates how much of the current
     805             :  * page has been read into readBuf, and readSource indicates where we got
     806             :  * the currently open file from.
     807             :  * Note: we could use Reserve/ReleaseExternalFD to track consumption of
     808             :  * this FD too; but it doesn't currently seem worthwhile, since the XLOG is
     809             :  * not read by general-purpose sessions.
     810             :  */
     811             : static int  readFile = -1;      /* read-side counterpart of openLogFile */
     812             : static XLogSegNo readSegNo = 0; /* read-side counterpart of openLogSegNo */
     813             : static uint32 readOff = 0;      /* offset of the page just read */
     814             : static uint32 readLen = 0;      /* how much of the current page is in readBuf */
     815             : static XLogSource readSource = XLOG_FROM_ANY;   /* where the open file came from */
     816             : 
     817             : /*
     818             :  * Keeps track of which source we're currently reading from. This is
     819             :  * different from readSource in that this is always set, even when we don't
     820             :  * currently have a WAL file open. If lastSourceFailed is set, our last
     821             :  * attempt to read from currentSource failed, and we should try another source
     822             :  * next.
     823             :  *
     824             :  * pendingWalRcvRestart is set when a config change occurs that requires a
     825             :  * walreceiver restart.  This is only valid in XLOG_FROM_STREAM state.
     826             :  */
     827             : static XLogSource currentSource = XLOG_FROM_ANY;    /* source currently being read from */
     828             : static bool lastSourceFailed = false;   /* last read from currentSource failed? */
     829             : static bool pendingWalRcvRestart = false;   /* walreceiver restart needed? */
     830             : 
     831             : typedef struct XLogPageReadPrivate
     832             : {
     833             :     int         emode;          /* NOTE(review): presumably the elog level for read failures -- confirm */
     834             :     bool        fetching_ckpt;  /* are we fetching a checkpoint record? */
     835             :     bool        randAccess;     /* NOTE(review): meaning not visible here -- confirm against XLogPageRead */
     836             : } XLogPageReadPrivate;
     837             : 
     838             : /*
     839             :  * These variables track when we last obtained some WAL data to process,
     840             :  * and where we got it from.  (XLogReceiptSource is initially the same as
     841             :  * readSource, but readSource gets reset to zero when we don't have data
     842             :  * to process right now.  It is also different from currentSource, which
     843             :  * also changes when we try to read from a source and fail, while
     844             :  * XLogReceiptSource tracks where we last successfully read some WAL.)
     845             :  */
     846             : static TimestampTz XLogReceiptTime = 0; /* when we last obtained WAL data */
     847             : static XLogSource XLogReceiptSource = XLOG_FROM_ANY;    /* where we last got it from */
     848             : 
     849             : /* State information for XLOG reading */
     850             : static XLogRecPtr ReadRecPtr;   /* start of last record read */
     851             : static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
     852             : 
     853             : /*
     854             :  * Local copies of equivalent fields in the control file.  When running
     855             :  * crash recovery, minRecoveryPoint is set to InvalidXLogRecPtr as we
     856             :  * expect to replay all the WAL available, and updateMinRecoveryPoint is
     857             :  * switched to false to prevent any updates while replaying records.
     858             :  * Those values are kept consistent as long as crash recovery runs.
     859             :  */
     860             : static XLogRecPtr minRecoveryPoint;
     861             : static TimeLineID minRecoveryPointTLI;
     862             : static bool updateMinRecoveryPoint = true;
     863             : 
     864             : /*
     865             :  * Have we reached a consistent database state? In crash recovery, we have
     866             :  * to replay all the WAL, so reachedConsistency is never set. During archive
     867             :  * recovery, the database is consistent once minRecoveryPoint is reached.
     868             :  */
     869             : bool        reachedConsistency = false;
     870             : 
     871             : static bool InRedo = false;     /* NOTE(review): presumably true only while WAL redo is running -- confirm */
     872             : 
     873             : /* Have we launched bgwriter during recovery? */
     874             : static bool bgwriterLaunched = false;
     875             : 
     876             : /* For WALInsertLockAcquire/Release functions */
     877             : static int  MyLockNo = 0;       /* NOTE(review): presumably index of the insertion lock in use -- confirm */
     878             : static bool holdingAllLocks = false;    /* NOTE(review): presumably set while all insertion locks are held -- confirm */
     879             : 
     880             : #ifdef WAL_DEBUG
     881             : static MemoryContext walDebugCxt = NULL;
     882             : #endif
     883             : /* Prototypes for file-local (static) functions. */
     884             : static void readRecoverySignalFile(void);
     885             : static void validateRecoveryParameters(void);
     886             : static void exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog);
     887             : static bool recoveryStopsBefore(XLogReaderState *record);
     888             : static bool recoveryStopsAfter(XLogReaderState *record);
     889             : static void recoveryPausesHere(bool endOfRecovery);
     890             : static bool recoveryApplyDelay(XLogReaderState *record);
     891             : static void SetLatestXTime(TimestampTz xtime);
     892             : static void SetCurrentChunkStartTime(TimestampTz xtime);
     893             : static void CheckRequiredParameterValues(void);
     894             : static void XLogReportParameters(void);
     895             : static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
     896             :                                 TimeLineID prevTLI);
     897             : static void LocalSetXLogInsertAllowed(void);
     898             : static void CreateEndOfRecoveryRecord(void);
     899             : static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
     900             : static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
     901             : static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
     902             : 
     903             : static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
     904             : static bool XLogCheckpointNeeded(XLogSegNo new_segno);
     905             : static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
     906             : static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
     907             :                                    bool find_free, XLogSegNo max_segno,
     908             :                                    bool use_lock);
     909             : static int  XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
     910             :                          XLogSource source, bool notfoundOk);
     911             : static int  XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source);
     912             : static int  XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
     913             :                          int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
     914             : static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
     915             :                                         bool fetching_ckpt, XLogRecPtr tliRecPtr);
     916             : static int  emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
     917             : static void XLogFileClose(void);
     918             : static void PreallocXlogFiles(XLogRecPtr endptr);
     919             : static void RemoveTempXlogFiles(void);
     920             : static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr);
     921             : static void RemoveXlogFile(const char *segname, XLogRecPtr lastredoptr, XLogRecPtr endptr);
     922             : static void UpdateLastRemovedPtr(char *filename);
     923             : static void ValidateXLOGDirectoryStructure(void);
     924             : static void CleanupBackupHistory(void);
     925             : static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
     926             : static XLogRecord *ReadRecord(XLogReaderState *xlogreader,
     927             :                               int emode, bool fetching_ckpt);
     928             : static void CheckRecoveryConsistency(void);
     929             : static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
     930             :                                         XLogRecPtr RecPtr, int whichChkpt, bool report);
     931             : static bool rescanLatestTimeLine(void);
     932             : static void InitControlFile(uint64 sysidentifier);
     933             : static void WriteControlFile(void);
     934             : static void ReadControlFile(void);
     935             : static char *str_time(pg_time_t tnow);
     936             : static void SetPromoteIsTriggered(void);
     937             : static bool CheckForStandbyTrigger(void);
     938             : 
     939             : #ifdef WAL_DEBUG
     940             : static void xlog_outrec(StringInfo buf, XLogReaderState *record);
     941             : #endif
     942             : static void xlog_outdesc(StringInfo buf, XLogReaderState *record);
     943             : static void pg_start_backup_callback(int code, Datum arg);
     944             : static void pg_stop_backup_callback(int code, Datum arg);
     945             : static bool read_backup_label(XLogRecPtr *checkPointLoc,
     946             :                               bool *backupEndRequired, bool *backupFromStandby);
     947             : static bool read_tablespace_map(List **tablespaces);
     948             : 
     949             : static void rm_redo_error_callback(void *arg);
     950             : static int  get_sync_bit(int method);
     951             : 
     952             : static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
     953             :                                 XLogRecData *rdata,
     954             :                                 XLogRecPtr StartPos, XLogRecPtr EndPos);
     955             : static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
     956             :                                       XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
     957             : static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
     958             :                               XLogRecPtr *PrevPtr);
     959             : static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
     960             : static char *GetXLogBuffer(XLogRecPtr ptr);
     961             : static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
     962             : static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
     963             : static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
     964             : static void checkXLogConsistency(XLogReaderState *record);
     965             : /* WAL insertion lock primitives */
     966             : static void WALInsertLockAcquire(void);
     967             : static void WALInsertLockAcquireExclusive(void);
     968             : static void WALInsertLockRelease(void);
     969             : static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
     970             : 
     971             : /*
     972             :  * Insert an XLOG record represented by an already-constructed chain of data
     973             :  * chunks.  This is a low-level routine; to construct the WAL record header
     974             :  * and data, use the higher-level routines in xloginsert.c.
     975             :  *
     976             :  * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
     977             :  * WAL record applies to, that were not included in the record as full page
     978             :  * images.  If fpw_lsn <= RedoRecPtr, the function does not perform the
     979             :  * insertion and returns InvalidXLogRecPtr.  The caller can then recalculate
     980             :  * which pages need a full-page image, and retry.  If fpw_lsn is invalid, the
     981             :  * record is always inserted.
     982             :  *
     983             :  * 'flags' gives more in-depth control on the record being inserted. See
     984             :  * XLogSetRecordFlags() for details.
     985             :  *
     986             :  * The first XLogRecData in the chain must be for the record header, and its
     987             :  * data must be MAXALIGNed.  XLogInsertRecord fills in the xl_prev and
     988             :  * xl_crc fields in the header, the rest of the header must already be filled
     989             :  * by the caller.
     990             :  *
     991             :  * Returns XLOG pointer to end of record (beginning of next record).
     992             :  * This can be used as LSN for data pages affected by the logged action.
     993             :  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
     994             :  * before the data page can be written out.  This implements the basic
     995             :  * WAL rule "write the log before the data".)
     996             :  */
     997             : XLogRecPtr
     998    29389778 : XLogInsertRecord(XLogRecData *rdata,
     999             :                  XLogRecPtr fpw_lsn,
    1000             :                  uint8 flags,
    1001             :                  int num_fpi)
    1002             : {
    1003    29389778 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    1004             :     pg_crc32c   rdata_crc;
    1005             :     bool        inserted;
    1006    29389778 :     XLogRecord *rechdr = (XLogRecord *) rdata->data;
    1007    29389778 :     uint8       info = rechdr->xl_info & ~XLR_INFO_MASK;
    1008    29389778 :     bool        isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
    1009             :                                info == XLOG_SWITCH);
    1010             :     XLogRecPtr  StartPos;
    1011             :     XLogRecPtr  EndPos;
    1012    29389778 :     bool        prevDoPageWrites = doPageWrites;
    1013             : 
    1014             :     /* we assume that all of the record header is in the first chunk */
    1015             :     Assert(rdata->len >= SizeOfXLogRecord);
    1016             : 
    1017             :     /* cross-check on whether we should be here or not */
    1018    29389778 :     if (!XLogInsertAllowed())
    1019           0 :         elog(ERROR, "cannot make new WAL entries during recovery");
    1020             : 
    1021             :     /*----------
    1022             :      *
    1023             :      * We have now done all the preparatory work we can without holding a
    1024             :      * lock or modifying shared state. From here on, inserting the new WAL
    1025             :      * record to the shared WAL buffer cache is a two-step process:
    1026             :      *
    1027             :      * 1. Reserve the right amount of space from the WAL. The current head of
    1028             :      *    reserved space is kept in Insert->CurrBytePos, and is protected by
    1029             :      *    insertpos_lck.
    1030             :      *
    1031             :      * 2. Copy the record to the reserved WAL space. This involves finding the
    1032             :      *    correct WAL buffer containing the reserved space, and copying the
    1033             :      *    record in place. This can be done concurrently in multiple processes.
    1034             :      *
    1035             :      * To keep track of which insertions are still in-progress, each concurrent
    1036             :      * inserter acquires an insertion lock. In addition to just indicating that
    1037             :      * an insertion is in progress, the lock tells others how far the inserter
    1038             :      * has progressed. There is a small fixed number of insertion locks,
    1039             :      * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
    1040             :      * boundary, it updates the value stored in the lock to how far it has
    1041             :      * inserted, to allow the previous buffer to be flushed.
    1042             :      *
    1043             :      * Holding onto an insertion lock also protects RedoRecPtr and
    1044             :      * fullPageWrites from changing until the insertion is finished.
    1045             :      *
    1046             :      * Step 2 can usually be done completely in parallel. If the required WAL
    1047             :      * page is not initialized yet, you have to grab WALBufMappingLock to
    1048             :      * initialize it, but the WAL writer tries to do that ahead of insertions
    1049             :      * to keep that from happening in the critical path.
    1050             :      *
    1051             :      *----------
    1052             :      */
    1053    29389778 :     START_CRIT_SECTION();
    1054    29389778 :     if (isLogSwitch)
    1055         344 :         WALInsertLockAcquireExclusive();
    1056             :     else
    1057    29389434 :         WALInsertLockAcquire();
    1058             : 
    1059             :     /*
    1060             :      * Check to see if my copy of RedoRecPtr is out of date. If so, may have
    1061             :      * to go back and have the caller recompute everything. This can only
    1062             :      * happen just after a checkpoint, so it's better to be slow in this case
    1063             :      * and fast otherwise.
    1064             :      *
    1065             :      * Also check to see if fullPageWrites or forcePageWrites was just turned
    1066             :      * on; if we weren't already doing full-page writes then go back and
    1067             :      * recompute.
    1068             :      *
    1069             :      * If we aren't doing full-page writes then RedoRecPtr doesn't actually
    1070             :      * affect the contents of the XLOG record, so we'll update our local copy
    1071             :      * but not force a recomputation.  (If doPageWrites was just turned off,
    1072             :      * we could recompute the record without full pages, but we choose not to
    1073             :      * bother.)
    1074             :      */
    1075    29389778 :     if (RedoRecPtr != Insert->RedoRecPtr)
    1076             :     {
    1077             :         Assert(RedoRecPtr < Insert->RedoRecPtr);
    1078         728 :         RedoRecPtr = Insert->RedoRecPtr;
    1079             :     }
    1080    29389778 :     doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
    1081             : 
    1082    29389778 :     if (doPageWrites &&
    1083    29233508 :         (!prevDoPageWrites ||
    1084    28206486 :          (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr)))
    1085             :     {
    1086             :         /*
    1087             :          * Oops, some buffer now needs to be backed up that the caller didn't
    1088             :          * back up.  Start over.
    1089             :          */
    1090          30 :         WALInsertLockRelease();
    1091          30 :         END_CRIT_SECTION();
    1092          30 :         return InvalidXLogRecPtr;
    1093             :     }
    1094             : 
    1095             :     /*
    1096             :      * Reserve space for the record in the WAL. This also sets the xl_prev
    1097             :      * pointer.
    1098             :      */
    1099    29389748 :     if (isLogSwitch)
    1100         344 :         inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
    1101             :     else
    1102             :     {
    1103    29389404 :         ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
    1104             :                                   &rechdr->xl_prev);
    1105    29389404 :         inserted = true;
    1106             :     }
    1107             : 
    1108    29389748 :     if (inserted)
    1109             :     {
    1110             :         /*
    1111             :          * Now that xl_prev has been filled in, calculate CRC of the record
    1112             :          * header.
    1113             :          */
    1114    29389694 :         rdata_crc = rechdr->xl_crc;
    1115    29389694 :         COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
    1116    29389694 :         FIN_CRC32C(rdata_crc);
    1117    29389694 :         rechdr->xl_crc = rdata_crc;
    1118             : 
    1119             :         /*
    1120             :          * All the record data, including the header, is now ready to be
    1121             :          * inserted. Copy the record in the space reserved.
    1122             :          */
    1123    29389694 :         CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
    1124             :                             StartPos, EndPos);
    1125             : 
    1126             :         /*
    1127             :          * Unless record is flagged as not important, update LSN of last
    1128             :          * important record in the current slot. When holding all locks, just
    1129             :          * update the first one.
    1130             :          */
    1131    29389694 :         if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
    1132             :         {
    1133    29241236 :             int         lockno = holdingAllLocks ? 0 : MyLockNo;
    1134             : 
    1135    29241236 :             WALInsertLocks[lockno].l.lastImportantAt = StartPos;
    1136             :         }
    1137             :     }
    1138             :     else
    1139             :     {
    1140             :         /*
    1141             :          * This was an xlog-switch record, but the current insert location was
    1142             :          * already exactly at the beginning of a segment, so there was no need
    1143             :          * to do anything.
    1144             :          */
    1145             :     }
    1146             : 
    1147             :     /*
    1148             :      * Done! Let others know that we're finished.
    1149             :      */
    1150    29389748 :     WALInsertLockRelease();
    1151             : 
    1152    29389748 :     MarkCurrentTransactionIdLoggedIfAny();
    1153             : 
    1154    29389748 :     END_CRIT_SECTION();
    1155             : 
    1156             :     /*
    1157             :      * Update shared LogwrtRqst.Write, if we crossed a page boundary.
    1158             :      */
    1159    29389748 :     if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
    1160             :     {
    1161      469472 :         SpinLockAcquire(&XLogCtl->info_lck);
    1162             :         /* advance global request to include new block(s) */
    1163      469472 :         if (XLogCtl->LogwrtRqst.Write < EndPos)
    1164      469408 :             XLogCtl->LogwrtRqst.Write = EndPos;
    1165             :         /* update local result copy while I have the chance */
    1166      469472 :         LogwrtResult = XLogCtl->LogwrtResult;
    1167      469472 :         SpinLockRelease(&XLogCtl->info_lck);
    1168             :     }
    1169             : 
    1170             :     /*
    1171             :      * If this was an XLOG_SWITCH record, flush the record and the empty
    1172             :      * padding space that fills the rest of the segment, and perform
    1173             :      * end-of-segment actions (eg, notifying archiver).
    1174             :      */
    1175    29389748 :     if (isLogSwitch)
    1176             :     {
    1177             :         TRACE_POSTGRESQL_WAL_SWITCH();
    1178         344 :         XLogFlush(EndPos);
    1179             : 
    1180             :         /*
    1181             :          * Even though we reserved the rest of the segment for us, which is
    1182             :          * reflected in EndPos, we return a pointer to just the end of the
    1183             :          * xlog-switch record.
    1184             :          */
    1185         344 :         if (inserted)
    1186             :         {
    1187         290 :             EndPos = StartPos + SizeOfXLogRecord;
    1188         290 :             if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
    1189             :             {
    1190           2 :                 uint64      offset = XLogSegmentOffset(EndPos, wal_segment_size);
    1191             : 
    1192           2 :                 if (offset == EndPos % XLOG_BLCKSZ)
    1193           0 :                     EndPos += SizeOfXLogLongPHD;
    1194             :                 else
    1195           2 :                     EndPos += SizeOfXLogShortPHD;
    1196             :             }
    1197             :         }
    1198             :     }
    1199             : 
    1200             : #ifdef WAL_DEBUG
    1201             :     if (XLOG_DEBUG)
    1202             :     {
    1203             :         static XLogReaderState *debug_reader = NULL;
    1204             :         StringInfoData buf;
    1205             :         StringInfoData recordBuf;
    1206             :         char       *errormsg = NULL;
    1207             :         MemoryContext oldCxt;
    1208             : 
    1209             :         oldCxt = MemoryContextSwitchTo(walDebugCxt);
    1210             : 
    1211             :         initStringInfo(&buf);
    1212             :         appendStringInfo(&buf, "INSERT @ %X/%X: ",
    1213             :                          (uint32) (EndPos >> 32), (uint32) EndPos);
    1214             : 
    1215             :         /*
    1216             :          * We have to piece together the WAL record data from the XLogRecData
    1217             :          * entries, so that we can pass it to the rm_desc function as one
    1218             :          * contiguous chunk.
    1219             :          */
    1220             :         initStringInfo(&recordBuf);
    1221             :         for (; rdata != NULL; rdata = rdata->next)
    1222             :             appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);
    1223             : 
    1224             :         if (!debug_reader)
    1225             :             debug_reader = XLogReaderAllocate(wal_segment_size, NULL,
    1226             :                                               XL_ROUTINE(), NULL);
    1227             : 
    1228             :         if (!debug_reader)
    1229             :         {
    1230             :             appendStringInfoString(&buf, "error decoding record: out of memory");
    1231             :         }
    1232             :         else if (!DecodeXLogRecord(debug_reader, (XLogRecord *) recordBuf.data,
    1233             :                                    &errormsg))
    1234             :         {
    1235             :             appendStringInfo(&buf, "error decoding record: %s",
    1236             :                              errormsg ? errormsg : "no error message");
    1237             :         }
    1238             :         else
    1239             :         {
    1240             :             appendStringInfoString(&buf, " - ");
    1241             :             xlog_outdesc(&buf, debug_reader);
    1242             :         }
    1243             :         elog(LOG, "%s", buf.data);
    1244             : 
    1245             :         pfree(buf.data);
    1246             :         pfree(recordBuf.data);
    1247             :         MemoryContextSwitchTo(oldCxt);
    1248             :     }
    1249             : #endif
    1250             : 
    1251             :     /*
    1252             :      * Update our global variables
    1253             :      */
    1254    29389748 :     ProcLastRecPtr = StartPos;
    1255    29389748 :     XactLastRecEnd = EndPos;
    1256             : 
    1257             :     /* Report WAL traffic to the instrumentation. */
    1258    29389748 :     if (inserted)
    1259             :     {
    1260    29389694 :         pgWalUsage.wal_bytes += rechdr->xl_tot_len;
    1261    29389694 :         pgWalUsage.wal_records++;
    1262    29389694 :         pgWalUsage.wal_fpi += num_fpi;
    1263             :     }
    1264             : 
    1265    29389748 :     return EndPos;
    1266             : }
    1267             : 
    1268             : /*
    1269             :  * Reserves the right amount of space for a record of given size from the WAL.
    1270             :  * *StartPos is set to the beginning of the reserved section, *EndPos to
    1271             :  * its end+1. *PrevPtr is set to the beginning of the previous record; it is
    1272             :  * used to set the xl_prev of this record.
    1273             :  *
    1274             :  * This is the performance critical part of XLogInsert that must be serialized
    1275             :  * across backends. The rest can happen mostly in parallel. Try to keep this
    1276             :  * section as short as possible, insertpos_lck can be heavily contended on a
    1277             :  * busy system.
    1278             :  *
    1279             :  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
    1280             :  * where we actually copy the record to the reserved space.
    1281             :  */
    1282             : static void
    1283    29389404 : ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
    1284             :                           XLogRecPtr *PrevPtr)
    1285             : {
    1286    29389404 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    1287             :     uint64      startbytepos;
    1288             :     uint64      endbytepos;
    1289             :     uint64      prevbytepos;
    1290             : 
    1291    29389404 :     size = MAXALIGN(size);
    1292             : 
    1293             :     /* All (non xlog-switch) records should contain data. */
    1294             :     Assert(size > SizeOfXLogRecord);
    1295             : 
    1296             :     /*
    1297             :      * The duration the spinlock needs to be held is minimized by minimizing
    1298             :      * the calculations that have to be done while holding the lock. The
    1299             :      * current tip of reserved WAL is kept in CurrBytePos, as a byte position
    1300             :      * that only counts "usable" bytes in WAL, that is, it excludes all WAL
    1301             :      * page headers. The mapping between "usable" byte positions and physical
    1302             :      * positions (XLogRecPtrs) can be done outside the locked region, and
    1303             :      * because the usable byte position doesn't include any headers, reserving
    1304             :      * X bytes from WAL is almost as simple as "CurrBytePos += X".
    1305             :      */
    1306    29389404 :     SpinLockAcquire(&Insert->insertpos_lck);
    1307             : 
    1308    29389404 :     startbytepos = Insert->CurrBytePos;
    1309    29389404 :     endbytepos = startbytepos + size;
    1310    29389404 :     prevbytepos = Insert->PrevBytePos;
    1311    29389404 :     Insert->CurrBytePos = endbytepos;
    1312    29389404 :     Insert->PrevBytePos = startbytepos;
    1313             : 
    1314    29389404 :     SpinLockRelease(&Insert->insertpos_lck);
    1315             : 
    1316    29389404 :     *StartPos = XLogBytePosToRecPtr(startbytepos);
    1317    29389404 :     *EndPos = XLogBytePosToEndRecPtr(endbytepos);
    1318    29389404 :     *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
    1319             : 
    1320             :     /*
    1321             :      * Check that the conversions between "usable byte positions" and
    1322             :      * XLogRecPtrs work consistently in both directions.
    1323             :      */
    1324             :     Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
    1325             :     Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
    1326             :     Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
    1327    29389404 : }
    1328             : 
    1329             : /*
    1330             :  * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
    1331             :  *
    1332             :  * A log-switch record is handled slightly differently. The rest of the
    1333             :  * segment will be reserved for this insertion, as indicated by the returned
    1334             :  * *EndPos value. However, if we are already at the beginning of the current
    1335             :  * segment, *StartPos and *EndPos are set to the current location without
    1336             :  * reserving any space, and the function returns false.
    1337             : */
    1338             : static bool
    1339         344 : ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
    1340             : {
    1341         344 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    1342             :     uint64      startbytepos;
    1343             :     uint64      endbytepos;
    1344             :     uint64      prevbytepos;
    1345         344 :     uint32      size = MAXALIGN(SizeOfXLogRecord);
    1346             :     XLogRecPtr  ptr;
    1347             :     uint32      segleft;
    1348             : 
    1349             :     /*
    1350             :      * These calculations are a bit heavy-weight to be done while holding a
    1351             :      * spinlock, but since we're holding all the WAL insertion locks, there
    1352             :      * are no other inserters competing for it. GetXLogInsertRecPtr() does
    1353             :      * compete for it, but that's not called very frequently.
    1354             :      */
    1355         344 :     SpinLockAcquire(&Insert->insertpos_lck);
    1356             : 
    1357         344 :     startbytepos = Insert->CurrBytePos;
    1358             : 
    1359         344 :     ptr = XLogBytePosToEndRecPtr(startbytepos);
    1360         344 :     if (XLogSegmentOffset(ptr, wal_segment_size) == 0)
    1361             :     {
    1362          54 :         SpinLockRelease(&Insert->insertpos_lck);
    1363          54 :         *EndPos = *StartPos = ptr;
    1364          54 :         return false;
    1365             :     }
    1366             : 
    1367         290 :     endbytepos = startbytepos + size;
    1368         290 :     prevbytepos = Insert->PrevBytePos;
    1369             : 
    1370         290 :     *StartPos = XLogBytePosToRecPtr(startbytepos);
    1371         290 :     *EndPos = XLogBytePosToEndRecPtr(endbytepos);
    1372             : 
    1373         290 :     segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size);
    1374         290 :     if (segleft != wal_segment_size)
    1375             :     {
    1376             :         /* consume the rest of the segment */
    1377         290 :         *EndPos += segleft;
    1378         290 :         endbytepos = XLogRecPtrToBytePos(*EndPos);
    1379             :     }
    1380         290 :     Insert->CurrBytePos = endbytepos;
    1381         290 :     Insert->PrevBytePos = startbytepos;
    1382             : 
    1383         290 :     SpinLockRelease(&Insert->insertpos_lck);
    1384             : 
    1385         290 :     *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
    1386             : 
    1387             :     Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0);
    1388             :     Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
    1389             :     Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
    1390             :     Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
    1391             : 
    1392         290 :     return true;
    1393             : }
    1394             : 
    1395             : /*
    1396             :  * Checks whether the current buffer page and backup page stored in the
    1397             :  * WAL record are consistent or not. Before comparing the two pages, a
    1398             :  * masking can be applied to the pages to ignore certain areas like hint bits,
    1399             :  * unused space between pd_lower and pd_upper among other things. This
    1400             :  * function should be called once WAL replay has been completed for a
    1401             :  * given record.
    1402             :  */
    1403             : static void
    1404           0 : checkXLogConsistency(XLogReaderState *record)
    1405             : {
    1406           0 :     RmgrId      rmid = XLogRecGetRmid(record);
    1407             :     RelFileNode rnode;
    1408             :     ForkNumber  forknum;
    1409             :     BlockNumber blkno;
    1410             :     int         block_id;
    1411             : 
    1412             :     /* Records with no backup blocks have no need for consistency checks. */
    1413           0 :     if (!XLogRecHasAnyBlockRefs(record))
    1414           0 :         return;
    1415             : 
    1416             :     Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
    1417             : 
    1418           0 :     for (block_id = 0; block_id <= record->max_block_id; block_id++)
    1419             :     {
    1420             :         Buffer      buf;
    1421             :         Page        page;
    1422             : 
    1423           0 :         if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
    1424             :         {
    1425             :             /*
    1426             :              * WAL record doesn't contain a block reference with the given id.
    1427             :              * Do nothing.
    1428             :              */
    1429           0 :             continue;
    1430             :         }
    1431             : 
    1432             :         Assert(XLogRecHasBlockImage(record, block_id));
    1433             : 
    1434           0 :         if (XLogRecBlockImageApply(record, block_id))
    1435             :         {
    1436             :             /*
    1437             :              * WAL record has already applied the page, so bypass the
    1438             :              * consistency check as that would result in comparing the full
    1439             :              * page stored in the record with itself.
    1440             :              */
    1441           0 :             continue;
    1442             :         }
    1443             : 
    1444             :         /*
    1445             :          * Read the contents from the current buffer and store it in a
    1446             :          * temporary page.
    1447             :          */
    1448           0 :         buf = XLogReadBufferExtended(rnode, forknum, blkno,
    1449             :                                      RBM_NORMAL_NO_LOG);
    1450           0 :         if (!BufferIsValid(buf))
    1451           0 :             continue;
    1452             : 
    1453           0 :         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
    1454           0 :         page = BufferGetPage(buf);
    1455             : 
    1456             :         /*
    1457             :          * Take a copy of the local page where WAL has been applied to have a
    1458             :          * comparison base before masking it...
    1459             :          */
    1460           0 :         memcpy(replay_image_masked, page, BLCKSZ);
    1461             : 
    1462             :         /* No need for this page anymore now that a copy is in. */
    1463           0 :         UnlockReleaseBuffer(buf);
    1464             : 
    1465             :         /*
    1466             :          * If the block LSN is already ahead of this WAL record, we can't
    1467             :          * expect contents to match.  This can happen if recovery is
    1468             :          * restarted.
    1469             :          */
    1470           0 :         if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
    1471           0 :             continue;
    1472             : 
    1473             :         /*
    1474             :          * Read the contents from the backup copy, stored in WAL record and
    1475             :          * store it in a temporary page. There is no need to allocate a new
    1476             :          * page here, a local buffer is fine to hold its contents and a mask
    1477             :          * can be directly applied on it.
    1478             :          */
    1479           0 :         if (!RestoreBlockImage(record, block_id, master_image_masked))
    1480           0 :             elog(ERROR, "failed to restore block image");
    1481             : 
    1482             :         /*
    1483             :          * If masking function is defined, mask both the master and replay
    1484             :          * images
    1485             :          */
    1486           0 :         if (RmgrTable[rmid].rm_mask != NULL)
    1487             :         {
    1488           0 :             RmgrTable[rmid].rm_mask(replay_image_masked, blkno);
    1489           0 :             RmgrTable[rmid].rm_mask(master_image_masked, blkno);
    1490             :         }
    1491             : 
    1492             :         /* Time to compare the master and replay images. */
    1493           0 :         if (memcmp(replay_image_masked, master_image_masked, BLCKSZ) != 0)
    1494             :         {
    1495           0 :             elog(FATAL,
    1496             :                  "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
    1497             :                  rnode.spcNode, rnode.dbNode, rnode.relNode,
    1498             :                  forknum, blkno);
    1499             :         }
    1500             :     }
    1501             : }
    1502             : 
    1503             : /*
    1504             :  * Subroutine of XLogInsertRecord.  Copies a WAL record to an already-reserved
    1505             :  * area in the WAL.
    1506             :  */
    1507             : static void
    1508    29389694 : CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
    1509             :                     XLogRecPtr StartPos, XLogRecPtr EndPos)
    1510             : {
    1511             :     char       *currpos;
    1512             :     int         freespace;
    1513             :     int         written;
    1514             :     XLogRecPtr  CurrPos;
    1515             :     XLogPageHeader pagehdr;
    1516             : 
    1517             :     /*
    1518             :      * Get a pointer to the right place in the right WAL buffer to start
    1519             :      * inserting to.
    1520             :      */
    1521    29389694 :     CurrPos = StartPos;
    1522    29389694 :     currpos = GetXLogBuffer(CurrPos);
    1523    29389694 :     freespace = INSERT_FREESPACE(CurrPos);
    1524             : 
    1525             :     /*
    1526             :      * there should be enough space for at least the first field (xl_tot_len)
    1527             :      * on this page.
    1528             :      */
    1529             :     Assert(freespace >= sizeof(uint32));
    1530             : 
    1531             :     /* Copy record data */
    1532    29389694 :     written = 0;
    1533   131594392 :     while (rdata != NULL)
    1534             :     {
    1535   102204698 :         char       *rdata_data = rdata->data;
    1536   102204698 :         int         rdata_len = rdata->len;
    1537             : 
    1538   102668282 :         while (rdata_len > freespace)
    1539             :         {
    1540             :             /*
    1541             :              * Write what fits on this page, and continue on the next page.
    1542             :              */
    1543             :             Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
    1544      463584 :             memcpy(currpos, rdata_data, freespace);
    1545      463584 :             rdata_data += freespace;
    1546      463584 :             rdata_len -= freespace;
    1547      463584 :             written += freespace;
    1548      463584 :             CurrPos += freespace;
    1549             : 
    1550             :             /*
    1551             :              * Get pointer to beginning of next page, and set the xlp_rem_len
    1552             :              * in the page header. Set XLP_FIRST_IS_CONTRECORD.
    1553             :              *
    1554             :              * It's safe to set the contrecord flag and xlp_rem_len without a
    1555             :              * lock on the page. All the other flags were already set when the
    1556             :              * page was initialized, in AdvanceXLInsertBuffer, and we're the
    1557             :              * only backend that needs to set the contrecord flag.
    1558             :              */
    1559      463584 :             currpos = GetXLogBuffer(CurrPos);
    1560      463584 :             pagehdr = (XLogPageHeader) currpos;
    1561      463584 :             pagehdr->xlp_rem_len = write_len - written;
    1562      463584 :             pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
    1563             : 
    1564             :             /* skip over the page header */
    1565      463584 :             if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
    1566             :             {
    1567         566 :                 CurrPos += SizeOfXLogLongPHD;
    1568         566 :                 currpos += SizeOfXLogLongPHD;
    1569             :             }
    1570             :             else
    1571             :             {
    1572      463018 :                 CurrPos += SizeOfXLogShortPHD;
    1573      463018 :                 currpos += SizeOfXLogShortPHD;
    1574             :             }
    1575      463584 :             freespace = INSERT_FREESPACE(CurrPos);
    1576             :         }
    1577             : 
    1578             :         Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
    1579   102204698 :         memcpy(currpos, rdata_data, rdata_len);
    1580   102204698 :         currpos += rdata_len;
    1581   102204698 :         CurrPos += rdata_len;
    1582   102204698 :         freespace -= rdata_len;
    1583   102204698 :         written += rdata_len;
    1584             : 
    1585   102204698 :         rdata = rdata->next;
    1586             :     }
    1587             :     Assert(written == write_len);
    1588             : 
    1589             :     /*
    1590             :      * If this was an xlog-switch, it's not enough to write the switch record,
    1591             :      * we also have to consume all the remaining space in the WAL segment.  We
    1592             :      * have already reserved that space, but we need to actually fill it.
    1593             :      */
    1594    29389984 :     if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0)
    1595             :     {
    1596             :         /* An xlog-switch record doesn't contain any data besides the header */
    1597             :         Assert(write_len == SizeOfXLogRecord);
    1598             : 
    1599             :         /* Assert that we did reserve the right amount of space */
    1600             :         Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0);
    1601             : 
    1602             :         /* Use up all the remaining space on the current page */
    1603         290 :         CurrPos += freespace;
    1604             : 
    1605             :         /*
    1606             :          * Cause all remaining pages in the segment to be flushed, leaving the
    1607             :          * XLog position where it should be, at the start of the next segment.
    1608             :          * We do this one page at a time, to make sure we don't deadlock
    1609             :          * against ourselves if wal_buffers < wal_segment_size.
    1610             :          */
    1611      434484 :         while (CurrPos < EndPos)
    1612             :         {
    1613             :             /*
    1614             :              * The minimal action to flush the page would be to call
    1615             :              * WALInsertLockUpdateInsertingAt(CurrPos) followed by
    1616             :              * AdvanceXLInsertBuffer(...).  The page would be left initialized
    1617             :              * mostly to zeros, except for the page header (always the short
    1618             :              * variant, as this is never a segment's first page).
    1619             :              *
    1620             :              * The large vistas of zeros are good for compressibility, but the
    1621             :              * headers interrupting them every XLOG_BLCKSZ (with values that
    1622             :              * differ from page to page) are not.  The effect varies with
    1623             :              * compression tool, but bzip2 for instance compresses about an
    1624             :              * order of magnitude worse if those headers are left in place.
    1625             :              *
    1626             :              * Rather than complicating AdvanceXLInsertBuffer itself (which is
    1627             :              * called in heavily-loaded circumstances as well as this lightly-
    1628             :              * loaded one) with variant behavior, we just use GetXLogBuffer
    1629             :              * (which itself calls the two methods we need) to get the pointer
    1630             :              * and zero most of the page.  Then we just zero the page header.
    1631             :              */
    1632      434194 :             currpos = GetXLogBuffer(CurrPos);
    1633     1736776 :             MemSet(currpos, 0, SizeOfXLogShortPHD);
    1634             : 
    1635      434194 :             CurrPos += XLOG_BLCKSZ;
    1636             :         }
    1637             :     }
    1638             :     else
    1639             :     {
    1640             :         /* Align the end position, so that the next record starts aligned */
    1641    29389404 :         CurrPos = MAXALIGN64(CurrPos);
    1642             :     }
    1643             : 
    1644    29389694 :     if (CurrPos != EndPos)
    1645           0 :         elog(PANIC, "space reserved for WAL record does not match what was written");
    1646    29389694 : }
    1647             : 
    1648             : /*
    1649             :  * Acquire a WAL insertion lock, for inserting to WAL.
    1650             :  */
    1651             : static void
    1652    29389434 : WALInsertLockAcquire(void)
    1653             : {
    1654             :     bool        immed;
    1655             : 
    1656             :     /*
    1657             :      * It doesn't matter which of the WAL insertion locks we acquire, so try
    1658             :      * the one we used last time.  If the system isn't particularly busy, it's
    1659             :      * a good bet that it's still available, and it's good to have some
    1660             :      * affinity to a particular lock so that you don't unnecessarily bounce
    1661             :      * cache lines between processes when there's no contention.
    1662             :      *
    1663             :      * If this is the first time through in this backend, pick a lock
    1664             :      * (semi-)randomly.  This allows the locks to be used evenly if you have a
    1665             :      * lot of very short connections.
    1666             :      */
    1667             :     static int  lockToTry = -1;
    1668             : 
    1669    29389434 :     if (lockToTry == -1)
    1670        6800 :         lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
    1671    29389434 :     MyLockNo = lockToTry;
    1672             : 
    1673             :     /*
    1674             :      * The insertingAt value is initially set to 0, as we don't know our
    1675             :      * insert location yet.
    1676             :      */
    1677    29389434 :     immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
    1678    29389434 :     if (!immed)
    1679             :     {
    1680             :         /*
    1681             :          * If we couldn't get the lock immediately, try another lock next
    1682             :          * time.  On a system with more insertion locks than concurrent
    1683             :          * inserters, this causes all the inserters to eventually migrate to a
    1684             :          * lock that no-one else is using.  On a system with more inserters
    1685             :          * than locks, it still helps to distribute the inserters evenly
    1686             :          * across the locks.
    1687             :          */
    1688         228 :         lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
    1689             :     }
    1690    29389434 : }
    1691             : 
    1692             : /*
    1693             :  * Acquire all WAL insertion locks, to prevent other backends from inserting
    1694             :  * to WAL.
    1695             :  */
    1696             : static void
    1697        4062 : WALInsertLockAcquireExclusive(void)
    1698             : {
    1699             :     int         i;
    1700             : 
    1701             :     /*
    1702             :      * When holding all the locks, all but the last lock's insertingAt
    1703             :      * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
    1704             :      * XLogRecPtr value, to make sure that no-one blocks waiting on those.
    1705             :      */
    1706       32496 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
    1707             :     {
    1708       28434 :         LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
    1709       28434 :         LWLockUpdateVar(&WALInsertLocks[i].l.lock,
    1710       28434 :                         &WALInsertLocks[i].l.insertingAt,
    1711             :                         PG_UINT64_MAX);
    1712             :     }
    1713             :     /* Variable value reset to 0 at release */
    1714        4062 :     LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
    1715             : 
    1716        4062 :     holdingAllLocks = true;
    1717        4062 : }
    1718             : 
    1719             : /*
    1720             :  * Release our insertion lock (or locks, if we're holding them all).
    1721             :  *
    1722             :  * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
    1723             :  * next time the lock is acquired.
    1724             :  */
    1725             : static void
    1726    29393496 : WALInsertLockRelease(void)
    1727             : {
    1728    29393496 :     if (holdingAllLocks)
    1729             :     {
    1730             :         int         i;
    1731             : 
    1732       36558 :         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    1733       32496 :             LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
    1734       32496 :                                   &WALInsertLocks[i].l.insertingAt,
    1735             :                                   0);
    1736             : 
    1737        4062 :         holdingAllLocks = false;
    1738             :     }
    1739             :     else
    1740             :     {
    1741    29389434 :         LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
    1742    29389434 :                               &WALInsertLocks[MyLockNo].l.insertingAt,
    1743             :                               0);
    1744             :     }
    1745    29393496 : }
    1746             : 
    1747             : /*
    1748             :  * Update our insertingAt value, to let others know that we've finished
    1749             :  * inserting up to that point.
    1750             :  */
    1751             : static void
    1752      729194 : WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
    1753             : {
    1754      729194 :     if (holdingAllLocks)
    1755             :     {
    1756             :         /*
    1757             :          * We use the last lock to mark our actual position, see comments in
    1758             :          * WALInsertLockAcquireExclusive.
    1759             :          */
    1760      429054 :         LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
    1761      429054 :                         &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
    1762             :                         insertingAt);
    1763             :     }
    1764             :     else
    1765      300140 :         LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
    1766      300140 :                         &WALInsertLocks[MyLockNo].l.insertingAt,
    1767             :                         insertingAt);
    1768      729194 : }
    1769             : 
    1770             : /*
    1771             :  * Wait for any WAL insertions < upto to finish.
    1772             :  *
    1773             :  * Returns the location of the oldest insertion that is still in-progress.
    1774             :  * Any WAL prior to that point has been fully copied into WAL buffers, and
    1775             :  * can be flushed out to disk. Because this waits for any insertions older
    1776             :  * than 'upto' to finish, the return value is normally >= 'upto'.
    1777             :  *
    1778             :  * Note: When you are about to write out WAL, you must call this function
    1779             :  * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
    1780             :  * need to wait for an insertion to finish (or at least advance to next
    1781             :  * uninitialized page), and the inserter might need to evict an old WAL buffer
    1782             :  * to make room for a new one, which in turn requires WALWriteLock.
    1783             :  */
    1784             : static XLogRecPtr
    1785      692660 : WaitXLogInsertionsToFinish(XLogRecPtr upto)
    1786             : {
    1787             :     uint64      bytepos;
    1788             :     XLogRecPtr  reservedUpto;
    1789             :     XLogRecPtr  finishedUpto;
    1790      692660 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    1791             :     int         i;
    1792             : 
    1793      692660 :     if (MyProc == NULL)
    1794           0 :         elog(PANIC, "cannot wait without a PGPROC structure");
    1795             : 
    1796             :     /* Read the current insert position */
    1797      692660 :     SpinLockAcquire(&Insert->insertpos_lck);
    1798      692660 :     bytepos = Insert->CurrBytePos;
    1799      692660 :     SpinLockRelease(&Insert->insertpos_lck);
    1800      692660 :     reservedUpto = XLogBytePosToEndRecPtr(bytepos);
    1801             : 
    1802             :     /*
    1803             :      * No-one should request to flush a piece of WAL that hasn't even been
    1804             :      * reserved yet. However, it can happen if there is a block with a bogus
    1805             :      * LSN on disk, for example. XLogFlush checks for that situation and
    1806             :      * complains, but only after the flush. Here we just assume that to mean
    1807             :      * that all WAL that has been reserved needs to be finished. In this
    1808             :      * corner-case, the return value can be smaller than the 'upto' argument.
    1809             :      */
    1810      692660 :     if (upto > reservedUpto)
    1811             :     {
    1812           0 :         elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
    1813             :              (uint32) (upto >> 32), (uint32) upto,
    1814             :              (uint32) (reservedUpto >> 32), (uint32) reservedUpto);
    1815           0 :         upto = reservedUpto;
    1816             :     }
    1817             : 
    1818             :     /*
    1819             :      * Loop through all the locks, sleeping on any in-progress insert older
    1820             :      * than 'upto'.
    1821             :      *
    1822             :      * finishedUpto is our return value, indicating the point up to which all
    1823             :      * the WAL insertions have been finished. Initialize it to the head of
    1824             :      * reserved WAL, and as we iterate through the insertion locks, back it
    1825             :      * out for any insertion that's still in progress.
    1826             :      */
    1827      692660 :     finishedUpto = reservedUpto;
    1828     6233940 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    1829             :     {
    1830     5541280 :         XLogRecPtr  insertingat = InvalidXLogRecPtr;
    1831             : 
    1832             :         do
    1833             :         {
    1834             :             /*
    1835             :              * See if this insertion is in progress.  LWLockWaitForVar will
    1836             :              * wait for the lock to be released, or for the 'value' to be set
    1837             :              * by a LWLockUpdateVar call.  When a lock is initially acquired,
    1838             :              * its value is 0 (InvalidXLogRecPtr), which means that we don't
    1839             :              * know where it's inserting yet.  We will have to wait for it. If
    1840             :              * it's a small insertion, the record will most likely fit on the
    1841             :              * same page and the inserter will release the lock without ever
    1842             :              * calling LWLockUpdateVar.  But if it has to sleep, it will
    1843             :              * advertise the insertion point with LWLockUpdateVar before
    1844             :              * sleeping.
    1845             :              */
    1846     5541422 :             if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
    1847     5541422 :                                  &WALInsertLocks[i].l.insertingAt,
    1848             :                                  insertingat, &insertingat))
    1849             :             {
    1850             :                 /* the lock was free, so no insertion in progress */
    1851     2328508 :                 insertingat = InvalidXLogRecPtr;
    1852     2328508 :                 break;
    1853             :             }
    1854             : 
    1855             :             /*
    1856             :              * This insertion is still in progress. Have to wait, unless the
    1857             :              * inserter has proceeded past 'upto'.
    1858             :              */
    1859     3212914 :         } while (insertingat < upto);
    1860             : 
    1861     5541280 :         if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
    1862      420146 :             finishedUpto = insertingat;
    1863             :     }
    1864      692660 :     return finishedUpto;
    1865             : }
    1866             : 
    1867             : /*
    1868             :  * Get a pointer to the right location in the WAL buffer containing the
    1869             :  * given XLogRecPtr.
    1870             :  *
    1871             :  * If the page is not initialized yet, it is initialized. That might require
    1872             :  * evicting an old dirty buffer from the buffer cache, which means I/O.
    1873             :  *
    1874             :  * The caller must ensure that the page containing the requested location
    1875             :  * isn't evicted yet, and won't be evicted. The way to ensure that is to
    1876             :  * hold onto a WAL insertion lock with the insertingAt position set to
    1877             :  * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
    1878             :  * to evict an old page from the buffer. (This means that once you call
    1879             :  * GetXLogBuffer() with a given 'ptr', you must not access anything before
    1880             :  * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
    1881             :  * later, because older buffers might be recycled already)
    1882             :  */
    1883             : static char *
    1884    30287472 : GetXLogBuffer(XLogRecPtr ptr)
    1885             : {
    1886             :     int         idx;
    1887             :     XLogRecPtr  endptr;
    1888             :     static uint64 cachedPage = 0;
    1889             :     static char *cachedPos = NULL;
    1890             :     XLogRecPtr  expectedEndPtr;
    1891             : 
    1892             :     /*
    1893             :      * Fast path for the common case where we need to access the same page
    1894             :      * as last time.
    1895             :      */
    1896    30287472 :     if (ptr / XLOG_BLCKSZ == cachedPage)
    1897             :     {
    1898             :         Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
    1899             :         Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
    1900    29273928 :         return cachedPos + ptr % XLOG_BLCKSZ;
    1901             :     }
    1902             : 
    1903             :     /*
    1904             :      * The XLog buffer cache is organized so that a page is always loaded to a
    1905             :      * particular buffer.  That way we can easily calculate the buffer a given
    1906             :      * page must be loaded into, from the XLogRecPtr alone.
    1907             :      */
    1908     1013544 :     idx = XLogRecPtrToBufIdx(ptr);
    1909             : 
    1910             :     /*
    1911             :      * See what page is loaded in the buffer at the moment. It could be the
    1912             :      * page we're looking for, or something older. It can't be anything newer
    1913             :      * - that would imply the page we're looking for has already been written
    1914             :      * out to disk and evicted, and the caller is responsible for making sure
    1915             :      * that doesn't happen.
    1916             :      *
    1917             :      * However, we don't hold a lock while we read the value. If someone has
    1918             :      * just initialized the page, it's possible that we get a "torn read" of
    1919             :      * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
    1920             :      * that case we will see a bogus value. That's ok, we'll grab the mapping
    1921             :      * lock (in AdvanceXLInsertBuffer) and retry if we see anything other than
    1922             :      * the page we're looking for. But it means that when we do this unlocked
    1923             :      * read, we might see a value that appears to be ahead of the page we're
    1924             :      * looking for. Don't PANIC on that, until we've verified the value while
    1925             :      * holding the lock.
    1926             :      */
    1927     1013544 :     expectedEndPtr = ptr;
    1928     1013544 :     expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
    1929             : 
    1930     1013544 :     endptr = XLogCtl->xlblocks[idx];
    1931     1013544 :     if (expectedEndPtr != endptr)
    1932             :     {
    1933             :         XLogRecPtr  initializedUpto;
    1934             : 
    1935             :         /*
    1936             :          * Before calling AdvanceXLInsertBuffer(), which can block, let others
    1937             :          * know how far we're finished with inserting the record.
    1938             :          *
    1939             :          * NB: If 'ptr' points to just after the page header, advertise a
    1940             :          * position at the beginning of the page rather than 'ptr' itself. If
    1941             :          * there are no other insertions running, someone might try to flush
    1942             :          * up to our advertised location. If we advertised a position after
    1943             :          * the page header, someone might try to flush the page header, even
    1944             :          * though the page might not actually be initialized yet. As the first
    1945             :          * inserter on the page, we are effectively responsible for making
    1946             :          * sure that it's initialized, before we let insertingAt move past
    1947             :          * the page header.
    1948             :          */
    1949      729194 :         if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
    1950       14146 :             XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ)
    1951       14146 :             initializedUpto = ptr - SizeOfXLogShortPHD;
    1952      715048 :         else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
    1953         240 :                  XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ)
    1954         190 :             initializedUpto = ptr - SizeOfXLogLongPHD;
    1955             :         else
    1956      714858 :             initializedUpto = ptr;
    1957             : 
    1958      729194 :         WALInsertLockUpdateInsertingAt(initializedUpto);
    1959             : 
    1960      729194 :         AdvanceXLInsertBuffer(ptr, false);
    1961      729194 :         endptr = XLogCtl->xlblocks[idx];
    1962             : 
    1963      729194 :         if (expectedEndPtr != endptr)
    1964           0 :             elog(PANIC, "could not find WAL buffer for %X/%X",
    1965             :                  (uint32) (ptr >> 32), (uint32) ptr);
    1966             :     }
    1967             :     else
    1968             :     {
    1969             :         /*
    1970             :          * Make sure the initialization of the page is visible to us, and
    1971             :          * won't arrive later to overwrite the WAL data we write on the page.
    1972             :          */
    1973      284350 :         pg_memory_barrier();
    1974             :     }
    1975             : 
    1976             :     /*
    1977             :      * Found the buffer holding this page. Return a pointer to the right
    1978             :      * offset within the page.
    1979             :      */
    1980     1013544 :     cachedPage = ptr / XLOG_BLCKSZ;
    1981     1013544 :     cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
    1982             : 
    1983             :     Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
    1984             :     Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
    1985             : 
    1986     1013544 :     return cachedPos + ptr % XLOG_BLCKSZ;
    1987             : }
    1988             : 
    1989             : /*
    1990             :  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
    1991             :  * is the position starting from the beginning of WAL, excluding all WAL
    1992             :  * page headers.
    1993             :  */
    1994             : static XLogRecPtr
    1995    58806640 : XLogBytePosToRecPtr(uint64 bytepos)
    1996             : {
    1997             :     uint64      fullsegs;
    1998             :     uint64      fullpages;
    1999             :     uint64      bytesleft;
    2000             :     uint32      seg_offset;
    2001             :     XLogRecPtr  result;
    2002             : 
    2003    58806640 :     fullsegs = bytepos / UsableBytesInSegment;
    2004    58806640 :     bytesleft = bytepos % UsableBytesInSegment;
    2005             : 
    2006    58806640 :     if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
    2007             :     {
    2008             :         /* fits on first page of segment */
    2009      101902 :         seg_offset = bytesleft + SizeOfXLogLongPHD;
    2010             :     }
    2011             :     else
    2012             :     {
    2013             :         /* account for the first page on segment with long header */
    2014    58704738 :         seg_offset = XLOG_BLCKSZ;
    2015    58704738 :         bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
    2016             : 
    2017    58704738 :         fullpages = bytesleft / UsableBytesInPage;
    2018    58704738 :         bytesleft = bytesleft % UsableBytesInPage;
    2019             : 
    2020    58704738 :         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
    2021             :     }
    2022             : 
    2023    58806640 :     XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
    2024             : 
    2025    58806640 :     return result;
    2026             : }
    2027             : 
    2028             : /*
    2029             :  * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
    2030             :  * returns a pointer to the beginning of the page (ie. before page header),
    2031             :  * not to where the first xlog record on that page would go to. This is used
    2032             :  * when converting a pointer to the end of a record.
    2033             :  */
    2034             : static XLogRecPtr
    2035    30082698 : XLogBytePosToEndRecPtr(uint64 bytepos)
    2036             : {
    2037             :     uint64      fullsegs;
    2038             :     uint64      fullpages;
    2039             :     uint64      bytesleft;
    2040             :     uint32      seg_offset;
    2041             :     XLogRecPtr  result;
    2042             : 
    2043    30082698 :     fullsegs = bytepos / UsableBytesInSegment;
    2044    30082698 :     bytesleft = bytepos % UsableBytesInSegment;
    2045             : 
    2046    30082698 :     if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
    2047             :     {
    2048             :         /* fits on first page of segment */
    2049      450242 :         if (bytesleft == 0)
    2050      399290 :             seg_offset = 0;
    2051             :         else
    2052       50952 :             seg_offset = bytesleft + SizeOfXLogLongPHD;
    2053             :     }
    2054             :     else
    2055             :     {
    2056             :         /* account for the first page on segment with long header */
    2057    29632456 :         seg_offset = XLOG_BLCKSZ;
    2058    29632456 :         bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
    2059             : 
    2060    29632456 :         fullpages = bytesleft / UsableBytesInPage;
    2061    29632456 :         bytesleft = bytesleft % UsableBytesInPage;
    2062             : 
    2063    29632456 :         if (bytesleft == 0)
    2064       27786 :             seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
    2065             :         else
    2066    29604670 :             seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
    2067             :     }
    2068             : 
    2069    30082698 :     XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
    2070             : 
    2071    30082698 :     return result;
    2072             : }
    2073             : 
    2074             : /*
    2075             :  * Convert an XLogRecPtr to a "usable byte position".
    2076             :  */
    2077             : static uint64
    2078        3002 : XLogRecPtrToBytePos(XLogRecPtr ptr)
    2079             : {
    2080             :     uint64      fullsegs;
    2081             :     uint32      fullpages;
    2082             :     uint32      offset;
    2083             :     uint64      result;
    2084             : 
    2085        3002 :     XLByteToSeg(ptr, fullsegs, wal_segment_size);
    2086             : 
    2087        3002 :     fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
    2088        3002 :     offset = ptr % XLOG_BLCKSZ;
    2089             : 
    2090        3002 :     if (fullpages == 0)
    2091             :     {
    2092        1094 :         result = fullsegs * UsableBytesInSegment;
    2093        1094 :         if (offset > 0)
    2094             :         {
    2095             :             Assert(offset >= SizeOfXLogLongPHD);
    2096         786 :             result += offset - SizeOfXLogLongPHD;
    2097             :         }
    2098             :     }
    2099             :     else
    2100             :     {
    2101        3816 :         result = fullsegs * UsableBytesInSegment +
    2102        3816 :             (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
    2103        1908 :             (fullpages - 1) * UsableBytesInPage;    /* full pages */
    2104        1908 :         if (offset > 0)
    2105             :         {
    2106             :             Assert(offset >= SizeOfXLogShortPHD);
    2107        1908 :             result += offset - SizeOfXLogShortPHD;
    2108             :         }
    2109             :     }
    2110             : 
    2111        3002 :     return result;
    2112             : }
    2113             : 
    2114             : /*
    2115             :  * Initialize XLOG buffers, writing out old buffers if they still contain
    2116             :  * unwritten data, up to the page containing 'upto'. Or if 'opportunistic' is
    2117             :  * true, initialize as many pages as we can without having to write out
    2118             :  * unwritten data. Any new pages are initialized to zeros, with page headers
    2119             :  * initialized properly.
    2120             :  */
    2121             : static void
    2122      738544 : AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
    2123             : {
    2124      738544 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    2125             :     int         nextidx;
    2126             :     XLogRecPtr  OldPageRqstPtr;
    2127             :     XLogwrtRqst WriteRqst;
    2128      738544 :     XLogRecPtr  NewPageEndPtr = InvalidXLogRecPtr;
    2129             :     XLogRecPtr  NewPageBeginPtr;
    2130             :     XLogPageHeader NewPage;
    2131      738544 :     int         npages = 0;
    2132             : 
    2133      738544 :     LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
    2134             : 
    2135             :     /*
    2136             :      * Now that we have the lock, check if someone initialized the page
    2137             :      * already.
    2138             :      */
    2139     2120960 :     while (upto >= XLogCtl->InitializedUpTo || opportunistic)
    2140             :     {
    2141     1391766 :         nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
    2142             : 
    2143             :         /*
    2144             :          * Get ending-offset of the buffer page we need to replace (this may
    2145             :          * be zero if the buffer hasn't been used yet).  Fall through if it's
    2146             :          * already written out.
    2147             :          */
    2148     1391766 :         OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
    2149     1391766 :         if (LogwrtResult.Write < OldPageRqstPtr)
    2150             :         {
    2151             :             /*
    2152             :              * Nope, got work to do. If we just want to pre-initialize as much
    2153             :              * as we can without flushing, give up now.
    2154             :              */
    2155      429782 :             if (opportunistic)
    2156        9350 :                 break;
    2157             : 
    2158             :             /* Before waiting, get info_lck and update LogwrtResult */
    2159      420432 :             SpinLockAcquire(&XLogCtl->info_lck);
    2160      420432 :             if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
    2161      398900 :                 XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
    2162      420432 :             LogwrtResult = XLogCtl->LogwrtResult;
    2163      420432 :             SpinLockRelease(&XLogCtl->info_lck);
    2164             : 
    2165             :             /*
    2166             :              * Now that we have an up-to-date LogwrtResult value, see if we
    2167             :              * still need to write it or if someone else already did.
    2168             :              */
    2169      420432 :             if (LogwrtResult.Write < OldPageRqstPtr)
    2170             :             {
    2171             :                 /*
    2172             :                  * Must acquire write lock. Release WALBufMappingLock first,
    2173             :                  * to make sure that all insertions that we need to wait for
    2174             :                  * can finish (up to this same position). Otherwise we risk
    2175             :                  * deadlock.
    2176             :                  */
    2177      420066 :                 LWLockRelease(WALBufMappingLock);
    2178             : 
    2179      420066 :                 WaitXLogInsertionsToFinish(OldPageRqstPtr);
    2180             : 
    2181      420066 :                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
    2182             : 
    2183      420066 :                 LogwrtResult = XLogCtl->LogwrtResult;
    2184      420066 :                 if (LogwrtResult.Write >= OldPageRqstPtr)
    2185             :                 {
    2186             :                     /* OK, someone wrote it already */
    2187          72 :                     LWLockRelease(WALWriteLock);
    2188             :                 }
    2189             :                 else
    2190             :                 {
    2191             :                     /* Have to write it ourselves */
    2192             :                     TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
    2193      419994 :                     WriteRqst.Write = OldPageRqstPtr;
    2194      419994 :                     WriteRqst.Flush = 0;
    2195      419994 :                     XLogWrite(WriteRqst, false);
    2196      419994 :                     LWLockRelease(WALWriteLock);
    2197             :                     TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
    2198             :                 }
    2199             :                 /* Re-acquire WALBufMappingLock and retry */
    2200      420066 :                 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
    2201      420066 :                 continue;
    2202             :             }
    2203             :         }
    2204             : 
    2205             :         /*
    2206             :          * Now the next buffer slot is free and we can set it up to be the
    2207             :          * next output page.
    2208             :          */
    2209      962350 :         NewPageBeginPtr = XLogCtl->InitializedUpTo;
    2210      962350 :         NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
    2211             : 
    2212             :         Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
    2213             : 
    2214      962350 :         NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
    2215             : 
    2216             :         /*
    2217             :          * Be sure to re-zero the buffer so that bytes beyond what we've
    2218             :          * written will look like zeroes and not valid XLOG records...
    2219             :          */
    2220      962350 :         MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
    2221             : 
    2222             :         /*
    2223             :          * Fill the new page's header
    2224             :          */
    2225      962350 :         NewPage->xlp_magic = XLOG_PAGE_MAGIC;
    2226             : 
    2227             :         /* NewPage->xlp_info = 0; */ /* done by memset */
    2228      962350 :         NewPage->xlp_tli = ThisTimeLineID;
    2229      962350 :         NewPage->xlp_pageaddr = NewPageBeginPtr;
    2230             : 
    2231             :         /* NewPage->xlp_rem_len = 0; */  /* done by memset */
    2232             : 
    2233             :         /*
    2234             :          * If online backup is not in progress, mark the header to indicate
    2235             :          * that WAL records beginning in this page have removable backup
    2236             :          * blocks.  This allows the WAL archiver to know whether it is safe to
    2237             :          * compress archived WAL data by transforming full-block records into
    2238             :          * the non-full-block format.  It is sufficient to record this at the
    2239             :          * page level because we force a page switch (in fact a segment
    2240             :          * switch) when starting a backup, so the flag will be off before any
    2241             :          * records can be written during the backup.  At the end of a backup,
    2242             :          * the last page will be marked as all unsafe when perhaps only part
    2243             :          * is unsafe, but at worst the archiver would miss the opportunity to
    2244             :          * compress a few records.
    2245             :          */
    2246      962350 :         if (!Insert->forcePageWrites)
    2247      831946 :             NewPage->xlp_info |= XLP_BKP_REMOVABLE;
    2248             : 
    2249             :         /*
    2250             :          * If first page of an XLOG segment file, make it a long header.
    2251             :          */
    2252      962350 :         if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
    2253             :         {
    2254         790 :             XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
    2255             : 
    2256         790 :             NewLongPage->xlp_sysid = ControlFile->system_identifier;
    2257         790 :             NewLongPage->xlp_seg_size = wal_segment_size;
    2258         790 :             NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
    2259         790 :             NewPage->xlp_info |= XLP_LONG_HEADER;
    2260             :         }
    2261             : 
    2262             :         /*
    2263             :          * Make sure the initialization of the page becomes visible to others
    2264             :          * before the xlblocks update. GetXLogBuffer() reads xlblocks without
    2265             :          * holding a lock.
    2266             :          */
    2267      962350 :         pg_write_barrier();
    2268             : 
    2269      962350 :         *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
    2270             : 
    2271      962350 :         XLogCtl->InitializedUpTo = NewPageEndPtr;
    2272             : 
    2273      962350 :         npages++;
    2274             :     }
    2275      738544 :     LWLockRelease(WALBufMappingLock);
    2276             : 
    2277             : #ifdef WAL_DEBUG
    2278             :     if (XLOG_DEBUG && npages > 0)
    2279             :     {
    2280             :         elog(DEBUG1, "initialized %d pages, up to %X/%X",
    2281             :              npages, (uint32) (NewPageEndPtr >> 32), (uint32) NewPageEndPtr);
    2282             :     }
    2283             : #endif
    2284      738544 : }
    2285             : 
    2286             : /*
    2287             :  * Calculate CheckPointSegments based on max_wal_size_mb and
    2288             :  * checkpoint_completion_target.
                     :  *
                     :  * Sets the global CheckPointSegments to
                     :  *   ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
                     :  *       (1.0 + CheckPointCompletionTarget)
                     :  * truncated to an integer, but never less than 1.  Takes no arguments and
                     :  * returns nothing; it reads the current values of those globals.
    2289             :  */
    2290             : static void
    2291        8558 : CalculateCheckpointSegments(void)
    2292             : {
    2293             :     double      target;
    2294             : 
    2295             :     /*-------
    2296             :      * Calculate the distance at which to trigger a checkpoint, to avoid
    2297             :      * exceeding max_wal_size_mb. This is based on two assumptions:
    2298             :      *
    2299             :      * a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
    2300             :      *    WAL for two checkpoint cycles to allow us to recover from the
    2301             :      *    secondary checkpoint if the first checkpoint failed, though we
    2302             :      *    only did this on the master anyway, not on standby. Keeping just
    2303             :      *    one checkpoint simplifies processing and reduces disk space in
    2304             :      *    many smaller databases.)
    2305             :      * b) during checkpoint, we consume checkpoint_completion_target *
    2306             :      *    number of segments consumed between checkpoints.
    2307             :      *-------
    2308             :      */
    2309       17116 :     target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
    2310        8558 :         (1.0 + CheckPointCompletionTarget);
    2311             : 
    2312             :     /* round down */
    2313        8558 :     CheckPointSegments = (int) target;
    2314             : 
    2315        8558 :     if (CheckPointSegments < 1)
    2316          14 :         CheckPointSegments = 1;     /* never go below one segment */
    2317        8558 : }
    2318             : 
    2319             : void                 /* presumably the GUC assign hook for max_wal_size -- confirm against the GUC tables */
    2320        7094 : assign_max_wal_size(int newval, void *extra)     /* 'extra' is not used here */
    2321             : {
    2322        7094 :     max_wal_size_mb = newval;       /* install the new value first ... */
    2323        7094 :     CalculateCheckpointSegments();  /* ... because the recalculation reads max_wal_size_mb */
    2324        7094 : }
    2325             : 
    2326             : void                 /* presumably the GUC assign hook for checkpoint_completion_target -- confirm against the GUC tables */
    2327           0 : assign_checkpoint_completion_target(double newval, void *extra)  /* 'extra' is not used here */
    2328             : {
    2329           0 :     CheckPointCompletionTarget = newval;    /* install the new value first ... */
    2330           0 :     CalculateCheckpointSegments();          /* ... because the recalculation reads CheckPointCompletionTarget */
    2331           0 : }
    2332             : 
    2333             : /*
    2334             :  * At a checkpoint, how many WAL segments to recycle as preallocated future
    2335             :  * XLOG segments? Returns the highest segment that should be preallocated.
                     :  *
                     :  * The estimate is ceil((lastredoptr + distance) / wal_segment_size), where
                     :  * distance is CheckPointDistanceEstimate scaled by
                     :  * (1.0 + CheckPointCompletionTarget) plus a 10% margin.  The estimate is
                     :  * then clamped to the segment numbers derived from min_wal_size_mb and
                     :  * max_wal_size_mb, both counted from the segment containing lastredoptr.
                     :  * The upper clamp is applied last, so it wins if the two bounds conflict.
    2336             :  */
    2337             : static XLogSegNo
    2338         718 : XLOGfileslop(XLogRecPtr lastredoptr)
    2339             : {
    2340             :     XLogSegNo   minSegNo;
    2341             :     XLogSegNo   maxSegNo;
    2342             :     double      distance;
    2343             :     XLogSegNo   recycleSegNo;
    2344             : 
    2345             :     /*
    2346             :      * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
    2347             :      * correspond to. Always recycle enough segments to meet the minimum, and
    2348             :      * remove enough segments to stay below the maximum.
    2349             :      */
    2350        1436 :     minSegNo = lastredoptr / wal_segment_size +
    2351         718 :         ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1;
    2352        1436 :     maxSegNo = lastredoptr / wal_segment_size +
    2353         718 :         ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1;
    2354             : 
    2355             :     /*
    2356             :      * Between those limits, recycle enough segments to get us through to the
    2357             :      * estimated end of next checkpoint.
    2358             :      *
    2359             :      * To estimate where the next checkpoint will finish, assume that the
    2360             :      * system runs steadily consuming CheckPointDistanceEstimate bytes between
    2361             :      * every checkpoint.
    2362             :      */
    2363         718 :     distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
    2364             :     /* add 10% for good measure. */
    2365         718 :     distance *= 1.10;
    2366             : 
    2367         718 :     recycleSegNo = (XLogSegNo) ceil(((double) lastredoptr + distance) /
    2368             :                                     wal_segment_size);
    2369             : 
    2370         718 :     if (recycleSegNo < minSegNo)
    2371          60 :         recycleSegNo = minSegNo;    /* honor the min_wal_size_mb floor */
    2372         718 :     if (recycleSegNo > maxSegNo)
    2373         514 :         recycleSegNo = maxSegNo;    /* honor the max_wal_size_mb ceiling (checked last) */
    2374             : 
    2375         718 :     return recycleSegNo;
    2376             : }
    2377             : 
    2378             : /*
    2379             :  * Check whether we've consumed enough xlog space that a checkpoint is needed.
    2380             :  *
    2381             :  * new_segno indicates a log file that has just been filled up (or read
    2382             :  * during recovery). We measure the distance from RedoRecPtr to new_segno
    2383             :  * and see if that exceeds CheckPointSegments.
    2384             :  *
    2385             :  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
                     :  *
                     :  * Returns true when new_segno is at least (CheckPointSegments - 1) segments
                     :  * past the segment containing RedoRecPtr, false otherwise.
                     :  *
                     :  * NOTE(review): the uint64 cast assumes CheckPointSegments >= 1; a smaller
                     :  * value would wrap to a huge distance.  CalculateCheckpointSegments() clamps
                     :  * to 1, but confirm there is no other writer of CheckPointSegments.
    2386             :  */
    2387             : static bool
    2388        1070 : XLogCheckpointNeeded(XLogSegNo new_segno)
    2389             : {
    2390             :     XLogSegNo   old_segno;
    2391             : 
    2392        1070 :     XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size);
    2393             : 
    2394        1070 :     if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
    2395         184 :         return true;
    2396         886 :     return false;
    2397             : }
    2398             : 
    2399             : /*
    2400             :  * Write and/or fsync the log at least as far as WriteRqst indicates.
    2401             :  *
    2402             :  * If flexible == true, we don't have to write as far as WriteRqst, but
    2403             :  * may stop at any convenient boundary (such as a cache or logfile boundary).
    2404             :  * This option allows us to avoid uselessly issuing multiple writes when a
    2405             :  * single one would do.
    2406             :  *
    2407             :  * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
    2408             :  * must be called before grabbing the lock, to make sure the data is ready to
    2409             :  * write.
                     :  *
                     :  * WriteRqst.Write is the position to write up to.  WriteRqst.Flush is the
                     :  * position to flush up to; the explicit flush step is skipped unless it is
                     :  * beyond the current LogwrtResult.Flush (so 0 requests no flush).  A segment
                     :  * that gets completely written is fsync'd regardless of WriteRqst.Flush.
                     :  *
                     :  * On return the local LogwrtResult and the shared XLogCtl->LogwrtResult
                     :  * reflect what was written and flushed, and the shared LogwrtRqst values
                     :  * have been advanced so they do not lag behind the results.  The open file
                     :  * state (openLogFile, openLogSegNo) may have been switched to another
                     :  * segment.  A failed write, or a request beyond the initialized buffers, is
                     :  * reported with PANIC.
    2410             :  */
    2411             : static void
    2412      687966 : XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
    2413             : {
    2414             :     bool        ispartialpage;
    2415             :     bool        last_iteration;
    2416             :     bool        finishing_seg;
    2417             :     bool        use_existent;
    2418             :     int         curridx;
    2419             :     int         npages;
    2420             :     int         startidx;
    2421             :     uint32      startoffset;
    2422             : 
    2423             :     /* We should always be inside a critical section here */
    2424             :     Assert(CritSectionCount > 0);
    2425             : 
    2426             :     /*
    2427             :      * Update local LogwrtResult (caller probably did this already, but...)
    2428             :      */
    2429      687966 :     LogwrtResult = XLogCtl->LogwrtResult;
    2430             : 
    2431             :     /*
    2432             :      * Since successive pages in the xlog cache are consecutively allocated,
    2433             :      * we can usually gather multiple pages together and issue just one
    2434             :      * write() call.  npages is the number of pages we have determined can be
    2435             :      * written together; startidx is the cache block index of the first one,
    2436             :      * and startoffset is the file offset at which it should go. The latter
    2437             :      * two variables are only valid when npages > 0, but we must initialize
    2438             :      * all of them to keep the compiler quiet.
    2439             :      */
    2440      687966 :     npages = 0;
    2441      687966 :     startidx = 0;
    2442      687966 :     startoffset = 0;
    2443             : 
    2444             :     /*
    2445             :      * Within the loop, curridx is the cache block index of the page to
    2446             :      * consider writing.  Begin at the buffer containing the next unwritten
    2447             :      * page, or last partially written page.
    2448             :      */
    2449      687966 :     curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
    2450             : 
    2451     1607448 :     while (LogwrtResult.Write < WriteRqst.Write)
    2452             :     {
    2453             :         /*
    2454             :          * Make sure we're not ahead of the insert process.  This could happen
    2455             :          * if we're passed a bogus WriteRqst.Write that is past the end of the
    2456             :          * last page that's been initialized by AdvanceXLInsertBuffer.
    2457             :          */
    2458     1187012 :         XLogRecPtr  EndPtr = XLogCtl->xlblocks[curridx];
    2459             : 
    2460     1187012 :         if (LogwrtResult.Write >= EndPtr)
    2461           0 :             elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
    2462             :                  (uint32) (LogwrtResult.Write >> 32),
    2463             :                  (uint32) LogwrtResult.Write,
    2464             :                  (uint32) (EndPtr >> 32), (uint32) EndPtr);
    2465             : 
    2466             :         /* Advance LogwrtResult.Write to end of current buffer page */
    2467     1187012 :         LogwrtResult.Write = EndPtr;
    2468     1187012 :         ispartialpage = WriteRqst.Write < LogwrtResult.Write;
    2469             : 
    2470     1187012 :         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
    2471             :                              wal_segment_size))
    2472             :         {
    2473             :             /*
    2474             :              * Switch to new logfile segment.  We cannot have any pending
    2475             :              * pages here (since we dump what we have at segment end).
    2476             :              */
    2477             :             Assert(npages == 0);
    2478        8230 :             if (openLogFile >= 0)
    2479        1944 :                 XLogFileClose();
    2480        8230 :             XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
    2481             :                             wal_segment_size);
    2482             : 
    2483             :             /* create/use new log file */
    2484        8230 :             use_existent = true;
    2485        8230 :             openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
    2486        8230 :             ReserveExternalFD();
    2487             :         }
    2488             : 
    2489             :         /* Make sure we have the current logfile open */
    2490     1187012 :         if (openLogFile < 0)
    2491             :         {
    2492           0 :             XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
    2493             :                             wal_segment_size);
    2494           0 :             openLogFile = XLogFileOpen(openLogSegNo);
    2495           0 :             ReserveExternalFD();
    2496             :         }
    2497             : 
    2498             :         /* Add current page to the set of pending pages-to-dump */
    2499     1187012 :         if (npages == 0)
    2500             :         {
    2501             :             /* first of group; Write already points at this page's end, so back up one page for its file offset */
    2502      689256 :             startidx = curridx;
    2503      689256 :             startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ,
    2504             :                                             wal_segment_size);
    2505             :         }
    2506     1187012 :         npages++;
    2507             : 
    2508             :         /*
    2509             :          * Dump the set if this will be the last loop iteration, or if we are
    2510             :          * at the last page of the cache area (since the next page won't be
    2511             :          * contiguous in memory), or if we are at the end of the logfile
    2512             :          * segment.
    2513             :          */
    2514     1187012 :         last_iteration = WriteRqst.Write <= LogwrtResult.Write;
    2515             : 
    2516     2111472 :         finishing_seg = !ispartialpage &&
    2517      924460 :             (startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;
    2518             : 
    2519     1187012 :         if (last_iteration ||
    2520      499264 :             curridx == XLogCtl->XLogCacheBlck ||
    2521             :             finishing_seg)
    2522             :         {
    2523             :             char       *from;
    2524             :             Size        nbytes;
    2525             :             Size        nleft;
    2526             :             int         written;
    2527             : 
    2528             :             /* OK to write the page(s) */
    2529      689256 :             from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
    2530      689256 :             nbytes = npages * (Size) XLOG_BLCKSZ;
    2531      689256 :             nleft = nbytes;
    2532             :             do
    2533             :             {
    2534      689256 :                 errno = 0;
    2535      689256 :                 pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
    2536      689256 :                 written = pg_pwrite(openLogFile, from, nleft, startoffset);
    2537      689256 :                 pgstat_report_wait_end();
    2538      689256 :                 if (written <= 0)
    2539             :                 {
    2540             :                     char        xlogfname[MAXFNAMELEN];
    2541             :                     int         save_errno;
    2542             : 
    2543           0 :                     if (errno == EINTR)
    2544           0 :                         continue;   /* goes to the while test; nleft is unchanged, so the write is retried */
    2545             : 
    2546           0 :                     save_errno = errno;     /* keep errno for %m across XLogFileName() */
    2547           0 :                     XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo,
    2548             :                                  wal_segment_size);
    2549           0 :                     errno = save_errno;
    2550           0 :                     ereport(PANIC,
    2551             :                             (errcode_for_file_access(),
    2552             :                              errmsg("could not write to log file %s "
    2553             :                                     "at offset %u, length %zu: %m",
    2554             :                                     xlogfname, startoffset, nleft)));
    2555             :                 }
    2556      689256 :                 nleft -= written;       /* short write: step past what was written and loop for the rest */
    2557      689256 :                 from += written;
    2558      689256 :                 startoffset += written;
    2559      689256 :             } while (nleft > 0);
    2560             : 
    2561      689256 :             npages = 0;
    2562             : 
    2563             :             /*
    2564             :              * If we just wrote the whole last page of a logfile segment,
    2565             :              * fsync the segment immediately.  This avoids having to go back
    2566             :              * and re-open prior segments when an fsync request comes along
    2567             :              * later. Doing it here ensures that one and only one backend will
    2568             :              * perform this fsync.
    2569             :              *
    2570             :              * This is also the right place to notify the Archiver that the
    2571             :              * segment is ready to copy to archival storage, and to update the
    2572             :              * timer for archive_timeout, and to signal for a checkpoint if
    2573             :              * too many logfile segments have been used since the last
    2574             :              * checkpoint.
    2575             :              */
    2576      689256 :             if (finishing_seg)
    2577             :             {
    2578         890 :                 issue_xlog_fsync(openLogFile, openLogSegNo);
    2579             : 
    2580             :                 /* signal that we need to wakeup walsenders later */
    2581         890 :                 WalSndWakeupRequest();
    2582             : 
    2583         890 :                 LogwrtResult.Flush = LogwrtResult.Write;    /* end of page */
    2584             : 
    2585         890 :                 if (XLogArchivingActive())
    2586          20 :                     XLogArchiveNotifySeg(openLogSegNo);
    2587             : 
    2588         890 :                 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
    2589         890 :                 XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;
    2590             : 
    2591             :                 /*
    2592             :                  * Request a checkpoint if we've consumed too much xlog since
    2593             :                  * the last one.  For speed, we first check using the local
    2594             :                  * copy of RedoRecPtr, which might be out of date; if it looks
    2595             :                  * like a checkpoint is needed, forcibly update RedoRecPtr and
    2596             :                  * recheck.
    2597             :                  */
    2598         890 :                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
    2599             :                 {
    2600          60 :                     (void) GetRedoRecPtr();
    2601          60 :                     if (XLogCheckpointNeeded(openLogSegNo))
    2602          48 :                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
    2603             :                 }
    2604             :             }
    2605             :         }
    2606             : 
    2607     1187012 :         if (ispartialpage)
    2608             :         {
    2609             :             /*
                     :              * Only asked to write a partial page.  ispartialpage implies
                     :              * last_iteration, so the page was dumped above; report just the
                     :              * requested position as written, not the page end.
                     :              */
    2610      262552 :             LogwrtResult.Write = WriteRqst.Write;
    2611      262552 :             break;
    2612             :         }
    2613      924460 :         curridx = NextBufIdx(curridx);
    2614             : 
    2615             :         /* If flexible, break out of loop as soon as we wrote something */
    2616      924460 :         if (flexible && npages == 0)
    2617        4978 :             break;
    2618             :     }
    2619             : 
    2620             :     Assert(npages == 0);
    2621             : 
    2622             :     /*
    2623             :      * If asked to flush, do so
    2624             :      */
    2625      687966 :     if (LogwrtResult.Flush < WriteRqst.Flush &&
    2626      263874 :         LogwrtResult.Flush < LogwrtResult.Write)
    2627             : 
    2628             :     {
    2629             :         /*
    2630             :          * Could get here without iterating above loop, in which case we might
    2631             :          * have no open file or the wrong one.  However, we do not need to
    2632             :          * fsync more than one file.
    2633             :          */
    2634      263822 :         if (sync_method != SYNC_METHOD_OPEN &&
    2635      263822 :             sync_method != SYNC_METHOD_OPEN_DSYNC)
    2636             :         {
    2637      263822 :             if (openLogFile >= 0 &&
    2638      263812 :                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
    2639             :                                  wal_segment_size))
    2640           0 :                 XLogFileClose();
    2641      263822 :             if (openLogFile < 0)
    2642             :             {
    2643          10 :                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
    2644             :                                 wal_segment_size);
    2645          10 :                 openLogFile = XLogFileOpen(openLogSegNo);
    2646          10 :                 ReserveExternalFD();
    2647             :             }
    2648             : 
    2649      263822 :             issue_xlog_fsync(openLogFile, openLogSegNo);
    2650             :         }
    2651             : 
    2652             :         /* signal that we need to wakeup walsenders later */
    2653      263822 :         WalSndWakeupRequest();
    2654             : 
    2655      263822 :         LogwrtResult.Flush = LogwrtResult.Write;
    2656             :     }
    2657             : 
    2658             :     /*
    2659             :      * Update shared-memory status
    2660             :      *
    2661             :      * We make sure that the shared 'request' values do not fall behind the
    2662             :      * 'result' values.  This is not absolutely essential, but it saves some
    2663             :      * code in a couple of places.
    2664             :      */
    2665             :     {
    2666      687966 :         SpinLockAcquire(&XLogCtl->info_lck);
    2667      687966 :         XLogCtl->LogwrtResult = LogwrtResult;
    2668      687966 :         if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
    2669      250952 :             XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
    2670      687966 :         if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
    2671      264284 :             XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
    2672      687966 :         SpinLockRelease(&XLogCtl->info_lck);
    2673             :     }
    2674      687966 : }
    2675             : 
    2676             : /*
    2677             :  * Record the LSN for an asynchronous transaction commit/abort
    2678             :  * and nudge the WALWriter if there is work for it to do.
    2679             :  * (This should not be called for synchronous commits.)
                     :  *
                     :  * asyncXactLSN: the LSN to record.  The shared XLogCtl->asyncXactLSN is
                     :  * only ever moved forward here; a smaller value leaves it unchanged.
                     :  *
                     :  * The walwriter's latch is set if the walwriter was marked as sleeping, or
                     :  * if the request, rounded down to an XLOG_BLCKSZ boundary, lies beyond the
                     :  * flush position read from shared memory.  The nudge is skipped when
                     :  * ProcGlobal->walwriterLatch is not set.
                     :  *
                     :  * As a side effect, LogwrtResult is refreshed from XLogCtl->LogwrtResult.
    2680             :  */
    2681             : void
    2682       62948 : XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
    2683             : {
    2684       62948 :     XLogRecPtr  WriteRqstPtr = asyncXactLSN;
    2685             :     bool        sleeping;
    2686             : 
    2687       62948 :     SpinLockAcquire(&XLogCtl->info_lck);
    2688       62948 :     LogwrtResult = XLogCtl->LogwrtResult;
    2689       62948 :     sleeping = XLogCtl->WalWriterSleeping;
    2690       62948 :     if (XLogCtl->asyncXactLSN < asyncXactLSN)
    2691       62668 :         XLogCtl->asyncXactLSN = asyncXactLSN;
    2692       62948 :     SpinLockRelease(&XLogCtl->info_lck);
    2693             : 
    2694             :     /*
    2695             :      * If the WALWriter is sleeping, we should kick it to make it come out of
    2696             :      * low-power mode.  Otherwise, determine whether there's a full page of
    2697             :      * WAL available to write.
    2698             :      */
    2699       62948 :     if (!sleeping)
    2700             :     {
    2701             :         /* back off to last completed page boundary */
    2702       62916 :         WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
    2703             : 
    2704             :         /* if we have already flushed that far, we're done */
    2705       62916 :         if (WriteRqstPtr <= LogwrtResult.Flush)
    2706       13376 :             return;
    2707             :     }
    2708             : 
    2709             :     /*
    2710             :      * Nudge the WALWriter: it has a full page of WAL to write, or we want it
    2711             :      * to come out of low-power mode so that this async commit will reach disk
    2712             :      * within the expected amount of time.
    2713             :      */
    2714       49572 :     if (ProcGlobal->walwriterLatch)
    2715       11622 :         SetLatch(ProcGlobal->walwriterLatch);
    2716             : }
    2717             : 
    2718             : /*
    2719             :  * Record the LSN up to which we can remove WAL because it's not required by
    2720             :  * any replication slot.
                     :  *
                     :  * The value is stored in XLogCtl->replicationSlotMinLSN while holding
                     :  * info_lck.  It is overwritten unconditionally: there is no check here
                     :  * that 'lsn' is newer than the value already stored.
    2721             :  */
    2722             : void
    2723        2498 : XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
    2724             : {
    2725        2498 :     SpinLockAcquire(&XLogCtl->info_lck);
    2726        2498 :     XLogCtl->replicationSlotMinLSN = lsn;
    2727        2498 :     SpinLockRelease(&XLogCtl->info_lck);
    2728        2498 : }
    2729             : 
    2730             : 
    2731             : /*
    2732             :  * Return the oldest LSN we must retain to satisfy the needs of some
    2733             :  * replication slot.
                     :  *
                     :  * This simply reads XLogCtl->replicationSlotMinLSN while holding info_lck
                     :  * and returns the value found there.
    2734             :  */
    2735             : static XLogRecPtr
    2736        3344 : XLogGetReplicationSlotMinimumLSN(void)
    2737             : {
    2738             :     XLogRecPtr  retval;
    2739             : 
    2740        3344 :     SpinLockAcquire(&XLogCtl->info_lck);
    2741        3344 :     retval = XLogCtl->replicationSlotMinLSN;
    2742        3344 :     SpinLockRelease(&XLogCtl->info_lck);
    2743             : 
    2744        3344 :     return retval;
    2745             : }
    2746             : 
    2747             : /*
    2748             :  * Advance minRecoveryPoint in control file.
    2749             :  *
    2750             :  * If we crash during recovery, we must reach this point again before the
    2751             :  * database is consistent.
    2752             :  *
    2753             :  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
    2754             :  * is only updated if it's not already greater than or equal to 'lsn'.
                     :  *
                     :  * The new value is taken from XLogCtl->replayEndRecPtr, not from 'lsn',
                     :  * and the control file is rewritten only when that value is beyond the
                     :  * minRecoveryPoint already stored there.  If minRecoveryPoint turns out
                     :  * to be invalid, the local updateMinRecoveryPoint flag is cleared so that
                     :  * later calls return at the quick check below.
    2755             :  */
    2756             : static void
    2757        4188 : UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
    2758             : {
    2759             :     /* Quick check using our local copy of the variable */
    2760        4188 :     if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
    2761        3726 :         return;
    2762             : 
    2763             :     /*
    2764             :      * An invalid minRecoveryPoint means that we need to recover all the WAL,
    2765             :      * i.e., we're doing crash recovery.  We never modify the control file's
    2766             :      * value in that case, so we can short-circuit future checks here too. The
    2767             :      * local values of minRecoveryPoint and minRecoveryPointTLI should not be
    2768             :      * updated until crash recovery finishes.  We only do this for the startup
    2769             :      * process as it should not update its own reference of minRecoveryPoint
    2770             :      * until it has finished crash recovery to make sure that all WAL
    2771             :      * available is replayed in this case.  This also saves from extra locks
    2772             :      * taken on the control file from the startup process.
    2773             :      */
    2774         462 :     if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
    2775             :     {
    2776          36 :         updateMinRecoveryPoint = false;
    2777          36 :         return;
    2778             :     }
    2779             : 
    2780         426 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    2781             : 
    2782             :     /* update local copy */
    2783         426 :     minRecoveryPoint = ControlFile->minRecoveryPoint;
    2784         426 :     minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    2785             : 
    2786         426 :     if (XLogRecPtrIsInvalid(minRecoveryPoint))
    2787           0 :         updateMinRecoveryPoint = false;
    2788         426 :     else if (force || minRecoveryPoint < lsn)
    2789             :     {
    2790             :         XLogRecPtr  newMinRecoveryPoint;
    2791             :         TimeLineID  newMinRecoveryPointTLI;
    2792             : 
    2793             :         /*
    2794             :          * To avoid having to update the control file too often, we update it
    2795             :          * all the way to the last record being replayed, even though 'lsn'
    2796             :          * would suffice for correctness.  This also allows the 'force' case
    2797             :          * to not need a valid 'lsn' value.
    2798             :          *
    2799             :          * Another important reason for doing it this way is that the passed
    2800             :          * 'lsn' value could be bogus, i.e., past the end of available WAL, if
    2801             :          * the caller got it from a corrupted heap page.  Accepting such a
    2802             :          * value as the min recovery point would prevent us from coming up at
    2803             :          * all.  Instead, we just log a warning and continue with recovery.
    2804             :          * (See also the comments about corrupt LSNs in XLogFlush.)
    2805             :          */
    2806         362 :         SpinLockAcquire(&XLogCtl->info_lck);
    2807         362 :         newMinRecoveryPoint = XLogCtl->replayEndRecPtr;
    2808         362 :         newMinRecoveryPointTLI = XLogCtl->replayEndTLI;
    2809         362 :         SpinLockRelease(&XLogCtl->info_lck);
    2810             : 
    2811         362 :         if (!force && newMinRecoveryPoint < lsn)
    2812           0 :             elog(WARNING,
    2813             :                  "xlog min recovery request %X/%X is past current point %X/%X",
    2814             :                  (uint32) (lsn >> 32), (uint32) lsn,
    2815             :                  (uint32) (newMinRecoveryPoint >> 32),
    2816             :                  (uint32) newMinRecoveryPoint);
    2817             : 
    2818             :         /* update control file, but never move its minRecoveryPoint backwards */
    2819         362 :         if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
    2820             :         {
    2821         342 :             ControlFile->minRecoveryPoint = newMinRecoveryPoint;
    2822         342 :             ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
    2823         342 :             UpdateControlFile();
    2824         342 :             minRecoveryPoint = newMinRecoveryPoint;
    2825         342 :             minRecoveryPointTLI = newMinRecoveryPointTLI;
    2826             : 
    2827         342 :             ereport(DEBUG2,
    2828             :                     (errmsg("updated min recovery point to %X/%X on timeline %u",
    2829             :                             (uint32) (minRecoveryPoint >> 32),
    2830             :                             (uint32) minRecoveryPoint,
    2831             :                             newMinRecoveryPointTLI)));
    2832             :         }
    2833             :     }
    2834         426 :     LWLockRelease(ControlFileLock);
    2835             : }
    2836             : 
    2837             : /*
    2838             :  * Ensure that all XLOG data through the given position is flushed to disk.
    2839             :  *
    2840             :  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
    2841             :  * already held, and we try to avoid acquiring it if possible.
                     :  *
                     :  * record: the LSN through which WAL must be flushed.
                     :  *
                     :  * If WAL insertion is not allowed (XLogInsertAllowed() returns false), no
                     :  * WAL is flushed; UpdateMinRecoveryPoint(record, false) is called instead.
                     :  * Otherwise, if the flush position is still short of 'record' on exit, an
                     :  * elog(ERROR) is raised; see the comment at the end of the function.
    2842             :  */
    2843             : void
    2844      788772 : XLogFlush(XLogRecPtr record)
    2845             : {
    2846             :     XLogRecPtr  WriteRqstPtr;
    2847             :     XLogwrtRqst WriteRqst;
    2848             : 
    2849             :     /*
    2850             :      * During REDO, we are reading not writing WAL.  Therefore, instead of
    2851             :      * trying to flush the WAL, we should update minRecoveryPoint instead. We
    2852             :      * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
    2853             :      * to act this way too, and because when it tries to write the
    2854             :      * end-of-recovery checkpoint, it should indeed flush.
    2855             :      */
    2856      788772 :     if (!XLogInsertAllowed())
    2857             :     {
    2858        4094 :         UpdateMinRecoveryPoint(record, false);
    2859      524754 :         return;
    2860             :     }
    2861             : 
    2862             :     /* Quick exit if already known flushed */
    2863      784678 :     if (record <= LogwrtResult.Flush)
    2864      520660 :         return;
    2865             : 
    2866             : #ifdef WAL_DEBUG
    2867             :     if (XLOG_DEBUG)
    2868             :         elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
    2869             :              (uint32) (record >> 32), (uint32) record,
    2870             :              (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
    2871             :              (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
    2872             : #endif
    2873             : 
    2874      264018 :     START_CRIT_SECTION();
    2875             : 
    2876             :     /*
    2877             :      * Since fsync is usually a horribly expensive operation, we try to
    2878             :      * piggyback as much data as we can on each fsync: if we see any more data
    2879             :      * entered into the xlog buffer, we'll write and fsync that too, so that
    2880             :      * the final value of LogwrtResult.Flush is as large as possible. This
    2881             :      * gives us some chance of avoiding another fsync immediately after.
    2882             :      */
    2883             : 
    2884             :     /* initialize to given target; may increase below */
    2885      264018 :     WriteRqstPtr = record;
    2886             : 
    2887             :     /*
    2888             :      * Now wait until we get the write lock, or someone else does the flush
    2889             :      * for us.
    2890             :      */
    2891             :     for (;;)
    2892         610 :     {
    2893             :         XLogRecPtr  insertpos;
    2894             : 
    2895             :         /* read LogwrtResult and update local state */
    2896      264628 :         SpinLockAcquire(&XLogCtl->info_lck);
    2897      264628 :         if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
    2898        2112 :             WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
    2899      264628 :         LogwrtResult = XLogCtl->LogwrtResult;
    2900      264628 :         SpinLockRelease(&XLogCtl->info_lck);
    2901             : 
    2902             :         /* done already? */
    2903      264628 :         if (record <= LogwrtResult.Flush)
    2904        1384 :             break;
    2905             : 
    2906             :         /*
    2907             :          * Before actually performing the write, wait for all in-flight
    2908             :          * insertions to the pages we're about to write to finish.
    2909             :          */
    2910      263244 :         insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
    2911             : 
    2912             :         /*
    2913             :          * Try to get the write lock. If we can't get it immediately, wait
    2914             :          * until it's released, and recheck if we still need to do the flush
    2915             :          * or if the backend that held the lock did it for us already. This
    2916             :          * helps to maintain a good rate of group committing when the system
    2917             :          * is bottlenecked by the speed of fsyncing.
    2918             :          */
    2919      263244 :         if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
    2920             :         {
    2921             :             /*
    2922             :              * The lock is now free, but we didn't acquire it yet. Before we
    2923             :              * do, loop back to check if someone else flushed the record for
    2924             :              * us already.
    2925             :              */
    2926         610 :             continue;
    2927             :         }
    2928             : 
    2929             :         /* Got the lock; recheck whether request is satisfied */
    2930      262634 :         LogwrtResult = XLogCtl->LogwrtResult;
    2931      262634 :         if (record <= LogwrtResult.Flush)
    2932             :         {
    2933         138 :             LWLockRelease(WALWriteLock);
    2934         138 :             break;
    2935             :         }
    2936             : 
    2937             :         /*
    2938             :          * Sleep before flush! By adding a delay here, we may give further
    2939             :          * backends the opportunity to join the backlog of group commit
    2940             :          * followers; this can significantly improve transaction throughput,
    2941             :          * at the risk of increasing transaction latency.
    2942             :          *
    2943             :          * We do not sleep if enableFsync is not turned on, nor if there are
    2944             :          * fewer than CommitSiblings other backends with active transactions.
    2945             :          */
    2946      262496 :         if (CommitDelay > 0 && enableFsync &&
    2947           0 :             MinimumActiveBackends(CommitSiblings))
    2948             :         {
    2949           0 :             pg_usleep(CommitDelay);
    2950             : 
    2951             :             /*
    2952             :              * Re-check how far we can now flush the WAL. It's generally not
    2953             :              * safe to call WaitXLogInsertionsToFinish while holding
    2954             :              * WALWriteLock, because an in-progress insertion might need to
    2955             :              * also grab WALWriteLock to make progress. But we know that all
    2956             :              * the insertions up to insertpos have already finished, because
    2957             :              * that's what the earlier WaitXLogInsertionsToFinish() returned.
    2958             :              * We're only calling it again to allow insertpos to be moved
    2959             :              * further forward, not to actually wait for anyone.
    2960             :              */
    2961           0 :             insertpos = WaitXLogInsertionsToFinish(insertpos);
    2962             :         }
    2963             : 
    2964             :         /* try to write/flush later additions to XLOG as well */
    2965      262496 :         WriteRqst.Write = insertpos;
    2966      262496 :         WriteRqst.Flush = insertpos;
    2967             : 
    2968      262496 :         XLogWrite(WriteRqst, false);
    2969             : 
    2970      262496 :         LWLockRelease(WALWriteLock);
    2971             :         /* done */
    2972      262496 :         break;
    2973             :     }
    2974             : 
    2975      264018 :     END_CRIT_SECTION();
    2976             : 
    2977             :     /* wake up walsenders now that we've released heavily contended locks */
    2978      264018 :     WalSndWakeupProcessRequests();
    2979             : 
    2980             :     /*
    2981             :      * If we still haven't flushed to the request point then we have a
    2982             :      * problem; most likely, the requested flush point is past end of XLOG.
    2983             :      * This has been seen to occur when a disk page has a corrupted LSN.
    2984             :      *
    2985             :      * Formerly we treated this as a PANIC condition, but that hurts the
    2986             :      * system's robustness rather than helping it: we do not want to take down
    2987             :      * the whole system due to corruption on one data page.  In particular, if
    2988             :      * the bad page is encountered again during recovery then we would be
    2989             :      * unable to restart the database at all!  (This scenario actually
    2990             :      * happened in the field several times with 7.1 releases.)  As of 8.4, bad
    2991             :      * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
    2992             :      * the only time we can reach here during recovery is while flushing the
    2993             :      * end-of-recovery checkpoint record, and we don't expect that to have a
    2994             :      * bad LSN.
    2995             :      *
    2996             :      * Note that for calls from xact.c, the ERROR will be promoted to PANIC
    2997             :      * since xact.c calls this routine inside a critical section.  However,
    2998             :      * calls from bufmgr.c are not within critical sections and so we will not
    2999             :      * force a restart for a bad LSN on a data page.
    3000             :      */
    3001      264018 :     if (LogwrtResult.Flush < record)
    3002           0 :         elog(ERROR,
    3003             :              "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
    3004             :              (uint32) (record >> 32), (uint32) record,
    3005             :              (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
    3006             : }
    3007             : 
    3008             : /*
    3009             :  * Write & flush xlog, but without specifying exactly where to.
    3010             :  *
    3011             :  * We normally write only completed blocks; but if there is nothing to do on
    3012             :  * that basis, we check for unwritten async commits in the current incomplete
    3013             :  * block, and write through the latest one of those.  Thus, if async commits
    3014             :  * are not being used, we will write complete blocks only.
    3015             :  *
    3016             :  * If, based on the above, there's anything to write we do so immediately. But
    3017             :  * to avoid calling fsync, fdatasync et al. at a rate that'd impact
    3018             :  * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
    3019             :  * more than wal_writer_flush_after unflushed blocks.
    3020             :  *
    3021             :  * We can guarantee that async commits reach disk after at most three
    3022             :  * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
    3023             :  * to write "flexibly", meaning it can stop at the end of the buffer ring;
    3024             :  * this makes a difference only with very high load or long wal_writer_delay,
    3025             :  * but imposes one extra cycle for the worst case for async commits.)
    3026             :  *
    3027             :  * This routine is invoked periodically by the background walwriter process.
    3028             :  *
    3029             :  * Returns true if there was any work to do, even if we skipped flushing due
    3030             :  * to wal_writer_delay/wal_writer_flush_after.
                     :  * Returns false during recovery, or when everything up to the chosen write
                     :  * request is already known to be flushed.
    3031             :  */
    3032             : bool
    3033       55000 : XLogBackgroundFlush(void)
    3034             : {
    3035             :     XLogwrtRqst WriteRqst;
    3036       55000 :     bool        flexible = true;
    3037             :     static TimestampTz lastflush;   /* when we last requested a flush; 0 = not yet */
    3038             :     TimestampTz now;
    3039             :     int         flushbytes;     /* despite the name, a count of XLOG_BLCKSZ blocks */
    3040             : 
    3041             :     /* XLOG doesn't need flushing during recovery */
    3042       55000 :     if (RecoveryInProgress())
    3043           0 :         return false;
    3044             : 
    3045             :     /* read LogwrtResult and update local state */
    3046       55000 :     SpinLockAcquire(&XLogCtl->info_lck);
    3047       55000 :     LogwrtResult = XLogCtl->LogwrtResult;
    3048       55000 :     WriteRqst = XLogCtl->LogwrtRqst;
    3049       55000 :     SpinLockRelease(&XLogCtl->info_lck);
    3050             : 
    3051             :     /* back off to last completed page boundary */
    3052       55000 :     WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
    3053             : 
    3054             :     /* if we have already flushed that far, consider async commit records */
    3055       55000 :     if (WriteRqst.Write <= LogwrtResult.Flush)
    3056             :     {
    3057       46096 :         SpinLockAcquire(&XLogCtl->info_lck);
    3058       46096 :         WriteRqst.Write = XLogCtl->asyncXactLSN;
    3059       46096 :         SpinLockRelease(&XLogCtl->info_lck);
    3060       46096 :         flexible = false;       /* ensure it all gets written */
    3061             :     }
    3062             : 
    3063             :     /*
    3064             :      * If already known flushed, we're done. Just need to check if we are
    3065             :      * holding an open file handle to a logfile that's no longer in use,
    3066             :      * preventing the file from being deleted.
    3067             :      */
    3068       55000 :     if (WriteRqst.Write <= LogwrtResult.Flush)
    3069             :     {
    3070       45650 :         if (openLogFile >= 0)
    3071             :         {
    3072        4706 :             if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
    3073             :                                  wal_segment_size))
    3074             :             {
    3075          98 :                 XLogFileClose();
    3076             :             }
    3077             :         }
    3078       45650 :         return false;
    3079             :     }
    3080             : 
    3081             :     /*
    3082             :      * Determine how far to flush WAL, based on the wal_writer_delay and
    3083             :      * wal_writer_flush_after GUCs.
    3084             :      */
    3085        9350 :     now = GetCurrentTimestamp();
    3086        9350 :     flushbytes =
    3087        9350 :         WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
    3088             : 
    3089        9350 :     if (WalWriterFlushAfter == 0 || lastflush == 0)
    3090             :     {
    3091             :         /* first call, or block based limits disabled */
    3092         164 :         WriteRqst.Flush = WriteRqst.Write;
    3093         164 :         lastflush = now;
    3094             :     }
    3095        9186 :     else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
    3096             :     {
    3097             :         /*
    3098             :          * Flush the writes at least every WalWriterDelay ms. This is
    3099             :          * important to bound the amount of time it takes for an asynchronous
    3100             :          * commit to hit disk.
    3101             :          */
    3102        1526 :         WriteRqst.Flush = WriteRqst.Write;
    3103        1526 :         lastflush = now;
    3104             :     }
    3105        7660 :     else if (flushbytes >= WalWriterFlushAfter)
    3106             :     {
    3107             :         /* exceeded wal_writer_flush_after blocks, flush */
    3108           0 :         WriteRqst.Flush = WriteRqst.Write;
    3109           0 :         lastflush = now;
    3110             :     }
    3111             :     else
    3112             :     {
    3113             :         /* no flushing, this time round */
    3114        7660 :         WriteRqst.Flush = 0;
    3115             :     }
    3116             : 
    3117             : #ifdef WAL_DEBUG
    3118             :     if (XLOG_DEBUG)
    3119             :         elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
    3120             :              (uint32) (WriteRqst.Write >> 32), (uint32) WriteRqst.Write,
    3121             :              (uint32) (WriteRqst.Flush >> 32), (uint32) WriteRqst.Flush,
    3122             :              (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
    3123             :              (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
    3124             : #endif
    3125             : 
    3126        9350 :     START_CRIT_SECTION();
    3127             : 
    3128             :     /* now wait for any in-progress insertions to finish and get write lock */
    3129        9350 :     WaitXLogInsertionsToFinish(WriteRqst.Write);
    3130        9350 :     LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
    3131        9350 :     LogwrtResult = XLogCtl->LogwrtResult;
    3132        9350 :     if (WriteRqst.Write > LogwrtResult.Write ||
    3133        3930 :         WriteRqst.Flush > LogwrtResult.Flush)
    3134             :     {
    3135        5476 :         XLogWrite(WriteRqst, flexible);
    3136             :     }
    3137        9350 :     LWLockRelease(WALWriteLock);
    3138             : 
    3139        9350 :     END_CRIT_SECTION();
    3140             : 
    3141             :     /* wake up walsenders now that we've released heavily contended locks */
    3142        9350 :     WalSndWakeupProcessRequests();
    3143             : 
    3144             :     /*
    3145             :      * Great, done. To take some work off the critical path, try to initialize
    3146             :      * as many of the no-longer-needed WAL buffers for future use as we can.
    3147             :      */
    3148        9350 :     AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
    3149             : 
    3150             :     /*
    3151             :      * If we determined that we need to write data, but somebody else
    3152             :      * wrote/flushed already, it should be considered as being active, to
    3153             :      * avoid hibernating too early.
    3154             :      */
    3155        9350 :     return true;
    3156             : }
    3157             : 
    3158             : /*
    3159             :  * Test whether XLOG data has been flushed up to (at least) the given position.
    3160             :  *
    3161             :  * Returns true if a flush is still needed.  (It may be that someone else
    3162             :  * is already in process of flushing that far, however.)
                     :  *
                     :  * record: the LSN to test.
                     :  *
                     :  * As a side effect, this may refresh the local copies of shared state:
                     :  * LogwrtResult in normal operation, or minRecoveryPoint and
                     :  * minRecoveryPointTLI during recovery.  During recovery it may also clear
                     :  * the local updateMinRecoveryPoint flag.
    3163             :  */
    3164             : bool
    3165    16995374 : XLogNeedsFlush(XLogRecPtr record)
    3166             : {
    3167             :     /*
    3168             :      * During recovery, we don't flush WAL but update minRecoveryPoint
    3169             :      * instead. So "needs flush" is taken to mean whether minRecoveryPoint
    3170             :      * would need to be updated.
    3171             :      */
    3172    16995374 :     if (RecoveryInProgress())
    3173             :     {
    3174             :         /*
    3175             :          * An invalid minRecoveryPoint means that we need to recover all the
    3176             :          * WAL, i.e., we're doing crash recovery.  We never modify the control
    3177             :          * file's value in that case, so we can short-circuit future checks
    3178             :          * here too.  This triggers a quick exit path for the startup process,
    3179             :          * which cannot update its local copy of minRecoveryPoint as long as
    3180             :          * it has not replayed all WAL available when doing crash recovery.
    3181             :          */
    3182      132956 :         if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
    3183           0 :             updateMinRecoveryPoint = false;
    3184             : 
    3185             :         /* Quick exit if already known to be updated or cannot be updated */
    3186      132956 :         if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
    3187      103700 :             return false;
    3188             : 
    3189             :         /*
    3190             :          * Update local copy of minRecoveryPoint. But if the lock is busy,
    3191             :          * just return a conservative guess.
    3192             :          */
    3193       29256 :         if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
    3194           0 :             return true;
    3195       29256 :         minRecoveryPoint = ControlFile->minRecoveryPoint;
    3196       29256 :         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    3197       29256 :         LWLockRelease(ControlFileLock);
    3198             : 
    3199             :         /*
    3200             :          * Check minRecoveryPoint for any other process than the startup
    3201             :          * process doing crash recovery, which should not update the control
    3202             :          * file value if crash recovery is still running.
    3203             :          */
    3204       29256 :         if (XLogRecPtrIsInvalid(minRecoveryPoint))
    3205           0 :             updateMinRecoveryPoint = false;
    3206             : 
    3207             :         /* check again */
    3208       29256 :         if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
    3209          28 :             return false;
    3210             :         else
    3211       29228 :             return true;
    3212             :     }
    3213             : 
    3214             :     /* Quick exit if already known flushed */
    3215    16862418 :     if (record <= LogwrtResult.Flush)
    3216    16783410 :         return false;
    3217             : 
    3218             :     /* read LogwrtResult and update local state */
    3219       79008 :     SpinLockAcquire(&XLogCtl->info_lck);
    3220       79008 :     LogwrtResult = XLogCtl->LogwrtResult;
    3221       79008 :     SpinLockRelease(&XLogCtl->info_lck);
    3222             : 
    3223             :     /* check again */
    3224       79008 :     if (record <= LogwrtResult.Flush)
    3225        1056 :         return false;
    3226             : 
    3227       77952 :     return true;
    3228             : }
    3229             : 
    3230             : /*
    3231             :  * Create a new XLOG file segment, or open a pre-existing one.
    3232             :  *
    3233             :  * logsegno: identify segment to be created/opened.
    3234             :  *
    3235             :  * *use_existent: if true, OK to use a pre-existing file (else, any
    3236             :  * pre-existing file will be deleted).  On return, true if a pre-existing
    3237             :  * file was used.
    3238             :  *
    3239             :  * use_lock: if true, acquire ControlFileLock while moving file into
    3240             :  * place.  This should be true except during bootstrap log creation.  The
    3241             :  * caller must *not* hold the lock at call.
    3242             :  *
    3243             :  * Returns FD of opened file.
    3244             :  *
    3245             :  * Note: errors here are ERROR not PANIC because we might or might not be
    3246             :  * inside a critical section (eg, during checkpoint there is no reason to
    3247             :  * take down the system on failure).  They will promote to PANIC if we are
    3248             :  * in a critical section.
    3249             :  */
    3250             : int
    3251        8782 : XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
    3252             : {
    3253             :     char        path[MAXPGPATH];
    3254             :     char        tmppath[MAXPGPATH];
    3255             :     PGAlignedXLogBlock zbuffer;
    3256             :     XLogSegNo   installed_segno;
    3257             :     XLogSegNo   max_segno;
    3258             :     int         fd;
    3259             :     int         nbytes;
    3260             :     int         save_errno;
    3261             : 
    3262        8782 :     XLogFilePath(path, ThisTimeLineID, logsegno, wal_segment_size);
    3263             : 
    3264             :     /*
    3265             :      * Try to use existent file (checkpoint maker may have created it already)
    3266             :      */
    3267        8782 :     if (*use_existent)
    3268             :     {
    3269        8424 :         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
    3270        8424 :         if (fd < 0)
    3271             :         {
    3272         418 :             if (errno != ENOENT)
    3273           0 :                 ereport(ERROR,
    3274             :                         (errcode_for_file_access(),
    3275             :                          errmsg("could not open file \"%s\": %m", path)));
    3276             :         }
    3277             :         else
    3278        8006 :             return fd;
    3279             :     }
    3280             : 
    3281             :     /*
    3282             :      * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
    3283             :      * another process is doing the same thing.  If so, we will end up
    3284             :      * pre-creating an extra log segment.  That seems OK, and better than
    3285             :      * holding the lock throughout this lengthy process.
    3286             :      */
    3287         776 :     elog(DEBUG2, "creating and filling new WAL file");
    3288             : 
    3289         776 :     snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
    3290             : 
    3291         776 :     unlink(tmppath);
    3292             : 
    3293             :     /* do not use get_sync_bit() here --- want to fsync only at end of fill */
    3294         776 :     fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
    3295         776 :     if (fd < 0)
    3296           0 :         ereport(ERROR,
    3297             :                 (errcode_for_file_access(),
    3298             :                  errmsg("could not create file \"%s\": %m", tmppath)));
    3299             : 
    3300         776 :     memset(zbuffer.data, 0, XLOG_BLCKSZ);
    3301             : 
    3302         776 :     pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
    3303         776 :     save_errno = 0;
    3304         776 :     if (wal_init_zero)
    3305             :     {
    3306             :         /*
    3307             :          * Zero-fill the file.  With this setting, we do this the hard way to
    3308             :          * ensure that all the file space has really been allocated.  On
    3309             :          * platforms that allow "holes" in files, just seeking to the end
    3310             :          * doesn't allocate intermediate space.  This way, we know that we
    3311             :          * have all the space and (after the fsync below) that all the
    3312             :          * indirect blocks are down on disk.  Therefore, fdatasync(2) or
    3313             :          * O_DSYNC will be sufficient to sync future writes to the log file.
    3314             :          */
    3315     1152264 :         for (nbytes = 0; nbytes < wal_segment_size; nbytes += XLOG_BLCKSZ)
    3316             :         {
    3317     1151488 :             errno = 0;
    3318     1151488 :             if (write(fd, zbuffer.data, XLOG_BLCKSZ) != XLOG_BLCKSZ)
    3319             :             {
    3320             :                 /* if write didn't set errno, assume no disk space */
    3321           0 :                 save_errno = errno ? errno : ENOSPC;
    3322           0 :                 break;
    3323             :             }
    3324             :         }
    3325             :     }
    3326             :     else
    3327             :     {
    3328             :         /*
    3329             :          * Otherwise, seeking to the end and writing a solitary byte is
    3330             :          * enough.
    3331             :          */
    3332           0 :         errno = 0;
    3333           0 :         if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1)
    3334             :         {
    3335             :             /* if write didn't set errno, assume no disk space */
    3336           0 :             save_errno = errno ? errno : ENOSPC;
    3337             :         }
    3338             :     }
    3339         776 :     pgstat_report_wait_end();
    3340             : 
    3341         776 :     if (save_errno)
    3342             :     {
    3343             :         /*
    3344             :          * If we fail to make the file, delete it to release disk space
    3345             :          */
    3346           0 :         unlink(tmppath);
    3347             : 
    3348           0 :         close(fd);
    3349             : 
    3350           0 :         errno = save_errno;
    3351             : 
    3352           0 :         ereport(ERROR,
    3353             :                 (errcode_for_file_access(),
    3354             :                  errmsg("could not write to file \"%s\": %m", tmppath)));
    3355             :     }
    3356             : 
    3357         776 :     pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
    3358         776 :     if (pg_fsync(fd) != 0)
    3359             :     {
    3360           0 :         int         save_errno = errno;
    3361             : 
    3362           0 :         close(fd);
    3363           0 :         errno = save_errno;
    3364           0 :         ereport(ERROR,
    3365             :                 (errcode_for_file_access(),
    3366             :                  errmsg("could not fsync file \"%s\": %m", tmppath)));
    3367             :     }
    3368         776 :     pgstat_report_wait_end();
    3369             : 
    3370         776 :     if (close(fd) != 0)
    3371           0 :         ereport(ERROR,
    3372             :                 (errcode_for_file_access(),
    3373             :                  errmsg("could not close file \"%s\": %m", tmppath)));
    3374             : 
    3375             :     /*
    3376             :      * Now move the segment into place with its final name.
    3377             :      *
    3378             :      * If caller didn't want to use a pre-existing file, get rid of any
    3379             :      * pre-existing file.  Otherwise, cope with possibility that someone else
    3380             :      * has created the file while we were filling ours: if so, use ours to
    3381             :      * pre-create a future log segment.
    3382             :      */
    3383         776 :     installed_segno = logsegno;
    3384             : 
    3385             :     /*
    3386             :      * XXX: What should we use as max_segno? We used to use XLOGfileslop when
    3387             :      * that was a constant, but that was always a bit dubious: normally, at a
    3388             :      * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
    3389             :      * here, it was the offset from the insert location. We can't do the
    3390             :      * normal XLOGfileslop calculation here because we don't have access to
    3391             :      * the prior checkpoint's redo location. So somewhat arbitrarily, just use
    3392             :      * CheckPointSegments.
    3393             :      */
    3394         776 :     max_segno = logsegno + CheckPointSegments;
    3395         776 :     if (!InstallXLogFileSegment(&installed_segno, tmppath,
    3396         776 :                                 *use_existent, max_segno,
    3397             :                                 use_lock))
    3398             :     {
    3399             :         /*
    3400             :          * No need for any more future segments, or InstallXLogFileSegment()
    3401             :          * failed to rename the file into place. If the rename failed, opening
    3402             :          * the file below will fail.
    3403             :          */
    3404           0 :         unlink(tmppath);
    3405             :     }
    3406             : 
    3407             :     /* Set flag to tell caller there was no existent file */
    3408         776 :     *use_existent = false;
    3409             : 
    3410             :     /* Now open original target segment (might not be file I just made) */
    3411         776 :     fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
    3412         776 :     if (fd < 0)
    3413           0 :         ereport(ERROR,
    3414             :                 (errcode_for_file_access(),
    3415             :                  errmsg("could not open file \"%s\": %m", path)));
    3416             : 
    3417         776 :     elog(DEBUG2, "done creating and filling new WAL file");
    3418             : 
    3419         776 :     return fd;
    3420             : }
    3421             : 
    3422             : /*
    3423             :  * Create a new XLOG file segment by copying a pre-existing one.
    3424             :  *
    3425             :  * destsegno: identify segment to be created.
    3426             :  *
    3427             :  * srcTLI, srcsegno: identify segment to be copied (could be from
    3428             :  *      a different timeline)
    3429             :  *
    3430             :  * upto: how much of the source file to copy (the rest is filled with
    3431             :  *      zeros)
    3432             :  *
    3433             :  * Currently this is only used during recovery, and so there are no locking
    3434             :  * considerations.  But we should be just as tense as XLogFileInit to avoid
    3435             :  * emplacing a bogus file.
    3436             :  */
    3437             : static void
    3438          36 : XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
    3439             :              int upto)
    3440             : {
    3441             :     char        path[MAXPGPATH];
    3442             :     char        tmppath[MAXPGPATH];
    3443             :     PGAlignedXLogBlock buffer;
    3444             :     int         srcfd;
    3445             :     int         fd;
    3446             :     int         nbytes;
    3447             : 
    3448             :     /*
    3449             :      * Open the source file
    3450             :      */
    3451          36 :     XLogFilePath(path, srcTLI, srcsegno, wal_segment_size);
    3452          36 :     srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
    3453          36 :     if (srcfd < 0)
    3454           0 :         ereport(ERROR,
    3455             :                 (errcode_for_file_access(),
    3456             :                  errmsg("could not open file \"%s\": %m", path)));
    3457             : 
    3458             :     /*
    3459             :      * Copy into a temp file name.
    3460             :      */
    3461          36 :     snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
    3462             : 
    3463          36 :     unlink(tmppath);
    3464             : 
    3465             :     /* do not use get_sync_bit() here --- want to fsync only at end of fill */
    3466          36 :     fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
    3467          36 :     if (fd < 0)
    3468           0 :         ereport(ERROR,
    3469             :                 (errcode_for_file_access(),
    3470             :                  errmsg("could not create file \"%s\": %m", tmppath)));
    3471             : 
    3472             :     /*
    3473             :      * Do the data copying.
    3474             :      */
    3475       73764 :     for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer))
    3476             :     {
    3477             :         int         nread;
    3478             : 
    3479       73728 :         nread = upto - nbytes;
    3480             : 
    3481             :         /*
    3482             :          * The part that is not read from the source file is filled with
    3483             :          * zeros.
    3484             :          */
    3485       73728 :         if (nread < sizeof(buffer))
    3486          36 :             memset(buffer.data, 0, sizeof(buffer));
    3487             : 
    3488       73728 :         if (nread > 0)
    3489             :         {
    3490             :             int         r;
    3491             : 
    3492        1740 :             if (nread > sizeof(buffer))
    3493        1704 :                 nread = sizeof(buffer);
    3494        1740 :             pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
    3495        1740 :             r = read(srcfd, buffer.data, nread);
    3496        1740 :             if (r != nread)
    3497             :             {
    3498           0 :                 if (r < 0)
    3499           0 :                     ereport(ERROR,
    3500             :                             (errcode_for_file_access(),
    3501             :                              errmsg("could not read file \"%s\": %m",
    3502             :                                     path)));
    3503             :                 else
    3504           0 :                     ereport(ERROR,
    3505             :                             (errcode(ERRCODE_DATA_CORRUPTED),
    3506             :                              errmsg("could not read file \"%s\": read %d of %zu",
    3507             :                                     path, r, (Size) nread)));
    3508             :             }
    3509        1740 :             pgstat_report_wait_end();
    3510             :         }
    3511       73728 :         errno = 0;
    3512       73728 :         pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
    3513       73728 :         if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer))
    3514             :         {
    3515           0 :             int         save_errno = errno;
    3516             : 
    3517             :             /*
    3518             :              * If we fail to make the file, delete it to release disk space
    3519             :              */
    3520           0 :             unlink(tmppath);
    3521             :             /* if write didn't set errno, assume problem is no disk space */
    3522           0 :             errno = save_errno ? save_errno : ENOSPC;
    3523             : 
    3524           0 :             ereport(ERROR,
    3525             :                     (errcode_for_file_access(),
    3526             :                      errmsg("could not write to file \"%s\": %m", tmppath)));
    3527             :         }
    3528       73728 :         pgstat_report_wait_end();
    3529             :     }
    3530             : 
    3531          36 :     pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
    3532          36 :     if (pg_fsync(fd) != 0)
    3533           0 :         ereport(data_sync_elevel(ERROR),
    3534             :                 (errcode_for_file_access(),
    3535             :                  errmsg("could not fsync file \"%s\": %m", tmppath)));
    3536          36 :     pgstat_report_wait_end();
    3537             : 
    3538          36 :     if (CloseTransientFile(fd) != 0)
    3539           0 :         ereport(ERROR,
    3540             :                 (errcode_for_file_access(),
    3541             :                  errmsg("could not close file \"%s\": %m", tmppath)));
    3542             : 
    3543          36 :     if (CloseTransientFile(srcfd) != 0)
    3544           0 :         ereport(ERROR,
    3545             :                 (errcode_for_file_access(),
    3546             :                  errmsg("could not close file \"%s\": %m", path)));
    3547             : 
    3548             :     /*
    3549             :      * Now move the segment into place with its final name.
    3550             :      */
    3551          36 :     if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, false))
    3552           0 :         elog(ERROR, "InstallXLogFileSegment should not have failed");
    3553          36 : }
    3554             : 
    3555             : /*
    3556             :  * Install a new XLOG segment file as a current or future log segment.
    3557             :  *
    3558             :  * This is used both to install a newly-created segment (which has a temp
    3559             :  * filename while it's being created) and to recycle an old segment.
    3560             :  *
    3561             :  * *segno: identify segment to install as (or first possible target).
    3562             :  * When find_free is true, this is modified on return to indicate the
    3563             :  * actual installation location or last segment searched.
    3564             :  *
    3565             :  * tmppath: initial name of file to install.  It will be renamed into place.
    3566             :  *
    3567             :  * find_free: if true, install the new segment at the first empty segno
    3568             :  * number at or after the passed numbers.  If false, install the new segment
    3569             :  * exactly where specified, deleting any existing segment file there.
    3570             :  *
    3571             :  * max_segno: maximum segment number to install the new file as.  Fail if no
    3572             :  * free slot is found between *segno and max_segno. (Ignored when find_free
    3573             :  * is false.)
    3574             :  *
    3575             :  * use_lock: if true, acquire ControlFileLock while moving file into
    3576             :  * place.  This should be true except during bootstrap log creation.  The
    3577             :  * caller must *not* hold the lock at call.
    3578             :  *
    3579             :  * Returns true if the file was installed successfully.  false indicates that
    3580             :  * max_segno limit was exceeded, or an error occurred while renaming the
    3581             :  * file into place.
    3582             :  */
    3583             : static bool
    3584        1544 : InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
    3585             :                        bool find_free, XLogSegNo max_segno,
    3586             :                        bool use_lock)
    3587             : {
    3588             :     char        path[MAXPGPATH];
    3589             :     struct stat stat_buf;
    3590             : 
    3591        1544 :     XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);
    3592             : 
    3593             :     /*
    3594             :      * We want to be sure that only one process does this at a time.
    3595             :      */
    3596        1544 :     if (use_lock)
    3597        1150 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    3598             : 
    3599        1544 :     if (!find_free)
    3600             :     {
    3601             :         /* Force installation: get rid of any pre-existing segment file */
    3602         394 :         durable_unlink(path, DEBUG1);
    3603             :     }
    3604             :     else
    3605             :     {
    3606             :         /* Find a free slot to put it in */
    3607       13480 :         while (stat(path, &stat_buf) == 0)
    3608             :         {
    3609       12350 :             if ((*segno) >= max_segno)
    3610             :             {
    3611             :                 /* Failed to find a free slot within specified range */
    3612          20 :                 if (use_lock)
    3613          20 :                     LWLockRelease(ControlFileLock);
    3614          20 :                 return false;
    3615             :             }
    3616       12330 :             (*segno)++;
    3617       12330 :             XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);
    3618             :         }
    3619             :     }
    3620             : 
    3621             :     /*
    3622             :      * Perform the rename using link if available, paranoidly trying to avoid
    3623             :      * overwriting an existing file (there shouldn't be one).
    3624             :      */
    3625        1524 :     if (durable_rename_excl(tmppath, path, LOG) != 0)
    3626             :     {
    3627           0 :         if (use_lock)
    3628           0 :             LWLockRelease(ControlFileLock);
    3629             :         /* durable_rename_excl already emitted log message */
    3630           0 :         return false;
    3631             :     }
    3632             : 
    3633        1524 :     if (use_lock)
    3634        1130 :         LWLockRelease(ControlFileLock);
    3635             : 
    3636        1524 :     return true;
    3637             : }
    3638             : 
    3639             : /*
    3640             :  * Open a pre-existing logfile segment for writing.
    3641             :  */
    3642             : int
    3643          10 : XLogFileOpen(XLogSegNo segno)
    3644             : {
    3645             :     char        path[MAXPGPATH];
    3646             :     int         fd;
    3647             : 
    3648          10 :     XLogFilePath(path, ThisTimeLineID, segno, wal_segment_size);
    3649             : 
    3650          10 :     fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
    3651          10 :     if (fd < 0)
    3652           0 :         ereport(PANIC,
    3653             :                 (errcode_for_file_access(),
    3654             :                  errmsg("could not open file \"%s\": %m", path)));
    3655             : 
    3656          10 :     return fd;
    3657             : }
    3658             : 
    3659             : /*
    3660             :  * Open a logfile segment for reading (during recovery).
    3661             :  *
    3662             :  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
    3663             :  * Otherwise, it's assumed to be already available in pg_wal.
    3664             :  */
    3665             : static int
    3666        2308 : XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
    3667             :              XLogSource source, bool notfoundOk)
    3668             : {
    3669             :     char        xlogfname[MAXFNAMELEN];
    3670             :     char        activitymsg[MAXFNAMELEN + 16];
    3671             :     char        path[MAXPGPATH];
    3672             :     int         fd;
    3673             : 
    3674        2308 :     XLogFileName(xlogfname, tli, segno, wal_segment_size);
    3675             : 
    3676        2308 :     switch (source)
    3677             :     {
    3678         356 :         case XLOG_FROM_ARCHIVE:
    3679             :             /* Report recovery progress in PS display */
    3680         356 :             snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
    3681             :                      xlogfname);
    3682         356 :             set_ps_display(activitymsg);
    3683             : 
    3684         356 :             restoredFromArchive = RestoreArchivedFile(path, xlogfname,
    3685             :                                                       "RECOVERYXLOG",
    3686             :                                                       wal_segment_size,
    3687             :                                                       InRedo);
    3688         356 :             if (!restoredFromArchive)
    3689         338 :                 return -1;
    3690          18 :             break;
    3691             : 
    3692        1952 :         case XLOG_FROM_PG_WAL:
    3693             :         case XLOG_FROM_STREAM:
    3694        1952 :             XLogFilePath(path, tli, segno, wal_segment_size);
    3695        1952 :             restoredFromArchive = false;
    3696        1952 :             break;
    3697             : 
    3698           0 :         default:
    3699           0 :             elog(ERROR, "invalid XLogFileRead source %d", source);
    3700             :     }
    3701             : 
    3702             :     /*
    3703             :      * If the segment was fetched from archival storage, replace the existing
    3704             :      * xlog segment (if any) with the archival version.
    3705             :      */
    3706        1970 :     if (source == XLOG_FROM_ARCHIVE)
    3707             :     {
    3708          18 :         KeepFileRestoredFromArchive(path, xlogfname);
    3709             : 
    3710             :         /*
    3711             :          * Set path to point at the new file in pg_wal.
    3712             :          */
    3713          18 :         snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
    3714             :     }
    3715             : 
    3716        1970 :     fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
    3717        1970 :     if (fd >= 0)
    3718             :     {
    3719             :         /* Success! */
    3720        1856 :         curFileTLI = tli;
    3721             : 
    3722             :         /* Report recovery progress in PS display */
    3723        1856 :         snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
    3724             :                  xlogfname);
    3725        1856 :         set_ps_display(activitymsg);
    3726             : 
    3727             :         /* Track source of data in assorted state variables */
    3728        1856 :         readSource = source;
    3729        1856 :         XLogReceiptSource = source;
    3730             :         /* In FROM_STREAM case, caller tracks receipt time, not me */
    3731        1856 :         if (source != XLOG_FROM_STREAM)
    3732        1708 :             XLogReceiptTime = GetCurrentTimestamp();
    3733             : 
    3734        1856 :         return fd;
    3735             :     }
    3736         114 :     if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
    3737           0 :         ereport(PANIC,
    3738             :                 (errcode_for_file_access(),
    3739             :                  errmsg("could not open file \"%s\": %m", path)));
    3740         114 :     return -1;
    3741             : }
    3742             : 
    3743             : /*
    3744             :  * Open a logfile segment for reading (during recovery).
    3745             :  *
    3746             :  * This version searches for the segment with any TLI listed in expectedTLEs.
    3747             :  */
    3748             : static int
    3749        1810 : XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source)
    3750             : {
    3751             :     char        path[MAXPGPATH];
    3752             :     ListCell   *cell;
    3753             :     int         fd;
    3754             :     List       *tles;
    3755             : 
    3756             :     /*
    3757             :      * Loop looking for a suitable timeline ID: we might need to read any of
    3758             :      * the timelines listed in expectedTLEs.
    3759             :      *
    3760             :      * We expect curFileTLI on entry to be the TLI of the preceding file in
    3761             :      * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
    3762             :      * to go backwards; this prevents us from picking up the wrong file when a
    3763             :      * parent timeline extends to higher segment numbers than the child we
    3764             :      * want to read.
    3765             :      *
    3766             :      * If we haven't read the timeline history file yet, read it now, so that
    3767             :      * we know which TLIs to scan.  We don't save the list in expectedTLEs,
    3768             :      * however, unless we actually find a valid segment.  That way if there is
    3769             :      * neither a timeline history file nor a WAL segment in the archive, and
    3770             :      * streaming replication is set up, we'll read the timeline history file
    3771             :      * streamed from the master when we start streaming, instead of recovering
    3772             :      * with a dummy history generated here.
    3773             :      */
    3774        1810 :     if (expectedTLEs)
    3775         420 :         tles = expectedTLEs;
    3776             :     else
    3777        1390 :         tles = readTimeLineHistory(recoveryTargetTLI);
    3778             : 
    3779        1932 :     foreach(cell, tles)
    3780             :     {
    3781        1830 :         TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
    3782        1830 :         TimeLineID  tli = hent->tli;
    3783             : 
    3784        1830 :         if (tli < curFileTLI)
    3785           0 :             break;              /* don't bother looking at too-old TLIs */
    3786             : 
    3787             :         /*
    3788             :          * Skip scanning the timeline ID that the logfile segment to read
    3789             :          * doesn't belong to
    3790             :          */
    3791        1830 :         if (hent->begin != InvalidXLogRecPtr)
    3792             :         {
    3793          70 :             XLogSegNo   beginseg = 0;
    3794             : 
    3795          70 :             XLByteToSeg(hent->begin, beginseg, wal_segment_size);
    3796             : 
    3797             :             /*
    3798             :              * The logfile segment that doesn't belong to the timeline is
    3799             :              * older or newer than the segment that the timeline started or
    3800             :              * ended at, respectively. It's sufficient to check only the
    3801             :              * starting segment of the timeline here. Since the timelines are
    3802             :              * scanned in descending order in this loop, any segments newer
    3803             :              * than the ending segment should belong to newer timeline and
    3804             :              * have already been read before. So it's not necessary to check
    3805             :              * the ending segment of the timeline here.
    3806             :              */
    3807          70 :             if (segno < beginseg)
    3808           8 :                 continue;
    3809             :         }
    3810             : 
    3811        1822 :         if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
    3812             :         {
    3813         356 :             fd = XLogFileRead(segno, emode, tli,
    3814             :                               XLOG_FROM_ARCHIVE, true);
    3815         356 :             if (fd != -1)
    3816             :             {
    3817          18 :                 elog(DEBUG1, "got WAL segment from archive");
    3818          18 :                 if (!expectedTLEs)
    3819           8 :                     expectedTLEs = tles;
    3820        1708 :                 return fd;
    3821             :             }
    3822             :         }
    3823             : 
    3824        1804 :         if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
    3825             :         {
    3826        1804 :             fd = XLogFileRead(segno, emode, tli,
    3827             :                               XLOG_FROM_PG_WAL, true);
    3828        1804 :             if (fd != -1)
    3829             :             {
    3830        1690 :                 if (!expectedTLEs)
    3831        1382 :                     expectedTLEs = tles;
    3832        1690 :                 return fd;
    3833             :             }
    3834             :         }
    3835             :     }
    3836             : 
    3837             :     /* Couldn't find it.  For simplicity, complain about front timeline */
    3838         102 :     XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
    3839         102 :     errno = ENOENT;
    3840         102 :     ereport(emode,
    3841             :             (errcode_for_file_access(),
    3842             :              errmsg("could not open file \"%s\": %m", path)));
    3843         102 :     return -1;
    3844             : }
    3845             : 
    3846             : /*
    3847             :  * Close the current logfile segment for writing.
                     :  *
                     :  * The descriptor in openLogFile must be valid on entry (asserted).  A
                     :  * failing close() is reported at PANIC level; errno is saved and restored
                     :  * around XLogFileName(), presumably so that %m in the message still
                     :  * describes the close() failure.  On success openLogFile is reset to -1
                     :  * and ReleaseExternalFD() is called.
    3848             :  */
    3849             : static void
    3850        2042 : XLogFileClose(void)
    3851             : {
    3852             :     Assert(openLogFile >= 0);
    3853             : 
    3854             :     /*
    3855             :      * WAL segment files will not be re-read in normal operation, so we advise
    3856             :      * the OS to release any cached pages.  But do not do so if WAL archiving
    3857             :      * or streaming is active, because the archiver and walsender processes
    3858             :      * could use the cache to read the WAL segment.
                     :      *
                     :      * The advice is only compiled in when both USE_POSIX_FADVISE and
                     :      * POSIX_FADV_DONTNEED are defined, and its result is deliberately
                     :      * discarded.
    3859             :      */
    3860             : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
    3861        2042 :     if (!XLogIsNeeded())
    3862          40 :         (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
    3863             : #endif
    3864             : 
    3865        2042 :     if (close(openLogFile) != 0)
    3866             :     {
    3867             :         char        xlogfname[MAXFNAMELEN];
    3868           0 :         int         save_errno = errno;  /* errno as left by close() */
    3869             : 
    3870           0 :         XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo, wal_segment_size);
    3871           0 :         errno = save_errno;
    3872           0 :         ereport(PANIC,
    3873             :                 (errcode_for_file_access(),
    3874             :                  errmsg("could not close file \"%s\": %m", xlogfname)));
    3875             :     }
    3876             : 
    3877        2042 :     openLogFile = -1;
    3878        2042 :     ReleaseExternalFD();
    3879        2042 : }
    3880             : 
    3881             : /*
    3882             :  * Preallocate log files beyond the specified log endpoint.
    3883             :  *
    3884             :  * XXX this is currently extremely conservative, since it forces only one
    3885             :  * future log segment to exist, and even that only if we are 75% done with
    3886             :  * the current one.  This is only appropriate for very low-WAL-volume systems.
    3887             :  * High-volume systems will be OK once they've built up a sufficient set of
    3888             :  * recycled log segments, but the startup transient is likely to include
    3889             :  * a lot of segment creations by foreground processes, which is not so good.
                     :  *
                     :  * The segment number comes from XLByteToPrevSeg(endptr) and the offset is
                     :  * taken for the byte just before endptr (endptr - 1).  When that offset is
                     :  * at least 75% of wal_segment_size, XLogFileInit() is called for the
                     :  * following segment with use_existent = true, and the descriptor it
                     :  * returns is closed right away.  CheckpointStats.ckpt_segs_added is
                     :  * incremented when XLogFileInit() hands back use_existent = false;
                     :  * presumably that means a new file had to be created rather than an
                     :  * existing one being found -- confirm against XLogFileInit().
    3890             :  */
    3891             : static void
    3892        3320 : PreallocXlogFiles(XLogRecPtr endptr)
    3893             : {
    3894             :     XLogSegNo   _logSegNo;
    3895             :     int         lf;
    3896             :     bool        use_existent;
    3897             :     uint64      offset;
    3898             : 
    3899        3320 :     XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
    3900        3320 :     offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
    3901        3320 :     if (offset >= (uint32) (0.75 * wal_segment_size))
    3902             :     {
    3903          34 :         _logSegNo++;
    3904          34 :         use_existent = true;
    3905          34 :         lf = XLogFileInit(_logSegNo, &use_existent, true);
    3906          34 :         close(lf);              /* result of close() is not checked */
    3907          34 :         if (!use_existent)
    3908           4 :             CheckpointStats.ckpt_segs_added++;
    3909             :     }
    3910        3320 : }
    3911             : 
    3912             : /*
    3913             :  * Throws an error if the given log segment has already been removed or
    3914             :  * recycled. The caller should only pass a segment that it knows to have
    3915             :  * existed while the server has been running, as this function always
    3916             :  * succeeds if no WAL segments have been removed since startup.
    3917             :  * 'tli' is only used in the error message.
    3918             :  *
    3919             :  * Note: this function guarantees to keep errno unchanged on return.
    3920             :  * This supports callers that use this to possibly deliver a better
    3921             :  * error message about a missing file, while still being able to throw
    3922             :  * a normal file-access error afterwards, if this does return.
                     :  *
                     :  * The test is segno <= XLogCtl->lastRemovedSegNo, with the shared value
                     :  * copied out while holding XLogCtl->info_lck.  The error is raised at
                     :  * ERROR level.
    3923             :  */
    3924             : void
    3925       45254 : CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
    3926             : {
    3927       45254 :     int         save_errno = errno;  /* errno as it was on entry */
    3928             :     XLogSegNo   lastRemovedSegNo;
    3929             : 
    3930       45254 :     SpinLockAcquire(&XLogCtl->info_lck);
    3931       45254 :     lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
    3932       45254 :     SpinLockRelease(&XLogCtl->info_lck);
    3933             : 
    3934       45254 :     if (segno <= lastRemovedSegNo)
    3935             :     {
    3936             :         char        filename[MAXFNAMELEN];
    3937             : 
    3938           0 :         XLogFileName(filename, tli, segno, wal_segment_size);
    3939           0 :         errno = save_errno;     /* put the entry-time errno back before reporting */
    3940           0 :         ereport(ERROR,
    3941             :                 (errcode_for_file_access(),
    3942             :                  errmsg("requested WAL segment %s has already been removed",
    3943             :                         filename)));
    3944             :     }
    3945       45254 :     errno = save_errno;         /* keep errno unchanged for the caller, as promised above */
    3946       45254 : }
    3947             : 
    3948             : /*
    3949             :  * Return the last WAL segment removed, or 0 if no segment has been removed
    3950             :  * since startup.
    3951             :  *
    3952             :  * NB: the result can be out of date arbitrarily fast, the caller has to deal
    3953             :  * with that.
                     :  *
                     :  * The value is a copy of XLogCtl->lastRemovedSegNo taken while holding
                     :  * XLogCtl->info_lck; the lock is released before returning.
    3954             :  */
    3955             : XLogSegNo
    3956         592 : XLogGetLastRemovedSegno(void)
    3957             : {
    3958             :     XLogSegNo   lastRemovedSegNo;
    3959             : 
    3960         592 :     SpinLockAcquire(&XLogCtl->info_lck);
    3961         592 :     lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
    3962         592 :     SpinLockRelease(&XLogCtl->info_lck);
    3963             : 
    3964         592 :     return lastRemovedSegNo;
    3965             : }
    3966             : 
    3967             : 
    3968             : /*
    3969             :  * Update the last removed segno pointer in shared memory, to reflect that the
    3970             :  * given XLOG file has been removed.
                     :  *
                     :  * 'filename' is decoded with XLogFromFileName(); the timeline it yields is
                     :  * not used here.  The shared value only ever moves forward: it is
                     :  * overwritten, under XLogCtl->info_lck, only when the decoded segment
                     :  * number is greater than the one currently stored.
    3971             :  */
    3972             : static void
    3973         718 : UpdateLastRemovedPtr(char *filename)
    3974             : {
    3975             :     uint32      tli;
    3976             :     XLogSegNo   segno;
    3977             : 
    3978         718 :     XLogFromFileName(filename, &tli, &segno, wal_segment_size);
    3979             : 
    3980         718 :     SpinLockAcquire(&XLogCtl->info_lck);
    3981         718 :     if (segno > XLogCtl->lastRemovedSegNo)
    3982         194 :         XLogCtl->lastRemovedSegNo = segno;
    3983         718 :     SpinLockRelease(&XLogCtl->info_lck);
    3984         718 : }
    3985             : 
    3986             : /*
    3987             :  * Remove all temporary log files in pg_wal
    3988             :  *
    3989             :  * This is called at the beginning of recovery after a previous crash,
    3990             :  * at a point where no other processes write fresh WAL data.
                     :  *
                     :  * A directory entry counts as temporary when its name starts with the
                     :  * nine characters xlogtemp. (including the dot); every other entry is
                     :  * skipped.  The result of unlink() is not checked, so the DEBUG2 message
                     :  * is emitted whether or not the file was actually removed.
    3991             :  */
    3992             : static void
    3993         176 : RemoveTempXlogFiles(void)
    3994             : {
    3995             :     DIR        *xldir;
    3996             :     struct dirent *xlde;
    3997             : 
    3998         176 :     elog(DEBUG2, "removing all temporary WAL segments");
    3999             : 
    4000         176 :     xldir = AllocateDir(XLOGDIR);
    4001         966 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    4002             :     {
    4003             :         char        path[MAXPGPATH];
    4004             : 
    4005         790 :         if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0)
    4006         790 :             continue;           /* not a temporary segment name */
    4007             : 
    4008           0 :         snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
    4009           0 :         unlink(path);           /* result of unlink() is not checked */
    4010           0 :         elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
    4011             :     }
    4012         176 :     FreeDir(xldir);
    4013         176 : }
    4014             : 
    4015             : /*
    4016             :  * Recycle or remove all log files older than or equal to the passed segno.
    4017             :  *
    4018             :  * endptr is current (or recent) end of xlog, and lastredoptr is the
    4019             :  * redo pointer of the last checkpoint. These are used to determine
    4020             :  * whether we want to recycle rather than delete no-longer-wanted log files.
                     :  *
                     :  * Both regular and partial segment file names (IsXLogFileName() or
                     :  * IsPartialXLogFileName()) are considered.  A candidate is handed to
                     :  * RemoveXlogFile() only if XLogArchiveCheckDone() returns true for it;
                     :  * lastredoptr and endptr are passed through to RemoveXlogFile() unchanged.
    4021             :  */
    4022             : static void
    4023        3172 : RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr)
    4024             : {
    4025             :     DIR        *xldir;
    4026             :     struct dirent *xlde;
    4027             :     char        lastoff[MAXFNAMELEN];
    4028             : 
    4029             :     /*
    4030             :      * Construct a filename of the last segment to be kept. The timeline ID
    4031             :      * doesn't matter, we ignore that in the comparison. (During recovery,
    4032             :      * ThisTimeLineID isn't set, so we can't use that.)
    4033             :      */
    4034        3172 :     XLogFileName(lastoff, 0, segno, wal_segment_size);
    4035             : 
    4036        3172 :     elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
    4037             :          lastoff);
    4038             : 
    4039        3172 :     xldir = AllocateDir(XLOGDIR);
    4040             : 
    4041       17806 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    4042             :     {
    4043             :         /* Ignore files that are not XLOG segments */
    4044       14634 :         if (!IsXLogFileName(xlde->d_name) &&
    4045        9602 :             !IsPartialXLogFileName(xlde->d_name))
    4046        9602 :             continue;
    4047             : 
    4048             :         /*
    4049             :          * We ignore the timeline part of the XLOG segment identifiers in
    4050             :          * deciding whether a segment is still needed.  This ensures that we
    4051             :          * won't prematurely remove a segment from a parent timeline. We could
    4052             :          * probably be a little more proactive about removing segments of
    4053             :          * non-parent timelines, but that would be a whole lot more
    4054             :          * complicated.
    4055             :          *
    4056             :          * We use the alphanumeric sorting property of the filenames to decide
    4057             :          * which ones are earlier than the lastoff segment.
                     :          *
                     :          * Skipping the first 8 characters of both names is what drops the
                     :          * timeline part from the comparison; a name that compares equal to
                     :          * lastoff is selected as well (<= 0).
    4058             :          */
    4059        5032 :         if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
    4060             :         {
    4061         720 :             if (XLogArchiveCheckDone(xlde->d_name))
    4062             :             {
    4063             :                 /* Update the last removed location in shared memory first */
    4064         718 :                 UpdateLastRemovedPtr(xlde->d_name);
    4065             : 
    4066         718 :                 RemoveXlogFile(xlde->d_name, lastredoptr, endptr);
    4067             :             }
    4068             :         }
    4069             :     }
    4070             : 
    4071        3172 :     FreeDir(xldir);
    4072        3172 : }
    4073             : 
    4074             : /*
    4075             :  * Remove WAL files that are not part of the given timeline's history.
    4076             :  *
    4077             :  * This is called during recovery, whenever we switch to follow a new
    4078             :  * timeline, and at the end of recovery when we create a new timeline. We
    4079             :  * wouldn't otherwise care about extra WAL files lying in pg_wal, but they
    4080             :  * might be leftover pre-allocated or recycled WAL segments on the old timeline
    4081             :  * that we haven't used yet, and contain garbage. If we just leave them in
    4082             :  * pg_wal, they will eventually be archived, and we can't let that happen.
    4083             :  * Files that belong to our timeline history are valid, because we have
    4084             :  * successfully replayed them, but from others we can't be sure.
    4085             :  *
    4086             :  * 'switchpoint' is the current point in WAL where we switch to new timeline,
    4087             :  * and 'newTLI' is the new timeline we switch to.
                     :  *
                     :  * Only names accepted by IsXLogFileName() are examined.  Selected files
                     :  * are handed to RemoveXlogFile() with an invalid lastredoptr and with
                     :  * switchpoint as endptr, unless XLogArchiveIsReady() returns true for
                     :  * them.
    4088             :  */
    4089             : static void
    4090          58 : RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
    4091             : {
    4092             :     DIR        *xldir;
    4093             :     struct dirent *xlde;
    4094             :     char        switchseg[MAXFNAMELEN];
    4095             :     XLogSegNo   endLogSegNo;
    4096             : 
    4097          58 :     XLByteToPrevSeg(switchpoint, endLogSegNo, wal_segment_size);
    4098             : 
    4099             :     /*
    4100             :      * Construct a filename of the last segment to be kept.
    4101             :      */
    4102          58 :     XLogFileName(switchseg, newTLI, endLogSegNo, wal_segment_size);
    4103             : 
    4104          58 :     elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
    4105             :          switchseg);
    4106             : 
    4107          58 :     xldir = AllocateDir(XLOGDIR);
    4108             : 
    4109         492 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    4110             :     {
    4111             :         /* Ignore files that are not XLOG segments */
    4112         434 :         if (!IsXLogFileName(xlde->d_name))
    4113         250 :             continue;
    4114             : 
    4115             :         /*
    4116             :          * Remove files that are on a timeline older than the new one we're
    4117             :          * switching to, but with a segment number >= the first segment on the
    4118             :          * new timeline.
                     :          *
                     :          * As coded, both tests are string comparisons against switchseg: the
                     :          * first 8 characters (the timeline part) must sort before those of
                     :          * switchseg, and the rest of the name (the segment part) must sort
                     :          * strictly after that of switchseg.
    4119             :          */
    4120         184 :         if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
    4121         122 :             strcmp(xlde->d_name + 8, switchseg + 8) > 0)
    4122             :         {
    4123             :             /*
    4124             :              * If the file has already been marked as .ready, however, don't
    4125             :              * remove it yet. It should be OK to remove it - files that are
    4126             :              * not part of our timeline history are not required for recovery
    4127             :              * - but seems safer to let them be archived and removed later.
    4128             :              */
    4129          22 :             if (!XLogArchiveIsReady(xlde->d_name))
    4130          22 :                 RemoveXlogFile(xlde->d_name, InvalidXLogRecPtr, switchpoint);
    4131             :         }
    4132             :     }
    4133             : 
    4134          58 :     FreeDir(xldir);
    4135          58 : }
    4136             : 
    4137             : /*
    4138             :  * Recycle or remove a log file that's no longer needed.
    4139             :  *
    4140             :  * endptr is current (or recent) end of xlog, and lastredoptr is the
    4141             :  * redo pointer of the last checkpoint. These are used to determine
    4142             :  * whether we want to recycle rather than delete no-longer-wanted log files.
    4143             :  * If lastredoptr is not known, pass invalid, and the function will recycle,
    4144             :  * somewhat arbitrarily, 10 future segments.
                     :  *
                     :  * segname is the file's name inside XLOGDIR; the full path is built here.
                     :  *
                     :  * Recycling is attempted only when wal_recycle is set, endlogSegNo <=
                     :  * recycleSegNo, lstat() reports a regular file, and
                     :  * InstallXLogFileSegment() returns true.  In every other case the file is
                     :  * removed with durable_unlink().  When wal_recycle is off, endlogSegNo is
                     :  * left unset, but it is never read because wal_recycle is tested first.
                     :  *
                     :  * XLogArchiveCleanup(segname) runs after a successful recycle or removal,
                     :  * but not when the WIN32 rename or durable_unlink() fails; those paths
                     :  * return early.
    4145             :  */
    4146             : static void
    4147         740 : RemoveXlogFile(const char *segname, XLogRecPtr lastredoptr, XLogRecPtr endptr)
    4148             : {
    4149             :     char        path[MAXPGPATH];
    4150             : #ifdef WIN32
    4151             :     char        newpath[MAXPGPATH];
    4152             : #endif
    4153             :     struct stat statbuf;
    4154             :     XLogSegNo   endlogSegNo;
    4155             :     XLogSegNo   recycleSegNo;
    4156             : 
    4157         740 :     if (wal_recycle)
    4158             :     {
    4159             :         /*
    4160             :          * Initialize info about where to try to recycle to.
    4161             :          */
    4162         740 :         XLByteToSeg(endptr, endlogSegNo, wal_segment_size);
    4163         740 :         if (lastredoptr == InvalidXLogRecPtr)
    4164          22 :             recycleSegNo = endlogSegNo + 10;
    4165             :         else
    4166         718 :             recycleSegNo = XLOGfileslop(lastredoptr);
    4167             :     }
    4168             :     else
    4169           0 :         recycleSegNo = 0;       /* keep compiler quiet */
    4170             : 
    4171         740 :     snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);
    4172             : 
    4173             :     /*
    4174             :      * Before deleting the file, see if it can be recycled as a future log
    4175             :      * segment. Only recycle normal files, pg_standby for example can create
    4176             :      * symbolic links pointing to a separate archive directory.
    4177             :      */
    4178         740 :     if (wal_recycle &&
    4179        1472 :         endlogSegNo <= recycleSegNo &&
    4180        2196 :         lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
    4181         732 :         InstallXLogFileSegment(&endlogSegNo, path,
    4182             :                                true, recycleSegNo, true))
    4183             :     {
    4184         712 :         ereport(DEBUG2,
    4185             :                 (errmsg("recycled write-ahead log file \"%s\"",
    4186             :                         segname)));
    4187         712 :         CheckpointStats.ckpt_segs_recycled++;
    4188             :         /*
                     :          * Needn't recheck that slot on future iterations.
                     :          *
                     :          * NOTE(review): endlogSegNo is a local variable of this function and
                     :          * is recomputed from endptr on every call, so this increment cannot
                     :          * influence a later call -- confirm whether that is intended.
                     :          */
    4189         712 :         endlogSegNo++;
    4190             :     }
    4191             :     else
    4192             :     {
    4193             :         /* No need for any more future segments... */
    4194             :         int         rc;
    4195             : 
    4196          28 :         ereport(DEBUG2,
    4197             :                 (errmsg("removing write-ahead log file \"%s\"",
    4198             :                         segname)));
    4199             : 
    4200             : #ifdef WIN32
    4201             : 
    4202             :         /*
    4203             :          * On Windows, if another process (e.g. another backend) holds the file
    4204             :          * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
    4205             :          * will still show up in directory listing until the last handle is
    4206             :          * closed. To avoid confusing the lingering deleted file for a live
    4207             :          * WAL file that needs to be archived, rename it before deleting it.
    4208             :          *
    4209             :          * If another process holds the file open without FILE_SHARE_DELETE
    4210             :          * flag, rename will fail. We'll try again at the next checkpoint.
    4211             :          */
    4212             :         snprintf(newpath, MAXPGPATH, "%s.deleted", path);
    4213             :         if (rename(path, newpath) != 0)
    4214             :         {
    4215             :             ereport(LOG,
    4216             :                     (errcode_for_file_access(),
    4217             :                      errmsg("could not rename file \"%s\": %m",
    4218             :                             path)));
    4219             :             return;
    4220             :         }
    4221             :         rc = durable_unlink(newpath, LOG);
    4222             : #else
    4223          28 :         rc = durable_unlink(path, LOG);
    4224             : #endif
    4225          28 :         if (rc != 0)
    4226             :         {
    4227             :             /* Message already logged by durable_unlink() */
    4228           0 :             return;
    4229             :         }
    4230          28 :         CheckpointStats.ckpt_segs_removed++;
    4231             :     }
    4232             : 
    4233         740 :     XLogArchiveCleanup(segname);
    4234             : }
    4235             : 
    4236             : /*
    4237             :  * Verify whether pg_wal and pg_wal/archive_status exist.
    4238             :  * If the latter does not exist, recreate it.
    4239             :  *
    4240             :  * It is not the goal of this function to verify the contents of these
    4241             :  * directories, but to help in cases where someone has performed a cluster
    4242             :  * copy for PITR purposes but omitted pg_wal from the copy.
    4243             :  *
    4244             :  * We could also recreate pg_wal if it doesn't exist, but a deliberate
    4245             :  * policy decision was made not to.  It is fairly common for pg_wal to be
    4246             :  * a symlink, and if that was the DBA's intent then automatically making a
    4247             :  * plain directory would result in degraded performance with no notice.
                     :  *
                     :  * Every failure here is reported at FATAL level.  Note that any stat()
                     :  * failure on archive_status, not only a missing directory, leads to the
                     :  * attempt to create it with MakePGDirectory().
    4248             :  */
    4249             : static void
    4250        1390 : ValidateXLOGDirectoryStructure(void)
    4251             : {
    4252             :     char        path[MAXPGPATH];
    4253             :     struct stat stat_buf;
    4254             : 
    4255             :     /* Check for pg_wal; if it doesn't exist, error out */
    4256        1390 :     if (stat(XLOGDIR, &stat_buf) != 0 ||
    4257        1390 :         !S_ISDIR(stat_buf.st_mode))
    4258           0 :         ereport(FATAL,
    4259             :                 (errmsg("required WAL directory \"%s\" does not exist",
    4260             :                         XLOGDIR)));
    4261             : 
    4262             :     /* Check for archive_status */
    4263        1390 :     snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
    4264        1390 :     if (stat(path, &stat_buf) == 0)
    4265             :     {
    4266             :         /* Check for weird cases where it exists but isn't a directory */
    4267        1390 :         if (!S_ISDIR(stat_buf.st_mode))
    4268           0 :             ereport(FATAL,
    4269             :                     (errmsg("required WAL directory \"%s\" does not exist",
    4270             :                             path)));
    4271             :     }
    4272             :     else                        /* stat() failed, for whatever reason */
    4273             :     {
    4274           0 :         ereport(LOG,
    4275             :                 (errmsg("creating missing WAL directory \"%s\"", path)));
    4276           0 :         if (MakePGDirectory(path) < 0)
    4277           0 :             ereport(FATAL,
    4278             :                     (errmsg("could not create missing directory \"%s\": %m",
    4279             :                             path)));
    4280             :     }
    4281        1390 : }
    4282             : 
    4283             : /*
    4284             :  * Remove previous backup history files.  This also retries creation of
    4285             :  * .ready files for any backup history files for which XLogArchiveNotify
    4286             :  * failed earlier.
                     :  *
                     :  * Every entry of XLOGDIR whose name satisfies IsBackupHistoryFileName()
                     :  * is checked with XLogArchiveCheckDone(); only when that returns true is
                     :  * the file unlinked and XLogArchiveCleanup() called for it.  The result
                     :  * of unlink() is not checked.
    4287             :  */
    4288             : static void
    4289         136 : CleanupBackupHistory(void)
    4290             : {
    4291             :     DIR        *xldir;
    4292             :     struct dirent *xlde;
    4293             :     char        path[MAXPGPATH + sizeof(XLOGDIR)];
    4294             : 
    4295         136 :     xldir = AllocateDir(XLOGDIR);
    4296             : 
    4297        1040 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    4298             :     {
    4299         904 :         if (IsBackupHistoryFileName(xlde->d_name))
    4300             :         {
    4301         136 :             if (XLogArchiveCheckDone(xlde->d_name))
    4302             :             {
    4303         130 :                 elog(DEBUG2, "removing WAL backup history file \"%s\"",
    4304             :                      xlde->d_name);
    4305         130 :                 snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
    4306         130 :                 unlink(path);   /* result of unlink() is not checked */
    4307         130 :                 XLogArchiveCleanup(xlde->d_name);
    4308             :             }
    4309             :         }
    4310             :     }
    4311             : 
    4312         136 :     FreeDir(xldir);
    4313         136 : }
    4314             : 
    4315             : /*
    4316             :  * Attempt to read the next XLOG record.
    4317             :  *
    4318             :  * Before first call, the reader needs to be positioned to the first record
    4319             :  * by calling XLogBeginRead().
    4320             :  *
    4321             :  * If no valid record is available, returns NULL, or fails if emode is PANIC.
    4322             :  * (emode must be either PANIC, LOG). In standby mode, retries until a valid
    4323             :  * record is available.
    4324             :  */
    4325             : static XLogRecord *
    4326      576502 : ReadRecord(XLogReaderState *xlogreader, int emode,
    4327             :            bool fetching_ckpt)
    4328             : {
    4329             :     XLogRecord *record;         /* result of the latest XLogReadRecord() call */
    4330      576502 :     XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
    4331             :     /* Loops until a valid record can be returned, or returns NULL on giving up. */
    4332             :     /* Pass the caller's options through to XLogPageRead via the reader's private data */
    4333      576502 :     private->fetching_ckpt = fetching_ckpt;
    4334      576502 :     private->emode = emode;
    4335      576502 :     private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
    4336             : 
    4337             :     /* This is the first attempt to read this page, so no source has failed yet. */
    4338      576502 :     lastSourceFailed = false;
    4339             : 
    4340             :     for (;;)                    /* left only via the return statements below */
    4341          92 :     {
    4342             :         char       *errormsg;   /* out parameter of XLogReadRecord() */
    4343             : 
    4344      576594 :         record = XLogReadRecord(xlogreader, &errormsg);
    4345      576562 :         ReadRecPtr = xlogreader->ReadRecPtr;
    4346      576562 :         EndRecPtr = xlogreader->EndRecPtr;
    4347      576562 :         if (record == NULL)
    4348             :         {
    4349         256 :             if (readFile >= 0)
    4350             :             {
    4351         232 :                 close(readFile);
    4352         232 :                 readFile = -1;
    4353             :             }
    4354             : 
    4355             :             /*
    4356             :              * We only end up here without a message when XLogPageRead()
    4357             :              * failed - in that case we already logged something. In
    4358             :              * StandbyMode that only happens if we have been triggered, so we
    4359             :              * shouldn't loop anymore in that case.
    4360             :              */
    4361         256 :             if (errormsg)
    4362         232 :                 ereport(emode_for_corrupt_record(emode, EndRecPtr),
    4363             :                         (errmsg_internal("%s", errormsg) /* already translated */ ));
    4364             :         }
    4365             : 
    4366             :         /*
    4367             :          * A record was read; reject it if the page TLI is not in expectedTLEs.
    4368             :          */
    4369      576306 :         else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
    4370             :         {
    4371             :             char        fname[MAXFNAMELEN];
    4372             :             XLogSegNo   segno;
    4373             :             int32       offset;
    4374             : 
    4375           0 :             XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
    4376           0 :             offset = XLogSegmentOffset(xlogreader->latestPagePtr,
    4377             :                                        wal_segment_size);
    4378           0 :             XLogFileName(fname, xlogreader->seg.ws_tli, segno,
    4379             :                          wal_segment_size);
    4380           0 :             ereport(emode_for_corrupt_record(emode, EndRecPtr),
    4381             :                     (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
    4382             :                             xlogreader->latestPageTLI,
    4383             :                             fname,
    4384             :                             offset)));
    4385           0 :             record = NULL;      /* handle it below like a failed read */
    4386             :         }
    4387             : 
    4388      576562 :         if (record)
    4389             :         {
    4390             :             /* Great, got a record */
    4391      576470 :             return record;
    4392             :         }
    4393             :         else
    4394             :         {
    4395             :             /* No valid record available from this source */
    4396         256 :             lastSourceFailed = true;
    4397             : 
    4398             :             /*
    4399             :              * If archive recovery was requested, but we were still doing
    4400             :              * crash recovery, switch to archive recovery and retry using the
    4401             :              * offline archive. We have now replayed all the valid WAL in
    4402             :              * pg_wal, so we are presumably now consistent.
    4403             :              *
    4404             :              * We require that there's at least some valid WAL present in
    4405             :              * pg_wal, however (!fetching_ckpt).  We could recover using the
    4406             :              * WAL from the archive, even if pg_wal is completely empty, but
    4407             :              * we'd have no idea how far we'd have to replay to reach
    4408             :              * consistency.  So err on the safe side and give up.
    4409             :              */
    4410         256 :             if (!InArchiveRecovery && ArchiveRecoveryRequested &&
    4411           2 :                 !fetching_ckpt)
    4412             :             {
    4413           2 :                 ereport(DEBUG1,
    4414             :                         (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
    4415           2 :                 InArchiveRecovery = true;
    4416           2 :                 if (StandbyModeRequested)
    4417           2 :                     StandbyMode = true;
    4418             : 
    4419             :                 /* raise minRecoveryPoint to EndRecPtr if it is currently behind it */
    4420           2 :                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    4421           2 :                 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
    4422           2 :                 if (ControlFile->minRecoveryPoint < EndRecPtr)
    4423             :                 {
    4424           2 :                     ControlFile->minRecoveryPoint = EndRecPtr;
    4425           2 :                     ControlFile->minRecoveryPointTLI = ThisTimeLineID;
    4426             :                 }
    4427             :                 /* update the local copies of both values */
    4428           2 :                 minRecoveryPoint = ControlFile->minRecoveryPoint;
    4429           2 :                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    4430             : 
    4431             :                 /*
    4432             :                  * The startup process can update its local copy of
    4433             :                  * minRecoveryPoint from this point.
    4434             :                  */
    4435           2 :                 updateMinRecoveryPoint = true;
    4436             : 
    4437           2 :                 UpdateControlFile();
    4438             : 
    4439             :                 /*
    4440             :                  * We update SharedRecoveryState while holding the lock on
    4441             :                  * ControlFileLock so both states are consistent in shared
    4442             :                  * memory.
    4443             :                  */
    4444           2 :                 SpinLockAcquire(&XLogCtl->info_lck);
    4445           2 :                 XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
    4446           2 :                 SpinLockRelease(&XLogCtl->info_lck);
    4447             : 
    4448           2 :                 LWLockRelease(ControlFileLock);
    4449             : 
    4450           2 :                 CheckRecoveryConsistency();
    4451             : 
    4452             :                 /*
    4453             :                  * Before we retry, reset lastSourceFailed and currentSource
    4454             :                  * so that we will check the archive next.
    4455             :                  */
    4456           2 :                 lastSourceFailed = false;
    4457           2 :                 currentSource = XLOG_FROM_ANY;
    4458             : 
    4459          92 :                 continue;
    4460             :             }
    4461             : 
    4462             :             /* In standby mode, retry unless CheckForStandbyTrigger() returns true. Otherwise, give up. */
    4463         254 :             if (StandbyMode && !CheckForStandbyTrigger())
    4464          90 :                 continue;
    4465             :             else
    4466         164 :                 return NULL;
    4467             :         }
    4468             :     }
    4469             : }
    4470             : 
    4471             : /*
    4472             :  * Scan for new timelines that might have appeared in the archive since we
    4473             :  * started recovery.  If the newest one lists the current recovery target
    4474             :  * in its history and forked off at or after EndRecPtr, switch the recovery
    4475             :  * target TLI and expectedTLEs to it and return 'true'.  In every other
    4476             :  * case the target is left unchanged and 'false' is returned.
    4477             :  */
    4478             : static bool
    4479         142 : rescanLatestTimeLine(void)
    4480             : {
    4481             :     List       *newExpectedTLEs;    /* history read for the candidate timeline */
    4482             :     bool        found;              /* is recoveryTargetTLI in that history? */
    4483             :     ListCell   *cell;
    4484             :     TimeLineID  newtarget;          /* result of findNewestTimeLine() */
    4485         142 :     TimeLineID  oldtarget = recoveryTargetTLI;
    4486         142 :     TimeLineHistoryEntry *currentTle = NULL;    /* entry for recoveryTargetTLI once found */
    4487             : 
    4488         142 :     newtarget = findNewestTimeLine(recoveryTargetTLI);
    4489         142 :     if (newtarget == recoveryTargetTLI)
    4490             :     {
    4491             :         /* No new timelines found */
    4492         136 :         return false;
    4493             :     }
    4494             : 
    4495             :     /*
    4496             :      * Determine the list of expected TLIs for the new TLI
    4497             :      */
    4498             : 
    4499           6 :     newExpectedTLEs = readTimeLineHistory(newtarget);
    4500             : 
    4501             :     /*
    4502             :      * If the current timeline is not part of the history of the new timeline,
    4503             :      * we cannot proceed to it.
    4504             :      */
    4505           6 :     found = false;
    4506          12 :     foreach(cell, newExpectedTLEs)
    4507             :     {
    4508          12 :         currentTle = (TimeLineHistoryEntry *) lfirst(cell);
    4509             : 
    4510          12 :         if (currentTle->tli == recoveryTargetTLI)
    4511             :         {
    4512           6 :             found = true;
    4513           6 :             break;
    4514             :         }
    4515             :     }
    4516           6 :     if (!found)
    4517             :     {
    4518           0 :         ereport(LOG,
    4519             :                 (errmsg("new timeline %u is not a child of database system timeline %u",
    4520             :                         newtarget,
    4521             :                         ThisTimeLineID)));
    4522           0 :         return false;           /* NOTE(review): newExpectedTLEs is not freed on either 'return false' path - confirm intended */
    4523             :     }
    4524             : 
    4525             :     /*
    4526             :      * The current timeline was found in the history file, but check that the
    4527             :      * next timeline was forked off from it at or after the current recovery
    4528             :      * location (EndRecPtr); an entry ending before EndRecPtr is rejected.
    4529             :      */
    4530           6 :     if (currentTle->end < EndRecPtr)
    4531             :     {
    4532           0 :         ereport(LOG,
    4533             :                 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
    4534             :                         newtarget,
    4535             :                         ThisTimeLineID,
    4536             :                         (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
    4537           0 :         return false;
    4538             :     }
    4539             : 
    4540             :     /* The new timeline history seems valid. Switch target and replace expectedTLEs */
    4541           6 :     recoveryTargetTLI = newtarget;
    4542           6 :     list_free_deep(expectedTLEs);
    4543           6 :     expectedTLEs = newExpectedTLEs;
    4544             : 
    4545             :     /*
    4546             :      * As in StartupXLOG(), try to ensure we have all the history files
    4547             :      * between the old target and new target in pg_wal.
    4548             :      */
    4549           6 :     restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
    4550             : 
    4551           6 :     ereport(LOG,
    4552             :             (errmsg("new target timeline is %u",
    4553             :                     recoveryTargetTLI)));
    4554             : 
    4555           6 :     return true;
    4556             : }
    4557             : 
    4558             : /*
    4559             :  * I/O routines for pg_control
    4560             :  *
    4561             :  * *ControlFile is a buffer in shared memory that holds an image of the
    4562             :  * contents of pg_control.  WriteControlFile() initializes pg_control
    4563             :  * given a preloaded buffer, ReadControlFile() loads the buffer from
    4564             :  * the pg_control file (during postmaster or standalone-backend startup),
    4565             :  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
    4566             :  * InitControlFile() fills the buffer with initial values.
    4567             :  *
    4568             :  * For simplicity, WriteControlFile() initializes the fields of pg_control
    4569             :  * that are related to checking backend/database compatibility, and
    4570             :  * ReadControlFile() verifies they are correct.  We could split out the
    4571             :  * I/O and compatibility-check functions, but there seems no need currently.
    4572             :  */
    4573             : 
    4574             : static void
    4575         358 : InitControlFile(uint64 sysidentifier)
    4576             : {
    4577             :     char        mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
    4578             :     /* Fills *ControlFile in memory only; nothing is written to disk here. */
    4579             :     /*
    4580             :      * Generate a random nonce. This is used for authentication requests that
    4581             :      * will fail because the user does not exist. The nonce is used to create
    4582             :      * a genuine-looking password challenge for the non-existent user, in lieu
    4583             :      * of an actual stored password.
    4584             :      */
    4585         358 :     if (!pg_strong_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
    4586           0 :         ereport(PANIC,
    4587             :                 (errcode(ERRCODE_INTERNAL_ERROR),
    4588             :                  errmsg("could not generate secret authorization token")));
    4589             : 
    4590         358 :     memset(ControlFile, 0, sizeof(ControlFileData));    /* start from an all-zero image */
    4591             :     /* Initialize pg_control status fields, including the nonce generated above */
    4592         358 :     ControlFile->system_identifier = sysidentifier;
    4593         358 :     memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
    4594         358 :     ControlFile->state = DB_SHUTDOWNED;
    4595         358 :     ControlFile->unloggedLSN = FirstNormalUnloggedLSN;
    4596             : 
    4597             :     /* Record the current settings of parameters that matter when replaying WAL */
    4598         358 :     ControlFile->MaxConnections = MaxConnections;
    4599         358 :     ControlFile->max_worker_processes = max_worker_processes;
    4600         358 :     ControlFile->max_wal_senders = max_wal_senders;
    4601         358 :     ControlFile->max_prepared_xacts = max_prepared_xacts;
    4602         358 :     ControlFile->max_locks_per_xact = max_locks_per_xact;
    4603         358 :     ControlFile->wal_level = wal_level;
    4604         358 :     ControlFile->wal_log_hints = wal_log_hints;
    4605         358 :     ControlFile->track_commit_timestamp = track_commit_timestamp;
    4606         358 :     ControlFile->data_checksum_version = bootstrap_data_checksum_version;
    4607         358 : }
    4608             : 
    4609             : static void
    4610         358 : WriteControlFile(void)
    4611             : {
    4612             :     int         fd;
    4613             :     char        buffer[PG_CONTROL_FILE_SIZE];   /* need not be aligned */
    4614             :     /* Sets the version/compatibility fields and CRC, then creates pg_control from *ControlFile. */
    4615             :     /*
    4616             :      * Ensure that the size of the pg_control data structure is sane.  See the
    4617             :      * comments for these symbols in pg_control.h.
    4618             :      */
    4619             :     StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_MAX_SAFE_SIZE,
    4620             :                      "pg_control is too large for atomic disk writes");
    4621             :     StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_FILE_SIZE,
    4622             :                      "sizeof(ControlFileData) exceeds PG_CONTROL_FILE_SIZE");
    4623             : 
    4624             :     /*
    4625             :      * Initialize version and compatibility-check fields
    4626             :      */
    4627         358 :     ControlFile->pg_control_version = PG_CONTROL_VERSION;
    4628         358 :     ControlFile->catalog_version_no = CATALOG_VERSION_NO;
    4629             : 
    4630         358 :     ControlFile->maxAlign = MAXIMUM_ALIGNOF;
    4631         358 :     ControlFile->floatFormat = FLOATFORMAT_VALUE;
    4632             : 
    4633         358 :     ControlFile->blcksz = BLCKSZ;
    4634         358 :     ControlFile->relseg_size = RELSEG_SIZE;
    4635         358 :     ControlFile->xlog_blcksz = XLOG_BLCKSZ;
    4636         358 :     ControlFile->xlog_seg_size = wal_segment_size;
    4637             : 
    4638         358 :     ControlFile->nameDataLen = NAMEDATALEN;
    4639         358 :     ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
    4640             : 
    4641         358 :     ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
    4642         358 :     ControlFile->loblksize = LOBLKSIZE;
    4643             : 
    4644         358 :     ControlFile->float8ByVal = FLOAT8PASSBYVAL;
    4645             : 
    4646             :     /* Contents are protected with a CRC over every byte preceding the crc field */
    4647         358 :     INIT_CRC32C(ControlFile->crc);
    4648         358 :     COMP_CRC32C(ControlFile->crc,
    4649             :                 (char *) ControlFile,
    4650             :                 offsetof(ControlFileData, crc));
    4651         358 :     FIN_CRC32C(ControlFile->crc);
    4652             : 
    4653             :     /*
    4654             :      * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
    4655             :      * the excess over sizeof(ControlFileData).  This reduces the odds of
    4656             :      * premature-EOF errors when reading pg_control.  We'll still fail when we
    4657             :      * check the contents of the file, but hopefully with a more specific
    4658             :      * error than "couldn't read pg_control".
    4659             :      */
    4660         358 :     memset(buffer, 0, PG_CONTROL_FILE_SIZE);
    4661         358 :     memcpy(buffer, ControlFile, sizeof(ControlFileData));
    4662             : 
    4663         358 :     fd = BasicOpenFile(XLOG_CONTROL_FILE,
    4664             :                        O_RDWR | O_CREAT | O_EXCL | PG_BINARY);  /* O_EXCL: fail if the file already exists */
    4665         358 :     if (fd < 0)
    4666           0 :         ereport(PANIC,
    4667             :                 (errcode_for_file_access(),
    4668             :                  errmsg("could not create file \"%s\": %m",
    4669             :                         XLOG_CONTROL_FILE)));
    4670             : 
    4671         358 :     errno = 0;                  /* lets a short write that leaves errno unset be detected below */
    4672         358 :     pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
    4673         358 :     if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
    4674             :     {
    4675             :         /* if write didn't set errno, assume problem is no disk space */
    4676           0 :         if (errno == 0)
    4677           0 :             errno = ENOSPC;
    4678           0 :         ereport(PANIC,
    4679             :                 (errcode_for_file_access(),
    4680             :                  errmsg("could not write to file \"%s\": %m",
    4681             :                         XLOG_CONTROL_FILE)));
    4682             :     }
    4683         358 :     pgstat_report_wait_end();
    4684             : 
    4685         358 :     pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
    4686         358 :     if (pg_fsync(fd) != 0)
    4687           0 :         ereport(PANIC,
    4688             :                 (errcode_for_file_access(),
    4689             :                  errmsg("could not fsync file \"%s\": %m",
    4690             :                         XLOG_CONTROL_FILE)));
    4691         358 :     pgstat_report_wait_end();
    4692             : 
    4693         358 :     if (close(fd) != 0)
    4694           0 :         ereport(PANIC,
    4695             :                 (errcode_for_file_access(),
    4696             :                  errmsg("could not close file \"%s\": %m",
    4697             :                         XLOG_CONTROL_FILE)));
    4698         358 : }
    4699             : 
    4700             : static void
    4701        1464 : ReadControlFile(void)
    4702             : {
    4703             :     pg_crc32c   crc;
    4704             :     int         fd;
    4705             :     static char wal_segsz_str[20];
    4706             :     int         r;
    4707             : 
    4708             :     /*
    4709             :      * Read data...
    4710             :      */
    4711        1464 :     fd = BasicOpenFile(XLOG_CONTROL_FILE,
    4712             :                        O_RDWR | PG_BINARY);
    4713        1464 :     if (fd < 0)
    4714           0 :         ereport(PANIC,
    4715             :                 (errcode_for_file_access(),
    4716             :                  errmsg("could not open file \"%s\": %m",
    4717             :                         XLOG_CONTROL_FILE)));
    4718             : 
    4719        1464 :     pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
    4720        1464 :     r = read(fd, ControlFile, sizeof(ControlFileData));
    4721        1464 :     if (r != sizeof(ControlFileData))
    4722             :     {
    4723           0 :         if (r < 0)
    4724           0 :             ereport(PANIC,
    4725             :                     (errcode_for_file_access(),
    4726             :                      errmsg("could not read file \"%s\": %m",
    4727             :                             XLOG_CONTROL_FILE)));
    4728             :         else
    4729           0 :             ereport(PANIC,
    4730             :                     (errcode(ERRCODE_DATA_CORRUPTED),
    4731             :                      errmsg("could not read file \"%s\": read %d of %zu",
    4732             :                             XLOG_CONTROL_FILE, r, sizeof(ControlFileData))));
    4733             :     }
    4734        1464 :     pgstat_report_wait_end();
    4735             : 
    4736        1464 :     close(fd);
    4737             : 
    4738             :     /*
    4739             :      * Check for expected pg_control format version.  If this is wrong, the
    4740             :      * CRC check will likely fail because we'll be checking the wrong number
    4741             :      * of bytes.  Complaining about wrong version will probably be more
    4742             :      * enlightening than complaining about wrong CRC.
    4743             :      */
    4744             : 
    4745        1464 :     if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
    4746           0 :         ereport(FATAL,
    4747             :                 (errmsg("database files are incompatible with server"),
    4748             :                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
    4749             :                            " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
    4750             :                            ControlFile->pg_control_version, ControlFile->pg_control_version,
    4751             :                            PG_CONTROL_VERSION, PG_CONTROL_VERSION),
    4752             :                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
    4753             : 
    4754        1464 :     if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
    4755           0 :         ereport(FATAL,
    4756             :                 (errmsg("database files are incompatible with server"),
    4757             :                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
    4758             :                            " but the server was compiled with PG_CONTROL_VERSION %d.",
    4759             :                            ControlFile->pg_control_version, PG_CONTROL_VERSION),
    4760             :                  errhint("It looks like you need to initdb.")));
    4761             : 
    4762             :     /* Now check the CRC. */
    4763        1464 :     INIT_CRC32C(crc);
    4764        1464 :     COMP_CRC32C(crc,
    4765             :                 (char *) ControlFile,
    4766             :                 offsetof(ControlFileData, crc));
    4767        1464 :     FIN_CRC32C(crc);
    4768             : 
    4769        1464 :     if (!EQ_CRC32C(crc, ControlFile->crc))
    4770           0 :         ereport(FATAL,
    4771             :                 (errmsg("incorrect checksum in control file")));
    4772             : 
    4773             :     /*
    4774             :      * Do compatibility checking immediately.  If the database isn't
    4775             :      * compatible with the backend executable, we want to abort before we can
    4776             :      * possibly do any damage.
    4777             :      */
    4778        1464 :     if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
    4779           0 :         ereport(FATAL,
    4780             :                 (errmsg("database files are incompatible with server"),
    4781             :                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
    4782             :                            " but the server was compiled with CATALOG_VERSION_NO %d.",
    4783             :                            ControlFile->catalog_version_no, CATALOG_VERSION_NO),
    4784             :                  errhint("It looks like you need to initdb.")));
    4785        1464 :     if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
    4786           0 :         ereport(FATAL,
    4787             :                 (errmsg("database files are incompatible with server"),
    4788             :                  errdetail("The database cluster was initialized with MAXALIGN %d,"
    4789             :                            " but the server was compiled with MAXALIGN %d.",
    4790             :                            ControlFile->maxAlign, MAXIMUM_ALIGNOF),
    4791             :                  errhint("It looks like you need to initdb.")));
    4792        1464 :     if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
    4793           0 :         ereport(FATAL,
    4794             :                 (errmsg("database files are incompatible with server"),
    4795             :                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
    4796             :                  errhint("It looks like you need to initdb.")));
    4797        1464 :     if (ControlFile->blcksz != BLCKSZ)
    4798           0 :         ereport(FATAL,
    4799             :                 (errmsg("database files are incompatible with server"),
    4800             :                  errdetail("The database cluster was initialized with BLCKSZ %d,"
    4801             :                            " but the server was compiled with BLCKSZ %d.",
    4802             :                            ControlFile->blcksz, BLCKSZ),
    4803             :                  errhint("It looks like you need to recompile or initdb.")));
    4804        1464 :     if (ControlFile->relseg_size != RELSEG_SIZE)
    4805           0 :         ereport(FATAL,
    4806             :                 (errmsg("database files are incompatible with server"),
    4807             :                  errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
    4808             :                            " but the server was compiled with RELSEG_SIZE %d.",
    4809             :                            ControlFile->relseg_size, RELSEG_SIZE),
    4810             :                  errhint("It looks like you need to recompile or initdb.")));
    4811        1464 :     if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
    4812           0 :         ereport(FATAL,
    4813             :                 (errmsg("database files are incompatible with server"),
    4814             :                  errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
    4815             :                            " but the server was compiled with XLOG_BLCKSZ %d.",
    4816             :                            ControlFile->xlog_blcksz, XLOG_BLCKSZ),
    4817             :                  errhint("It looks like you need to recompile or initdb.")));
    4818        1464 :     if (ControlFile->nameDataLen != NAMEDATALEN)
    4819           0 :         ereport(FATAL,
    4820             :                 (errmsg("database files are incompatible with server"),
    4821             :                  errdetail("The database cluster was initialized with NAMEDATALEN %d,"
    4822             :                            " but the server was compiled with NAMEDATALEN %d.",
    4823             :                            ControlFile->nameDataLen, NAMEDATALEN),
    4824             :                  errhint("It looks like you need to recompile or initdb.")));
    4825        1464 :     if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
    4826           0 :         ereport(FATAL,
    4827             :                 (errmsg("database files are incompatible with server"),
    4828             :                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
    4829             :                            " but the server was compiled with INDEX_MAX_KEYS %d.",
    4830             :                            ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
    4831             :                  errhint("It looks like you need to recompile or initdb.")));
    4832        1464 :     if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
    4833           0 :         ereport(FATAL,
    4834             :                 (errmsg("database files are incompatible with server"),
    4835             :                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
    4836             :                            " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
    4837             :                            ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
    4838             :                  errhint("It looks like you need to recompile or initdb.")));
    4839        1464 :     if (ControlFile->loblksize != LOBLKSIZE)
    4840           0 :         ereport(FATAL,
    4841             :                 (errmsg("database files are incompatible with server"),
    4842             :                  errdetail("The database cluster was initialized with LOBLKSIZE %d,"
    4843             :                            " but the server was compiled with LOBLKSIZE %d.",
    4844             :                            ControlFile->loblksize, (int) LOBLKSIZE),
    4845             :                  errhint("It looks like you need to recompile or initdb.")));
    4846             : 
    4847             : #ifdef USE_FLOAT8_BYVAL
    4848        1464 :     if (ControlFile->float8ByVal != true)
    4849           0 :         ereport(FATAL,
    4850             :                 (errmsg("database files are incompatible with server"),
    4851             :                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
    4852             :                            " but the server was compiled with USE_FLOAT8_BYVAL."),
    4853             :                  errhint("It looks like you need to recompile or initdb.")));
    4854             : #else
    4855             :     if (ControlFile->float8ByVal != false)
    4856             :         ereport(FATAL,
    4857             :                 (errmsg("database files are incompatible with server"),
    4858             :                  errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
    4859             :                            " but the server was compiled without USE_FLOAT8_BYVAL."),
    4860             :                  errhint("It looks like you need to recompile or initdb.")));
    4861             : #endif
    4862             : 
    4863        1464 :     wal_segment_size = ControlFile->xlog_seg_size;
    4864             : 
    4865        1464 :     if (!IsValidWalSegSize(wal_segment_size))
    4866           0 :         ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4867             :                         errmsg_plural("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte",
    4868             :                                       "WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes",
    4869             :                                       wal_segment_size,
    4870             :                                       wal_segment_size)));
    4871             : 
    4872        1464 :     snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
    4873        1464 :     SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
    4874             :                     PGC_S_OVERRIDE);
    4875             : 
    4876             :     /* check and update variables dependent on wal_segment_size */
    4877        1464 :     if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2)
    4878           0 :         ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4879             :                         errmsg("\"min_wal_size\" must be at least twice \"wal_segment_size\"")));
    4880             : 
    4881        1464 :     if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2)
    4882           0 :         ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4883             :                         errmsg("\"max_wal_size\" must be at least twice \"wal_segment_size\"")));
    4884             : 
    4885        1464 :     UsableBytesInSegment =
    4886        1464 :         (wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) -
    4887             :         (SizeOfXLogLongPHD - SizeOfXLogShortPHD);
    4888             : 
    4889        1464 :     CalculateCheckpointSegments();
    4890             : 
    4891             :     /* Make the initdb settings visible as GUC variables, too */
    4892        1464 :     SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
    4893             :                     PGC_INTERNAL, PGC_S_OVERRIDE);
    4894        1464 : }
    4895             : 
    4896             : /*
    4897             :  * Utility wrapper to update the control file.  Note that the control
    4898             :  * file gets flushed.
    4899             :  */
    4900             : void
    4901        6784 : UpdateControlFile(void)
    4902             : {
    4903        6784 :     update_controlfile(DataDir, ControlFile, true);
    4904        6784 : }
    4905             : 
    4906             : /*
    4907             :  * Returns the unique system identifier from control file.
    4908             :  */
    4909             : uint64
    4910         592 : GetSystemIdentifier(void)
    4911             : {
    4912             :     Assert(ControlFile != NULL);
    4913         592 :     return ControlFile->system_identifier;
    4914             : }
    4915             : 
    4916             : /*
    4917             :  * Returns the random nonce from control file.
    4918             :  */
    4919             : char *
    4920           2 : GetMockAuthenticationNonce(void)
    4921             : {
    4922             :     Assert(ControlFile != NULL);
    4923           2 :     return ControlFile->mock_authentication_nonce;
    4924             : }
    4925             : 
    4926             : /*
    4927             :  * Are checksums enabled for data pages?
    4928             :  */
    4929             : bool
    4930    14926094 : DataChecksumsEnabled(void)
    4931             : {
    4932             :     Assert(ControlFile != NULL);
    4933    14926094 :     return (ControlFile->data_checksum_version > 0);
    4934             : }
    4935             : 
    4936             : /*
    4937             :  * Returns a fake LSN for unlogged relations.
    4938             :  *
    4939             :  * Each call generates an LSN that is greater than any previous value
    4940             :  * returned. The current counter value is saved and restored across clean
    4941             :  * shutdowns, but like unlogged relations, does not survive a crash. This can
    4942             :  * be used in lieu of real LSN values returned by XLogInsert, if you need an
    4943             :  * LSN-like increasing sequence of numbers without writing any WAL.
    4944             :  */
    4945             : XLogRecPtr
    4946           0 : GetFakeLSNForUnloggedRel(void)
    4947             : {
    4948             :     XLogRecPtr  nextUnloggedLSN;
    4949             : 
    4950             :     /* increment the unloggedLSN counter, need SpinLock */
    4951           0 :     SpinLockAcquire(&XLogCtl->ulsn_lck);
    4952           0 :     nextUnloggedLSN = XLogCtl->unloggedLSN++;
    4953           0 :     SpinLockRelease(&XLogCtl->ulsn_lck);
    4954             : 
    4955           0 :     return nextUnloggedLSN;
    4956             : }
    4957             : 
    4958             : /*
    4959             :  * Auto-tune the number of XLOG buffers.
    4960             :  *
    4961             :  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
    4962             :  * a maximum of one XLOG segment (there is little reason to think that more
    4963             :  * is helpful, at least so long as we force an fsync when switching log files)
    4964             :  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
    4965             :  * 9.1, when auto-tuning was added).
    4966             :  *
    4967             :  * This should not be called until NBuffers has received its final value.
    4968             :  */
    4969             : static int
    4970        2170 : XLOGChooseNumBuffers(void)
    4971             : {
    4972             :     int         xbuffers;
    4973             : 
    4974        2170 :     xbuffers = NBuffers / 32;
    4975        2170 :     if (xbuffers > (wal_segment_size / XLOG_BLCKSZ))
    4976          24 :         xbuffers = (wal_segment_size / XLOG_BLCKSZ);
    4977        2170 :     if (xbuffers < 8)
    4978         358 :         xbuffers = 8;
    4979        2170 :     return xbuffers;
    4980             : }
    4981             : 
    4982             : /*
    4983             :  * GUC check_hook for wal_buffers
    4984             :  */
    4985             : bool
    4986        4354 : check_wal_buffers(int *newval, void **extra, GucSource source)
    4987             : {
    4988             :     /*
    4989             :      * -1 indicates a request for auto-tune.
    4990             :      */
    4991        4354 :     if (*newval == -1)
    4992             :     {
    4993             :         /*
    4994             :          * If we haven't yet changed the boot_val default of -1, just let it
    4995             :          * be.  We'll fix it when XLOGShmemSize is called.
    4996             :          */
    4997        2184 :         if (XLOGbuffers == -1)
    4998        2184 :             return true;
    4999             : 
    5000             :         /* Otherwise, substitute the auto-tune value */
    5001           0 :         *newval = XLOGChooseNumBuffers();
    5002             :     }
    5003             : 
    5004             :     /*
    5005             :      * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
    5006             :      * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
    5007             :      * the case, we just silently treat such values as a request for the
    5008             :      * minimum.  (We could throw an error instead, but that doesn't seem very
    5009             :      * helpful.)
    5010             :      */
    5011        2170 :     if (*newval < 4)
    5012           0 :         *newval = 4;
    5013             : 
    5014        2170 :     return true;
    5015             : }
    5016             : 
    5017             : /*
    5018             :  * Read the control file, set respective GUCs.
    5019             :  *
    5020             :  * This is to be called during startup, including a crash recovery cycle,
    5021             :  * unless in bootstrap mode, where no control file yet exists.  As there's no
    5022             :  * usable shared memory yet (its sizing can depend on the contents of the
    5023             :  * control file!), first store the contents in local memory. XLOGShmemInit()
    5024             :  * will then copy it to shared memory later.
    5025             :  *
    5026             :  * reset just controls whether previous contents are to be expected (in the
    5027             :  * reset case, there's a dangling pointer into old shared memory), or not.
    5028             :  */
    5029             : void
    5030        1106 : LocalProcessControlFile(bool reset)
    5031             : {
    5032             :     Assert(reset || ControlFile == NULL);
    5033        1106 :     ControlFile = palloc(sizeof(ControlFileData));
    5034        1106 :     ReadControlFile();
    5035        1106 : }
    5036             : 
    5037             : /*
    5038             :  * Initialization of shared memory for XLOG
    5039             :  */
    5040             : Size
    5041        4344 : XLOGShmemSize(void)
    5042             : {
    5043             :     Size        size;
    5044             : 
    5045             :     /*
    5046             :      * If the value of wal_buffers is -1, use the preferred auto-tune value.
    5047             :      * This isn't an amazingly clean place to do this, but we must wait till
    5048             :      * NBuffers has received its final value, and must do it before using the
    5049             :      * value of XLOGbuffers to do anything important.
    5050             :      */
    5051        4344 :     if (XLOGbuffers == -1)
    5052             :     {
    5053             :         char        buf[32];
    5054             : 
    5055        2170 :         snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
    5056        2170 :         SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
    5057             :     }
    5058             :     Assert(XLOGbuffers > 0);
    5059             : 
    5060             :     /* XLogCtl */
    5061        4344 :     size = sizeof(XLogCtlData);
    5062             : 
    5063             :     /* WAL insertion locks, plus alignment */
    5064        4344 :     size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
    5065             :     /* xlblocks array */
    5066        4344 :     size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
    5067             :     /* extra alignment padding for XLOG I/O buffers */
    5068        4344 :     size = add_size(size, XLOG_BLCKSZ);
    5069             :     /* and the buffers themselves */
    5070        4344 :     size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
    5071             : 
    5072             :     /*
    5073             :      * Note: we don't count ControlFileData, it comes out of the "slop factor"
    5074             :      * added by CreateSharedMemoryAndSemaphores.  This lets us use this
    5075             :      * routine again below to compute the actual allocation size.
    5076             :      */
    5077             : 
    5078        4344 :     return size;
    5079             : }
    5080             : 
    5081             : void
    5082        2170 : XLOGShmemInit(void)
    5083             : {
    5084             :     bool        foundCFile,
    5085             :                 foundXLog;
    5086             :     char       *allocptr;
    5087             :     int         i;
    5088             :     ControlFileData *localControlFile;
    5089             : 
    5090             : #ifdef WAL_DEBUG
    5091             : 
    5092             :     /*
    5093             :      * Create a memory context for WAL debugging that's exempt from the normal
    5094             :      * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
    5095             :      * an allocation fails, but wal_debug is not for production use anyway.
    5096             :      */
    5097             :     if (walDebugCxt == NULL)
    5098             :     {
    5099             :         walDebugCxt = AllocSetContextCreate(TopMemoryContext,
    5100             :                                             "WAL Debug",
    5101             :                                             ALLOCSET_DEFAULT_SIZES);
    5102             :         MemoryContextAllowInCriticalSection(walDebugCxt, true);
    5103             :     }
    5104             : #endif
    5105             : 
    5106             : 
    5107        2170 :     XLogCtl = (XLogCtlData *)
    5108        2170 :         ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
    5109             : 
    5110        2170 :     localControlFile = ControlFile;
    5111        2170 :     ControlFile = (ControlFileData *)
    5112        2170 :         ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
    5113             : 
    5114        2170 :     if (foundCFile || foundXLog)
    5115             :     {
    5116             :         /* both should be present or neither */
    5117             :         Assert(foundCFile && foundXLog);
    5118             : 
    5119             :         /* Initialize local copy of WALInsertLocks */
    5120           0 :         WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
    5121             : 
    5122           0 :         if (localControlFile)
    5123           0 :             pfree(localControlFile);
    5124           0 :         return;
    5125             :     }
    5126        2170 :     memset(XLogCtl, 0, sizeof(XLogCtlData));
    5127             : 
    5128             :     /*
    5129             :      * Already have read control file locally, unless in bootstrap mode. Move
    5130             :      * contents into shared memory.
    5131             :      */
    5132        2170 :     if (localControlFile)
    5133             :     {
    5134        1096 :         memcpy(ControlFile, localControlFile, sizeof(ControlFileData));
    5135        1096 :         pfree(localControlFile);
    5136             :     }
    5137             : 
    5138             :     /*
    5139             :      * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
    5140             :      * multiple of the alignment for same, so no extra alignment padding is
    5141             :      * needed here.
    5142             :      */
    5143        2170 :     allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
    5144        2170 :     XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
    5145        2170 :     memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
    5146        2170 :     allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
    5147             : 
    5148             : 
    5149             :     /* WAL insertion locks. Ensure they're aligned to the full padded size */
    5150        2170 :     allocptr += sizeof(WALInsertLockPadded) -
    5151        2170 :         ((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
    5152        2170 :     WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
    5153             :         (WALInsertLockPadded *) allocptr;
    5154        2170 :     allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;
    5155             : 
    5156       19530 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    5157             :     {
    5158       17360 :         LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
    5159       17360 :         WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
    5160       17360 :         WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
    5161             :     }
    5162             : 
    5163             :     /*
    5164             :      * Align the start of the page buffers to a full xlog block size boundary.
    5165             :      * This simplifies some calculations in XLOG insertion. It is also
    5166             :      * required for O_DIRECT.
    5167             :      */
    5168        2170 :     allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
    5169        2170 :     XLogCtl->pages = allocptr;
    5170        2170 :     memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
    5171             : 
    5172             :     /*
    5173             :      * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
    5174             :      * in additional info.)
    5175             :      */
    5176        2170 :     XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
    5177        2170 :     XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
    5178        2170 :     XLogCtl->SharedHotStandbyActive = false;
    5179        2170 :     XLogCtl->SharedPromoteIsTriggered = false;
    5180        2170 :     XLogCtl->WalWriterSleeping = false;
    5181             : 
    5182        2170 :     SpinLockInit(&XLogCtl->Insert.insertpos_lck);
    5183        2170 :     SpinLockInit(&XLogCtl->info_lck);
    5184        2170 :     SpinLockInit(&XLogCtl->ulsn_lck);
    5185        2170 :     InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
    5186             : }
    5187             : 
    5188             : /*
    5189             :  * This func must be called ONCE on system install.  It creates pg_control
    5190             :  * and the initial XLOG segment.
    5191             :  */
    5192             : void
    5193         358 : BootStrapXLOG(void)
    5194             : {
    5195             :     CheckPoint  checkPoint;
    5196             :     char       *buffer;
    5197             :     XLogPageHeader page;
    5198             :     XLogLongPageHeader longpage;
    5199             :     XLogRecord *record;
    5200             :     char       *recptr;
    5201             :     bool        use_existent;
    5202             :     uint64      sysidentifier;
    5203             :     struct timeval tv;
    5204             :     pg_crc32c   crc;
    5205             : 
    5206             :     /*
    5207             :      * Select a hopefully-unique system identifier code for this installation.
    5208             :      * We use the result of gettimeofday(), including the fractional seconds
    5209             :      * field, as being about as unique as we can easily get.  (Think not to
    5210             :      * use random(), since it hasn't been seeded and there's no portable way
    5211             :      * to seed it other than the system clock value...)  The upper half of the
    5212             :      * uint64 value is just the tv_sec part, while the lower half contains the
    5213             :      * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
    5214             :      * PID for a little extra uniqueness.  A person knowing this encoding can
    5215             :      * determine the initialization time of the installation, which could
    5216             :      * perhaps be useful sometimes.
    5217             :      */
    5218         358 :     gettimeofday(&tv, NULL);
    5219         358 :     sysidentifier = ((uint64) tv.tv_sec) << 32;
    5220         358 :     sysidentifier |= ((uint64) tv.tv_usec) << 12;
    5221         358 :     sysidentifier |= getpid() & 0xFFF;
    5222             : 
    5223             :     /* First timeline ID is always 1 */
    5224         358 :     ThisTimeLineID = 1;
    5225             : 
    5226             :     /* page buffer must be aligned suitably for O_DIRECT */
    5227         358 :     buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
    5228         358 :     page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
    5229         358 :     memset(page, 0, XLOG_BLCKSZ);
    5230             : 
    5231             :     /*
    5232             :      * Set up information for the initial checkpoint record
    5233             :      *
    5234             :      * The initial checkpoint record is written to the beginning of the WAL
    5235             :      * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
    5236             :      * used, so that we can use 0/0 to mean "before any valid WAL segment".
    5237             :      */
    5238         358 :     checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD;
    5239         358 :     checkPoint.ThisTimeLineID = ThisTimeLineID;
    5240         358 :     checkPoint.PrevTimeLineID = ThisTimeLineID;
    5241         358 :     checkPoint.fullPageWrites = fullPageWrites;
    5242             :     checkPoint.nextFullXid =
    5243         358 :         FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
    5244         358 :     checkPoint.nextOid = FirstBootstrapObjectId;
    5245         358 :     checkPoint.nextMulti = FirstMultiXactId;
    5246         358 :     checkPoint.nextMultiOffset = 0;
    5247         358 :     checkPoint.oldestXid = FirstNormalTransactionId;
    5248         358 :     checkPoint.oldestXidDB = TemplateDbOid;
    5249         358 :     checkPoint.oldestMulti = FirstMultiXactId;
    5250         358 :     checkPoint.oldestMultiDB = TemplateDbOid;
    5251         358 :     checkPoint.oldestCommitTsXid = InvalidTransactionId;
    5252         358 :     checkPoint.newestCommitTsXid = InvalidTransactionId;
    5253         358 :     checkPoint.time = (pg_time_t) time(NULL);
    5254         358 :     checkPoint.oldestActiveXid = InvalidTransactionId;
    5255             : 
    5256         358 :     ShmemVariableCache->nextFullXid = checkPoint.nextFullXid;
    5257         358 :     ShmemVariableCache->nextOid = checkPoint.nextOid;
    5258         358 :     ShmemVariableCache->oidCount = 0;
    5259         358 :     MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
    5260         358 :     AdvanceOldestClogXid(checkPoint.oldestXid);
    5261         358 :     SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
    5262         358 :     SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
    5263         358 :     SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
    5264             : 
    5265             :     /* Set up the XLOG page header */
    5266         358 :     page->xlp_magic = XLOG_PAGE_MAGIC;
    5267         358 :     page->xlp_info = XLP_LONG_HEADER;
    5268         358 :     page->xlp_tli = ThisTimeLineID;
    5269         358 :     page->xlp_pageaddr = wal_segment_size;
    5270         358 :     longpage = (XLogLongPageHeader) page;
    5271         358 :     longpage->xlp_sysid = sysidentifier;
    5272         358 :     longpage->xlp_seg_size = wal_segment_size;
    5273         358 :     longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
    5274             : 
    5275             :     /* Insert the initial checkpoint record */
    5276         358 :     recptr = ((char *) page + SizeOfXLogLongPHD);
    5277         358 :     record = (XLogRecord *) recptr;
    5278         358 :     record->xl_prev = 0;
    5279         358 :     record->xl_xid = InvalidTransactionId;
    5280         358 :     record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
    5281         358 :     record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
    5282         358 :     record->xl_rmid = RM_XLOG_ID;
    5283         358 :     recptr += SizeOfXLogRecord;
    5284             :     /* fill the XLogRecordDataHeaderShort struct */
    5285         358 :     *(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
    5286         358 :     *(recptr++) = sizeof(checkPoint);
    5287         358 :     memcpy(recptr, &checkPoint, sizeof(checkPoint));
    5288         358 :     recptr += sizeof(checkPoint);
    5289             :     Assert(recptr - (char *) record == record->xl_tot_len);
    5290             : 
    5291         358 :     INIT_CRC32C(crc);
    5292         358 :     COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
    5293         358 :     COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
    5294         358 :     FIN_CRC32C(crc);
    5295         358 :     record->xl_crc = crc;
    5296             : 
    5297             :     /* Create first XLOG segment file */
    5298         358 :     use_existent = false;
    5299         358 :     openLogFile = XLogFileInit(1, &use_existent, false);
    5300             : 
    5301             :     /*
    5302             :      * We needn't bother with Reserve/ReleaseExternalFD here, since we'll
    5303             :      * close the file again in a moment.
    5304             :      */
    5305             : 
    5306             :     /* Write the first page with the initial record */
    5307         358 :     errno = 0;
    5308         358 :     pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
    5309         358 :     if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
    5310             :     {
    5311             :         /* if write didn't set errno, assume problem is no disk space */
    5312           0 :         if (errno == 0)
    5313           0 :             errno = ENOSPC;
    5314           0 :         ereport(PANIC,
    5315             :                 (errcode_for_file_access(),
    5316             :                  errmsg("could not write bootstrap write-ahead log file: %m")));
    5317             :     }
    5318         358 :     pgstat_report_wait_end();
    5319             : 
    5320         358 :     pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
    5321         358 :     if (pg_fsync(openLogFile) != 0)
    5322           0 :         ereport(PANIC,
    5323             :                 (errcode_for_file_access(),
    5324             :                  errmsg("could not fsync bootstrap write-ahead log file: %m")));
    5325         358 :     pgstat_report_wait_end();
    5326             : 
    5327         358 :     if (close(openLogFile) != 0)
    5328           0 :         ereport(PANIC,
    5329             :                 (errcode_for_file_access(),
    5330             :                  errmsg("could not close bootstrap write-ahead log file: %m")));
    5331             : 
    5332         358 :     openLogFile = -1;
    5333             : 
    5334             :     /* Now create pg_control */
    5335         358 :     InitControlFile(sysidentifier);
    5336         358 :     ControlFile->time = checkPoint.time;
    5337         358 :     ControlFile->checkPoint = checkPoint.redo;
    5338         358 :     ControlFile->checkPointCopy = checkPoint;
    5339             : 
    5340             :     /* some additional ControlFile fields are set in WriteControlFile() */
    5341         358 :     WriteControlFile();
    5342             : 
    5343             :     /* Bootstrap the commit log, too */
    5344         358 :     BootStrapCLOG();
    5345         358 :     BootStrapCommitTs();
    5346         358 :     BootStrapSUBTRANS();
    5347         358 :     BootStrapMultiXact();
    5348             : 
    5349         358 :     pfree(buffer);
    5350             : 
    5351             :     /*
    5352             :      * Force control file to be read - in contrast to normal processing we'd
    5353             :      * otherwise never run the checks and GUC related initializations therein.
    5354             :      */
    5355         358 :     ReadControlFile();
    5356         358 : }
    5357             : 
    5358             : static char *
    5359         674 : str_time(pg_time_t tnow)
    5360             : {
    5361             :     static char buf[128];
    5362             : 
    5363         674 :     pg_strftime(buf, sizeof(buf),
    5364             :                 "%Y-%m-%d %H:%M:%S %Z",
    5365         674 :                 pg_localtime(&tnow, log_timezone));
    5366             : 
    5367         674 :     return buf;
    5368             : }
    5369             : 
    5370             : /*
    5371             :  * See if there are any recovery signal files and if so, set state for
    5372             :  * recovery.
    5373             :  *
    5374             :  * See if there is a recovery command file (recovery.conf), and if so
    5375             :  * throw an ERROR since as of PG12 we no longer recognize that.
    5376             :  */
    5377             : static void
    5378        1390 : readRecoverySignalFile(void)
    5379             : {
    5380             :     struct stat stat_buf;
    5381             : 
    5382        1390 :     if (IsBootstrapProcessingMode())
    5383        1308 :         return;
    5384             : 
    5385             :     /*
    5386             :      * Check for old recovery API file: recovery.conf
    5387             :      */
    5388        1032 :     if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
    5389           0 :         ereport(FATAL,
    5390             :                 (errcode_for_file_access(),
    5391             :                  errmsg("using recovery command file \"%s\" is not supported",
    5392             :                         RECOVERY_COMMAND_FILE)));
    5393             : 
    5394             :     /*
    5395             :      * Remove unused .done file, if present. Ignore if absent.
    5396             :      */
    5397        1032 :     unlink(RECOVERY_COMMAND_DONE);
    5398             : 
    5399             :     /*
    5400             :      * Check for recovery signal files and if found, fsync them since they
    5401             :      * represent server state information.  We don't sweat too much about the
    5402             :      * possibility of fsync failure, however.
    5403             :      *
    5404             :      * If present, standby signal file takes precedence. If neither is present
    5405             :      * then we won't enter archive recovery.
    5406             :      */
    5407        1032 :     if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
    5408             :     {
    5409             :         int         fd;
    5410             : 
    5411          80 :         fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
    5412             :                                S_IRUSR | S_IWUSR);
    5413          80 :         if (fd >= 0)
    5414             :         {
    5415          80 :             (void) pg_fsync(fd);
    5416          80 :             close(fd);
    5417             :         }
    5418          80 :         standby_signal_file_found = true;
    5419             :     }
    5420         952 :     else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
    5421             :     {
    5422             :         int         fd;
    5423             : 
    5424           2 :         fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
    5425             :                                S_IRUSR | S_IWUSR);
    5426           2 :         if (fd >= 0)
    5427             :         {
    5428           2 :             (void) pg_fsync(fd);
    5429           2 :             close(fd);
    5430             :         }
    5431           2 :         recovery_signal_file_found = true;
    5432             :     }
    5433             : 
    5434        1032 :     StandbyModeRequested = false;
    5435        1032 :     ArchiveRecoveryRequested = false;
    5436        1032 :     if (standby_signal_file_found)
    5437             :     {
    5438          80 :         StandbyModeRequested = true;
    5439          80 :         ArchiveRecoveryRequested = true;
    5440             :     }
    5441         952 :     else if (recovery_signal_file_found)
    5442             :     {
    5443           2 :         StandbyModeRequested = false;
    5444           2 :         ArchiveRecoveryRequested = true;
    5445             :     }
    5446             :     else
    5447         950 :         return;
    5448             : 
    5449             :     /*
    5450             :      * We don't support standby mode in standalone backends; that requires
    5451             :      * other processes such as the WAL receiver to be alive.
    5452             :      */
    5453          82 :     if (StandbyModeRequested && !IsUnderPostmaster)
    5454           0 :         ereport(FATAL,
    5455             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
    5456             :                  errmsg("standby mode is not supported by single-user servers")));
    5457             : }
    5458             : 
    5459             : static void
    5460        1390 : validateRecoveryParameters(void)
    5461             : {
    5462        1390 :     if (!ArchiveRecoveryRequested)
    5463        1308 :         return;
    5464             : 
    5465             :     /*
    5466             :      * Check for compulsory parameters
    5467             :      */
    5468          82 :     if (StandbyModeRequested)
    5469             :     {
    5470          80 :         if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
    5471           4 :             (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
    5472           0 :             ereport(WARNING,
    5473             :                     (errmsg("specified neither primary_conninfo nor restore_command"),
    5474             :                      errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
    5475             :     }
    5476             :     else
    5477             :     {
    5478           2 :         if (recoveryRestoreCommand == NULL ||
    5479           2 :             strcmp(recoveryRestoreCommand, "") == 0)
    5480           0 :             ereport(FATAL,
    5481             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5482             :                      errmsg("must specify restore_command when standby mode is not enabled")));
    5483             :     }
    5484             : 
    5485             :     /*
    5486             :      * Override any inconsistent requests. Note that this is a change of
    5487             :      * behaviour in 9.5; prior to this we simply ignored a request to pause if
    5488             :      * hot_standby = off, which was surprising behaviour.
    5489             :      */
    5490          82 :     if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
    5491          82 :         !EnableHotStandby)
    5492           0 :         recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
    5493             : 
    5494             :     /*
    5495             :      * Final parsing of recovery_target_time string; see also
    5496             :      * check_recovery_target_time().
    5497             :      */
    5498          82 :     if (recoveryTarget == RECOVERY_TARGET_TIME)
    5499             :     {
    5500           0 :         recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
    5501             :                                                                      CStringGetDatum(recovery_target_time_string),
    5502             :                                                                      ObjectIdGetDatum(InvalidOid),
    5503             :                                                                      Int32GetDatum(-1)));
    5504             :     }
    5505             : 
    5506             :     /*
    5507             :      * If user specified recovery_target_timeline, validate it or compute the
    5508             :      * "latest" value.  We can't do this until after we've gotten the restore
    5509             :      * command and set InArchiveRecovery, because we need to fetch timeline
    5510             :      * history files from the archive.
    5511             :      */
    5512          82 :     if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
    5513             :     {
    5514           0 :         TimeLineID  rtli = recoveryTargetTLIRequested;
    5515             : 
    5516             :         /* Timeline 1 does not have a history file, all else should */
    5517           0 :         if (rtli != 1 && !existsTimeLineHistory(rtli))
    5518           0 :             ereport(FATAL,
    5519             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5520             :                      errmsg("recovery target timeline %u does not exist",
    5521             :                             rtli)));
    5522           0 :         recoveryTargetTLI = rtli;
    5523             :     }
    5524          82 :     else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
    5525             :     {
    5526             :         /* We start the "latest" search from pg_control's timeline */
    5527          82 :         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
    5528             :     }
    5529             :     else
    5530             :     {
    5531             :         /*
    5532             :          * else we just use the recoveryTargetTLI as already read from
    5533             :          * ControlFile
    5534             :          */
    5535             :         Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
    5536             :     }
    5537             : }
    5538             : 
    5539             : /*
    5540             :  * Exit archive-recovery state
    5541             :  */
    5542             : static void
    5543          48 : exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
    5544             : {
    5545             :     char        xlogfname[MAXFNAMELEN];
    5546             :     XLogSegNo   endLogSegNo;
    5547             :     XLogSegNo   startLogSegNo;
    5548             : 
    5549             :     /* we always switch to a new timeline after archive recovery */
    5550             :     Assert(endTLI != ThisTimeLineID);
    5551             : 
    5552             :     /*
    5553             :      * We are no longer in archive recovery state.
    5554             :      */
    5555          48 :     InArchiveRecovery = false;
    5556             : 
    5557             :     /*
    5558             :      * Update min recovery point one last time.
    5559             :      */
    5560          48 :     UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
    5561             : 
    5562             :     /*
    5563             :      * If the ending log segment is still open, close it (to avoid problems on
    5564             :      * Windows with trying to rename or delete an open file).
    5565             :      */
    5566          48 :     if (readFile >= 0)
    5567             :     {
    5568          48 :         close(readFile);
    5569          48 :         readFile = -1;
    5570             :     }
    5571             : 
    5572             :     /*
    5573             :      * Calculate the last segment on the old timeline, and the first segment
    5574             :      * on the new timeline. If the switch happens in the middle of a segment,
    5575             :      * they are the same, but if the switch happens exactly at a segment
    5576             :      * boundary, startLogSegNo will be endLogSegNo + 1.
    5577             :      */
    5578          48 :     XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size);
    5579          48 :     XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size);
    5580             : 
    5581             :     /*
    5582             :      * Initialize the starting WAL segment for the new timeline. If the switch
    5583             :      * happens in the middle of a segment, copy data from the last WAL segment
    5584             :      * of the old timeline up to the switch point, to the starting WAL segment
    5585             :      * on the new timeline.
    5586             :      */
    5587          48 :     if (endLogSegNo == startLogSegNo)
    5588             :     {
    5589             :         /*
    5590             :          * Make a copy of the file on the new timeline.
    5591             :          *
    5592             :          * Writing WAL isn't allowed yet, so there are no locking
    5593             :          * considerations. But we should be just as tense as XLogFileInit to
    5594             :          * avoid emplacing a bogus file.
    5595             :          */
    5596          36 :         XLogFileCopy(endLogSegNo, endTLI, endLogSegNo,
    5597          36 :                      XLogSegmentOffset(endOfLog, wal_segment_size));
    5598             :     }
    5599             :     else
    5600             :     {
    5601             :         /*
    5602             :          * The switch happened at a segment boundary, so just create the next
    5603             :          * segment on the new timeline.
    5604             :          */
    5605          12 :         bool        use_existent = true;
    5606             :         int         fd;
    5607             : 
    5608          12 :         fd = XLogFileInit(startLogSegNo, &use_existent, true);
    5609             : 
    5610          12 :         if (close(fd) != 0)
    5611             :         {
    5612             :             char        xlogfname[MAXFNAMELEN];
    5613           0 :             int         save_errno = errno;
    5614             : 
    5615           0 :             XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo,
    5616             :                          wal_segment_size);
    5617           0 :             errno = save_errno;
    5618           0 :             ereport(ERROR,
    5619             :                     (errcode_for_file_access(),
    5620             :                      errmsg("could not close file \"%s\": %m", xlogfname)));
    5621             :         }
    5622             :     }
    5623             : 
    5624             :     /*
    5625             :      * Let's just make real sure there are not .ready or .done flags posted
    5626             :      * for the new segment.
    5627             :      */
    5628          48 :     XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo, wal_segment_size);
    5629          48 :     XLogArchiveCleanup(xlogfname);
    5630             : 
    5631             :     /*
    5632             :      * Remove the signal files out of the way, so that we don't accidentally
    5633             :      * re-enter archive recovery mode in a subsequent crash.
    5634             :      */
    5635          48 :     if (standby_signal_file_found)
    5636          48 :         durable_unlink(STANDBY_SIGNAL_FILE, FATAL);
    5637             : 
    5638          48 :     if (recovery_signal_file_found)
    5639           0 :         durable_unlink(RECOVERY_SIGNAL_FILE, FATAL);
    5640             : 
    5641          48 :     ereport(LOG,
    5642             :             (errmsg("archive recovery complete")));
    5643          48 : }
    5644             : 
    5645             : /*
    5646             :  * Extract timestamp from WAL record.
    5647             :  *
    5648             :  * If the record contains a timestamp, returns true, and saves the timestamp
    5649             :  * in *recordXtime. If the record type has no timestamp, returns false.
    5650             :  * Currently, only transaction commit/abort records and restore points contain
    5651             :  * timestamps.
    5652             :  */
    5653             : static bool
    5654         264 : getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
    5655             : {
    5656         264 :     uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    5657         264 :     uint8       xact_info = info & XLOG_XACT_OPMASK;
    5658         264 :     uint8       rmid = XLogRecGetRmid(record);
    5659             : 
    5660         264 :     if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
    5661             :     {
    5662           0 :         *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
    5663           0 :         return true;
    5664             :     }
    5665         264 :     if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
    5666             :                                xact_info == XLOG_XACT_COMMIT_PREPARED))
    5667             :     {
    5668         256 :         *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
    5669         256 :         return true;
    5670             :     }
    5671           8 :     if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
    5672             :                                xact_info == XLOG_XACT_ABORT_PREPARED))
    5673             :     {
    5674           8 :         *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
    5675           8 :         return true;
    5676             :     }
    5677           0 :     return false;
    5678             : }
    5679             : 
    5680             : /*
    5681             :  * For point-in-time recovery, this function decides whether we want to
    5682             :  * stop applying the XLOG before the current record.
    5683             :  *
    5684             :  * Returns true if we are stopping, false otherwise. If stopping, some
    5685             :  * information is saved in recoveryStopXid et al for use in annotating the
    5686             :  * new timeline's history file.
    5687             :  */
    5688             : static bool
    5689      573454 : recoveryStopsBefore(XLogReaderState *record)
    5690             : {
    5691      573454 :     bool        stopsHere = false;
    5692             :     uint8       xact_info;
    5693             :     bool        isCommit;
    5694      573454 :     TimestampTz recordXtime = 0;
    5695             :     TransactionId recordXid;
    5696             : 
    5697             :     /*
    5698             :      * Ignore recovery target settings when not in archive recovery (meaning
    5699             :      * we are in crash recovery).
    5700             :      */
    5701      573454 :     if (!ArchiveRecoveryRequested)
    5702      370862 :         return false;
    5703             : 
    5704             :     /* Check if we should stop as soon as reaching consistency */
    5705      202592 :     if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
    5706             :     {
    5707           0 :         ereport(LOG,
    5708             :                 (errmsg("recovery stopping after reaching consistency")));
    5709             : 
    5710           0 :         recoveryStopAfter = false;
    5711           0 :         recoveryStopXid = InvalidTransactionId;
    5712           0 :         recoveryStopLSN = InvalidXLogRecPtr;
    5713           0 :         recoveryStopTime = 0;
    5714           0 :         recoveryStopName[0] = '\0';
    5715           0 :         return true;
    5716             :     }
    5717             : 
    5718             :     /* Check if target LSN has been reached */
    5719      202592 :     if (recoveryTarget == RECOVERY_TARGET_LSN &&
    5720           0 :         !recoveryTargetInclusive &&
    5721           0 :         record->ReadRecPtr >= recoveryTargetLSN)
    5722             :     {
    5723           0 :         recoveryStopAfter = false;
    5724           0 :         recoveryStopXid = InvalidTransactionId;
    5725           0 :         recoveryStopLSN = record->ReadRecPtr;
    5726           0 :         recoveryStopTime = 0;
    5727           0 :         recoveryStopName[0] = '\0';
    5728           0 :         ereport(LOG,
    5729             :                 (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
    5730             :                         (uint32) (recoveryStopLSN >> 32),
    5731             :                         (uint32) recoveryStopLSN)));
    5732           0 :         return true;
    5733             :     }
    5734             : 
    5735             :     /* Otherwise we only consider stopping before COMMIT or ABORT records. */
    5736      202592 :     if (XLogRecGetRmid(record) != RM_XACT_ID)
    5737      202246 :         return false;
    5738             : 
    5739         346 :     xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
    5740             : 
    5741         346 :     if (xact_info == XLOG_XACT_COMMIT)
    5742             :     {
    5743         226 :         isCommit = true;
    5744         226 :         recordXid = XLogRecGetXid(record);
    5745             :     }
    5746         120 :     else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
    5747             :     {
    5748          30 :         xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
    5749             :         xl_xact_parsed_commit parsed;
    5750             : 
    5751          30 :         isCommit = true;
    5752          30 :         ParseCommitRecord(XLogRecGetInfo(record),
    5753             :                           xlrec,
    5754             :                           &parsed);
    5755          30 :         recordXid = parsed.twophase_xid;
    5756             :     }
    5757          90 :     else if (xact_info == XLOG_XACT_ABORT)
    5758             :     {
    5759           2 :         isCommit = false;
    5760           2 :         recordXid = XLogRecGetXid(record);
    5761             :     }
    5762          88 :     else if (xact_info == XLOG_XACT_ABORT_PREPARED)
    5763             :     {
    5764           6 :         xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
    5765             :         xl_xact_parsed_abort parsed;
    5766             : 
    5767           6 :         isCommit = true;
    5768           6 :         ParseAbortRecord(XLogRecGetInfo(record),
    5769             :                          xlrec,
    5770             :                          &parsed);
    5771           6 :         recordXid = parsed.twophase_xid;
    5772             :     }
    5773             :     else
    5774          82 :         return false;
    5775             : 
    5776         264 :     if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
    5777             :     {
    5778             :         /*
    5779             :          * There can be only one transaction end record with this exact
    5780             :          * transactionid
    5781             :          *
    5782             :          * when testing for an xid, we MUST test for equality only, since
    5783             :          * transactions are numbered in the order they start, not the order
    5784             :          * they complete. A higher numbered xid will complete before you about
    5785             :          * 50% of the time...
    5786             :          */
    5787           0 :         stopsHere = (recordXid == recoveryTargetXid);
    5788             :     }
    5789             : 
    5790         264 :     if (recoveryTarget == RECOVERY_TARGET_TIME &&
    5791           0 :         getRecordTimestamp(record, &recordXtime))
    5792             :     {
    5793             :         /*
    5794             :          * There can be many transactions that share the same commit time, so
    5795             :          * we stop after the last one, if we are inclusive, or stop at the
    5796             :          * first one if we are exclusive
    5797             :          */
    5798           0 :         if (recoveryTargetInclusive)
    5799           0 :             stopsHere = (recordXtime > recoveryTargetTime);
    5800             :         else
    5801           0 :             stopsHere = (recordXtime >= recoveryTargetTime);
    5802             :     }
    5803             : 
    5804         264 :     if (stopsHere)
    5805             :     {
    5806           0 :         recoveryStopAfter = false;
    5807           0 :         recoveryStopXid = recordXid;
    5808           0 :         recoveryStopTime = recordXtime;
    5809           0 :         recoveryStopLSN = InvalidXLogRecPtr;
    5810           0 :         recoveryStopName[0] = '\0';
    5811             : 
    5812           0 :         if (isCommit)
    5813             :         {
    5814           0 :             ereport(LOG,
    5815             :                     (errmsg("recovery stopping before commit of transaction %u, time %s",
    5816             :                             recoveryStopXid,
    5817             :                             timestamptz_to_str(recoveryStopTime))));
    5818             :         }
    5819             :         else
    5820             :         {
    5821           0 :             ereport(LOG,
    5822             :                     (errmsg("recovery stopping before abort of transaction %u, time %s",
    5823             :                             recoveryStopXid,
    5824             :                             timestamptz_to_str(recoveryStopTime))));
    5825             :         }
    5826             :     }
    5827             : 
    5828         264 :     return stopsHere;
    5829             : }
    5830             : 
    5831             : /*
    5832             :  * Same as recoveryStopsBefore, but called after applying the record.
    5833             :  *
    5834             :  * We also track the timestamp of the latest applied COMMIT/ABORT
    5835             :  * record in XLogCtl->recoveryLastXTime.
    5836             :  */
    5837             : static bool
    5838      573454 : recoveryStopsAfter(XLogReaderState *record)
    5839             : {
    5840             :     uint8       info;
    5841             :     uint8       xact_info;
    5842             :     uint8       rmid;
    5843             :     TimestampTz recordXtime;
    5844             : 
    5845             :     /*
    5846             :      * Ignore recovery target settings when not in archive recovery (meaning
    5847             :      * we are in crash recovery).
    5848             :      */
    5849      573454 :     if (!ArchiveRecoveryRequested)
    5850      370862 :         return false;
    5851             : 
    5852      202592 :     info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    5853      202592 :     rmid = XLogRecGetRmid(record);
    5854             : 
    5855             :     /*
    5856             :      * There can be many restore points that share the same name; we stop at
    5857             :      * the first one.
    5858             :      */
    5859      202592 :     if (recoveryTarget == RECOVERY_TARGET_NAME &&
    5860          14 :         rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
    5861             :     {
    5862             :         xl_restore_point *recordRestorePointData;
    5863             : 
    5864           2 :         recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
    5865             : 
    5866           2 :         if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
    5867             :         {
    5868           0 :             recoveryStopAfter = true;
    5869           0 :             recoveryStopXid = InvalidTransactionId;
    5870           0 :             recoveryStopLSN = InvalidXLogRecPtr;
    5871           0 :             (void) getRecordTimestamp(record, &recoveryStopTime);
    5872           0 :             strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
    5873             : 
    5874           0 :             ereport(LOG,
    5875             :                     (errmsg("recovery stopping at restore point \"%s\", time %s",
    5876             :                             recoveryStopName,
    5877             :                             timestamptz_to_str(recoveryStopTime))));
    5878           0 :             return true;
    5879             :         }
    5880             :     }
    5881             : 
    5882             :     /* Check if the target LSN has been reached */
    5883      202592 :     if (recoveryTarget == RECOVERY_TARGET_LSN &&
    5884           0 :         recoveryTargetInclusive &&
    5885           0 :         record->ReadRecPtr >= recoveryTargetLSN)
    5886             :     {
    5887           0 :         recoveryStopAfter = true;
    5888           0 :         recoveryStopXid = InvalidTransactionId;
    5889           0 :         recoveryStopLSN = record->ReadRecPtr;
    5890           0 :         recoveryStopTime = 0;
    5891           0 :         recoveryStopName[0] = '\0';
    5892           0 :         ereport(LOG,
    5893             :                 (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
    5894             :                         (uint32) (recoveryStopLSN >> 32),
    5895             :                         (uint32) recoveryStopLSN)));
    5896           0 :         return true;
    5897             :     }
    5898             : 
    5899      202592 :     if (rmid != RM_XACT_ID)
    5900      202246 :         return false;
    5901             : 
    5902         346 :     xact_info = info & XLOG_XACT_OPMASK;
    5903             : 
    5904         346 :     if (xact_info == XLOG_XACT_COMMIT ||
    5905          90 :         xact_info == XLOG_XACT_COMMIT_PREPARED ||
    5906          88 :         xact_info == XLOG_XACT_ABORT ||
    5907             :         xact_info == XLOG_XACT_ABORT_PREPARED)
    5908             :     {
    5909             :         TransactionId recordXid;
    5910             : 
    5911             :         /* Update the last applied transaction timestamp */
    5912         264 :         if (getRecordTimestamp(record, &recordXtime))
    5913         264 :             SetLatestXTime(recordXtime);
    5914             : 
    5915             :         /* Extract the XID of the committed/aborted transaction */
    5916         264 :         if (xact_info == XLOG_XACT_COMMIT_PREPARED)
    5917             :         {
    5918          30 :             xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
    5919             :             xl_xact_parsed_commit parsed;
    5920             : 
    5921          30 :             ParseCommitRecord(XLogRecGetInfo(record),
    5922             :                               xlrec,
    5923             :                               &parsed);
    5924          30 :             recordXid = parsed.twophase_xid;
    5925             :         }
    5926         234 :         else if (xact_info == XLOG_XACT_ABORT_PREPARED)
    5927             :         {
    5928           6 :             xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
    5929             :             xl_xact_parsed_abort parsed;
    5930             : 
    5931           6 :             ParseAbortRecord(XLogRecGetInfo(record),
    5932             :                              xlrec,
    5933             :                              &parsed);
    5934           6 :             recordXid = parsed.twophase_xid;
    5935             :         }
    5936             :         else
    5937         228 :             recordXid = XLogRecGetXid(record);
    5938             : 
    5939             :         /*
    5940             :          * There can be only one transaction end record with this exact
    5941             :          * transactionid
    5942             :          *
    5943             :          * when testing for an xid, we MUST test for equality only, since
    5944             :          * transactions are numbered in the order they start, not the order
    5945             :          * they complete. A higher numbered xid will complete before you about
    5946             :          * 50% of the time...
    5947             :          */
    5948         264 :         if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
    5949           0 :             recordXid == recoveryTargetXid)
    5950             :         {
    5951           0 :             recoveryStopAfter = true;
    5952           0 :             recoveryStopXid = recordXid;
    5953           0 :             recoveryStopTime = recordXtime;
    5954           0 :             recoveryStopLSN = InvalidXLogRecPtr;
    5955           0 :             recoveryStopName[0] = '\0';
    5956             : 
    5957           0 :             if (xact_info == XLOG_XACT_COMMIT ||
    5958             :                 xact_info == XLOG_XACT_COMMIT_PREPARED)
    5959             :             {
    5960           0 :                 ereport(LOG,
    5961             :                         (errmsg("recovery stopping after commit of transaction %u, time %s",
    5962             :                                 recoveryStopXid,
    5963             :                                 timestamptz_to_str(recoveryStopTime))));
    5964             :             }
    5965           0 :             else if (xact_info == XLOG_XACT_ABORT ||
    5966             :                      xact_info == XLOG_XACT_ABORT_PREPARED)
    5967             :             {
    5968           0 :                 ereport(LOG,
    5969             :                         (errmsg("recovery stopping after abort of transaction %u, time %s",
    5970             :                                 recoveryStopXid,
    5971             :                                 timestamptz_to_str(recoveryStopTime))));
    5972             :             }
    5973           0 :             return true;
    5974             :         }
    5975             :     }
    5976             : 
    5977             :     /* Check if we should stop as soon as reaching consistency */
    5978         346 :     if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
    5979             :     {
    5980           0 :         ereport(LOG,
    5981             :                 (errmsg("recovery stopping after reaching consistency")));
    5982             : 
    5983           0 :         recoveryStopAfter = true;
    5984           0 :         recoveryStopXid = InvalidTransactionId;
    5985           0 :         recoveryStopTime = 0;
    5986           0 :         recoveryStopLSN = InvalidXLogRecPtr;
    5987           0 :         recoveryStopName[0] = '\0';
    5988           0 :         return true;
    5989             :     }
    5990             : 
    5991         346 :     return false;
    5992             : }
    5993             : 
    5994             : /*
    5995             :  * Wait until shared recoveryPause flag is cleared.
    5996             :  *
    5997             :  * endOfRecovery is true if the recovery target is reached and
    5998             :  * the paused state starts at the end of recovery because of
    5999             :  * recovery_target_action=pause, and false otherwise.
                     :  * Within this function it only selects which LOG message and hint are
                     :  * emitted before the wait begins.
                     :  *
                     :  * Returns at once, without logging or waiting, if LocalHotStandbyActive is
                     :  * false or LocalPromoteIsTriggered is true.  While waiting, the loop also
                     :  * returns as soon as CheckForStandbyTrigger() reports true.
    6000             :  *
    6001             :  * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
    6002             :  * Probably not worth the trouble though.  This state shouldn't be one that
    6003             :  * anyone cares about server power consumption in.
    6004             :  */
    6005             : static void
    6006           0 : recoveryPausesHere(bool endOfRecovery)
    6007             : {
    6008             :     /* Don't pause unless users can connect! */
    6009           0 :     if (!LocalHotStandbyActive)
    6010           0 :         return;
    6011             : 
    6012             :     /* Don't pause after standby promotion has been triggered */
    6013           0 :     if (LocalPromoteIsTriggered)
    6014           0 :         return;
    6015             : 
    6016           0 :     if (endOfRecovery)
    6017           0 :         ereport(LOG,
    6018             :                 (errmsg("pausing at the end of recovery"),
    6019             :                  errhint("Execute pg_wal_replay_resume() to promote.")));
    6020             :     else
    6021           0 :         ereport(LOG,
    6022             :                 (errmsg("recovery has paused"),
    6023             :                  errhint("Execute pg_wal_replay_resume() to continue.")));
    6024             :     /* Re-check the shared flag after each one-second sleep until it is cleared. */
    6025           0 :     while (RecoveryIsPaused())
    6026             :     {
    6027           0 :         HandleStartupProcInterrupts();
    6028           0 :         if (CheckForStandbyTrigger())
    6029           0 :             return;
    6030           0 :         pgstat_report_wait_start(WAIT_EVENT_RECOVERY_PAUSE);
    6031           0 :         pg_usleep(1000000L);    /* 1000 ms */
    6032           0 :         pgstat_report_wait_end();
    6033             :     }
    6034             : }
    6035             : /*
                     :  * Return the current value of the shared recoveryPause flag.
                     :  *
                     :  * The flag is kept in XLogCtl and is read here while holding info_lck.
                     :  */
    6036             : bool
    6037           0 : RecoveryIsPaused(void)
    6038             : {
    6039             :     bool        recoveryPause;
    6040             : 
    6041           0 :     SpinLockAcquire(&XLogCtl->info_lck);
    6042           0 :     recoveryPause = XLogCtl->recoveryPause;
    6043           0 :     SpinLockRelease(&XLogCtl->info_lck);
    6044             : 
    6045           0 :     return recoveryPause;
    6046             : }
    6047             : /*
                     :  * Set the shared recoveryPause flag to the given value.
                     :  *
                     :  * The flag is kept in XLogCtl and is written here while holding info_lck.
                     :  * This function only stores the value; it does not wait for anything.
                     :  */
    6048             : void
    6049           0 : SetRecoveryPause(bool recoveryPause)
    6050             : {
    6051           0 :     SpinLockAcquire(&XLogCtl->info_lck);
    6052           0 :     XLogCtl->recoveryPause = recoveryPause;
    6053           0 :     SpinLockRelease(&XLogCtl->info_lck);
    6054           0 : }
    6055             : 
    6056             : /*
    6057             :  * When recovery_min_apply_delay is set, we wait long enough to make sure
    6058             :  * certain record types are applied at least that interval behind the master.
    6059             :  *
                     :  * record is the WAL record about to be applied; only its resource manager
                     :  * id, info bits and timestamp are examined here.
                     :  *
    6060             :  * Returns true if we waited.
                     :  * More precisely, true is returned whenever the wait loop below was
                     :  * entered, even if it was left before sleeping (for example because
                     :  * CheckForStandbyTrigger() reported true).  Every early-exit path that
                     :  * precedes the loop returns false.
    6061             :  *
    6062             :  * Note that the delay is calculated between the WAL record log time and
    6063             :  * the current time on standby. We would prefer to keep track of when this
    6064             :  * standby received each WAL record, which would allow a more consistent
    6065             :  * approach and one not affected by time synchronisation issues, but that
    6066             :  * is significantly more effort and complexity for little actual gain in
    6067             :  * usability.
    6068             :  */
    6069             : static bool
    6070      573454 : recoveryApplyDelay(XLogReaderState *record)
    6071             : {
    6072             :     uint8       xact_info;
    6073             :     TimestampTz xtime;
    6074             :     TimestampTz delayUntil;
    6075             :     long        secs;
    6076             :     int         microsecs;
    6077             : 
    6078             :     /* nothing to do if no delay configured */
    6079      573454 :     if (recovery_min_apply_delay <= 0)
    6080      573454 :         return false;
    6081             : 
    6082             :     /* no delay is applied on a database not yet consistent */
    6083           0 :     if (!reachedConsistency)
    6084           0 :         return false;
    6085             : 
    6086             :     /* nothing to do if crash recovery is requested */
    6087           0 :     if (!ArchiveRecoveryRequested)
    6088           0 :         return false;
    6089             : 
    6090             :     /*
    6091             :      * Is it a COMMIT record?
    6092             :      *
    6093             :      * We deliberately choose not to delay aborts since they have no effect on
    6094             :      * MVCC. We already allow replay of records that don't have a timestamp,
    6095             :      * so there is already opportunity for issues caused by early conflicts on
    6096             :      * standbys.
                     :      *
                     :      * Only XLOG_XACT_COMMIT and XLOG_XACT_COMMIT_PREPARED records of the
                     :      * RM_XACT_ID resource manager pass the two tests below.
    6097             :      */
    6098           0 :     if (XLogRecGetRmid(record) != RM_XACT_ID)
    6099           0 :         return false;
    6100             : 
    6101           0 :     xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
    6102             : 
    6103           0 :     if (xact_info != XLOG_XACT_COMMIT &&
    6104             :         xact_info != XLOG_XACT_COMMIT_PREPARED)
    6105           0 :         return false;
    6106             :     /* without a timestamp from the record there is nothing to delay against */
    6107           0 :     if (!getRecordTimestamp(record, &xtime))
    6108           0 :         return false;
    6109             :     /* earliest time at which this record may be applied */
    6110           0 :     delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
    6111             : 
    6112             :     /*
    6113             :      * Exit without arming the latch if it's already past time to apply this
    6114             :      * record
    6115             :      */
    6116           0 :     TimestampDifference(GetCurrentTimestamp(), delayUntil, &secs, &microsecs);
    6117           0 :     if (secs <= 0 && microsecs <= 0)
    6118           0 :         return false;
    6119             : 
    6120             :     while (true)
    6121             :     {
    6122           0 :         ResetLatch(&XLogCtl->recoveryWakeupLatch);
    6123             : 
    6124             :         /* might change the trigger file's location */
    6125           0 :         HandleStartupProcInterrupts();
    6126             : 
    6127           0 :         if (CheckForStandbyTrigger())
    6128           0 :             break;
    6129             : 
    6130             :         /*
    6131             :          * Wait for difference between GetCurrentTimestamp() and delayUntil
                     :          * (recomputed on every iteration, so a latch wakeup before the
                     :          * deadline simply leads to another, shorter wait)
    6132             :          */
    6133           0 :         TimestampDifference(GetCurrentTimestamp(), delayUntil,
    6134             :                             &secs, &microsecs);
    6135             : 
    6136             :         /*
    6137             :          * NB: We're ignoring waits below recovery_min_apply_delay's
    6138             :          * resolution.
                     :          * microsecs / 1000 truncates to whole milliseconds, the same
                     :          * value that is folded into the WaitLatch timeout below.
    6139             :          */
    6140           0 :         if (secs <= 0 && microsecs / 1000 <= 0)
    6141           0 :             break;
    6142             : 
    6143           0 :         elog(DEBUG2, "recovery apply delay %ld seconds, %d milliseconds",
    6144             :              secs, microsecs / 1000);
    6145             :         /* timeout is the remaining delay, expressed in milliseconds */
    6146           0 :         (void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
    6147             :                          WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
    6148           0 :                          secs * 1000L + microsecs / 1000,
    6149             :                          WAIT_EVENT_RECOVERY_APPLY_DELAY);
    6150             :     }
    6151           0 :     return true;
    6152             : }
    6153             : 
    6154             : /*
    6155             :  * Save timestamp of latest processed commit/abort record.
    6156             :  *
                     :  * xtime is stored in XLogCtl->recoveryLastXTime while holding info_lck.
                     :  *
    6157             :  * We keep this in XLogCtl, not a simple static variable, so that it can be
    6158             :  * seen by processes other than the startup process.  Note in particular
    6159             :  * that CreateRestartPoint is executed in the checkpointer.
    6160             :  */
    6161             : static void
    6162         264 : SetLatestXTime(TimestampTz xtime)
    6163             : {
    6164         264 :     SpinLockAcquire(&XLogCtl->info_lck);
    6165         264 :     XLogCtl->recoveryLastXTime = xtime;
    6166         264 :     SpinLockRelease(&XLogCtl->info_lck);
    6167         264 : }
    6168             : 
    6169             : /*
    6170             :  * Fetch timestamp of latest processed commit/abort record.
                     :  *
                     :  * Returns XLogCtl->recoveryLastXTime, the value stored by SetLatestXTime,
                     :  * read while holding info_lck.
    6171             :  */
    6172             : TimestampTz
    6173         186 : GetLatestXTime(void)
    6174             : {
    6175             :     TimestampTz xtime;
    6176             : 
    6177         186 :     SpinLockAcquire(&XLogCtl->info_lck);
    6178         186 :     xtime = XLogCtl->recoveryLastXTime;
    6179         186 :     SpinLockRelease(&XLogCtl->info_lck);
    6180             : 
    6181         186 :     return xtime;
    6182             : }
    6183             : 
    6184             : /*
    6185             :  * Save timestamp of the next chunk of WAL records to apply.
    6186             :  *
                     :  * xtime is stored in XLogCtl->currentChunkStartTime while holding info_lck.
                     :  *
    6187             :  * We keep this in XLogCtl, not a simple static variable, so that it can be
    6188             :  * seen by all backends.
    6189             :  */
    6190             : static void
    6191         394 : SetCurrentChunkStartTime(TimestampTz xtime)
    6192             : {
    6193         394 :     SpinLockAcquire(&XLogCtl->info_lck);
    6194         394 :     XLogCtl->currentChunkStartTime = xtime;
    6195         394 :     SpinLockRelease(&XLogCtl->info_lck);
    6196         394 : }
    6197             : 
    6198             : /*
    6199             :  * Fetch the start timestamp of the current chunk of WAL records, i.e. the
                     :  * value of XLogCtl->currentChunkStartTime last stored by
                     :  * SetCurrentChunkStartTime, read while holding info_lck.
                     :  *
    6200             :  * Startup process maintains an accurate local copy in XLogReceiptTime
    6201             :  */
    6202             : TimestampTz
    6203           0 : GetCurrentChunkReplayStartTime(void)
    6204             : {
    6205             :     TimestampTz xtime;
    6206             : 
    6207           0 :     SpinLockAcquire(&XLogCtl->info_lck);
    6208           0 :     xtime = XLogCtl->currentChunkStartTime;
    6209           0 :     SpinLockRelease(&XLogCtl->info_lck);
    6210             : 
    6211           0 :     return xtime;
    6212             : }
    6213             : 
    6214             : /*
    6215             :  * Returns time of receipt of current chunk of XLOG data, as well as
    6216             :  * whether it was received from streaming replication or from archives.
                     :  *
                     :  * *rtime receives XLogReceiptTime.  *fromStream is set to true when
                     :  * XLogReceiptSource is XLOG_FROM_STREAM and to false for any other source.
                     :  * Both pointers are written unconditionally, so neither may be NULL.
    6217             :  */
    6218             : void
    6219           0 : GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
    6220             : {
    6221             :     /*
    6222             :      * This must be executed in the startup process, since we don't export the
    6223             :      * relevant state to shared memory.
    6224             :      */
    6225             :     Assert(InRecovery);
    6226             : 
    6227           0 :     *rtime = XLogReceiptTime;
    6228           0 :     *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
    6229           0 : }
    6230             : 
    6231             : /*
                     :  * Raise an ERROR (ERRCODE_INVALID_PARAMETER_VALUE) if currValue, this
                     :  * server's setting of the integer parameter named param_name, is lower
                     :  * than minValue.  The message text presents minValue as the value on the
                     :  * master server.  Does nothing when currValue >= minValue.
                     :  *
                     :  * currValue and minValue each appear twice in the expansion (once in the
                     :  * comparison and once as a message argument), so pass expressions without
                     :  * side effects.  Both are formatted with %d.
                     :  *
    6232             :  * Note that text field supplied is a parameter name and does not require
    6233             :  * translation
    6234             :  */
    6235             : #define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
    6236             : do { \
    6237             :     if ((currValue) < (minValue)) \
    6238             :         ereport(ERROR, \
    6239             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
    6240             :                  errmsg("hot standby is not possible because %s = %d is a lower setting than on the master server (its value was %d)", \
    6241             :                         param_name, \
    6242             :                         currValue, \
    6243             :                         minValue))); \
    6244             : } while(0)
    6245             : 
    6246             : /*
    6247             :  * Check to see if required parameters are set high enough on this server
    6248             :  * for various aspects of recovery operation.
    6249             :  *
                     :  * Every check is conditional on ArchiveRecoveryRequested; when that is
                     :  * false this function does nothing.  A ControlFile wal_level of
                     :  * WAL_LEVEL_MINIMAL draws only a WARNING.  With EnableHotStandby also set,
                     :  * a ControlFile wal_level below WAL_LEVEL_REPLICA, or any of the five
                     :  * integer settings tested below being lower than the value recorded in
                     :  * ControlFile, raises an ERROR.
                     :  *
    6250             :  * Note that all the parameters which this function tests need to be
    6251             :  * listed in Administrator's Overview section in high-availability.sgml.
    6252             :  * If you change them, don't forget to update the list.
    6253             :  */
    6254             : static void
    6255         214 : CheckRequiredParameterValues(void)
    6256             : {
    6257             :     /*
    6258             :      * For archive recovery, the WAL must be generated with at least 'replica'
    6259             :      * wal_level.
                     :      * This case is reported as a WARNING only; execution continues.
    6260             :      */
    6261         214 :     if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
    6262             :     {
    6263           0 :         ereport(WARNING,
    6264             :                 (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
    6265             :                  errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
    6266             :     }
    6267             : 
    6268             :     /*
    6269             :      * For Hot Standby, the WAL must be generated with 'replica' mode, and we
    6270             :      * must have at least as many backend slots as the primary.
                     :      * Each comparison below is this server's current setting against the
                     :      * corresponding value stored in ControlFile.
    6271             :      */
    6272         214 :     if (ArchiveRecoveryRequested && EnableHotStandby)
    6273             :     {
    6274          86 :         if (ControlFile->wal_level < WAL_LEVEL_REPLICA)
    6275           0 :             ereport(ERROR,
    6276             :                     (errmsg("hot standby is not possible because wal_level was not set to \"replica\" or higher on the master server"),
    6277             :                      errhint("Either set wal_level to \"replica\" on the master, or turn off hot_standby here.")));
    6278             : 
    6279             :         /* We ignore autovacuum_max_workers when we make this test. */
    6280          86 :         RecoveryRequiresIntParameter("max_connections",
    6281             :                                      MaxConnections,
    6282             :                                      ControlFile->MaxConnections);
    6283          86 :         RecoveryRequiresIntParameter("max_worker_processes",
    6284             :                                      max_worker_processes,
    6285             :                                      ControlFile->max_worker_processes);
    6286          86 :         RecoveryRequiresIntParameter("max_wal_senders",
    6287             :                                      max_wal_senders,
    6288             :                                      ControlFile->max_wal_senders);
    6289          86 :         RecoveryRequiresIntParameter("max_prepared_transactions",
    6290             :                                      max_prepared_xacts,
    6291             :                                      ControlFile->max_prepared_xacts);
    6292          86 :         RecoveryRequiresIntParameter("max_locks_per_transaction",
    6293             :                                      max_locks_per_xact,
    6294             :                                      ControlFile->max_locks_per_xact);
    6295             :     }
    6296         214 : }
    6297             : 
    6298             : /*
    6299             :  * This must be called ONCE during postmaster or standalone-backend startup
    6300             :  */
    6301             : void
    6302        1390 : StartupXLOG(void)
    6303             : {
    6304             :     XLogCtlInsert *Insert;
    6305             :     CheckPoint  checkPoint;
    6306             :     bool        wasShutdown;
    6307        1390 :     bool        reachedRecoveryTarget = false;
    6308        1390 :     bool        haveBackupLabel = false;
    6309        1390 :     bool        haveTblspcMap = false;
    6310             :     XLogRecPtr  RecPtr,
    6311             :                 checkPointLoc,
    6312             :                 EndOfLog;
    6313             :     TimeLineID  EndOfLogTLI;
    6314             :     TimeLineID  PrevTimeLineID;
    6315             :     XLogRecord *record;
    6316             :     TransactionId oldestActiveXID;
    6317        1390 :     bool        backupEndRequired = false;
    6318        1390 :     bool        backupFromStandby = false;
    6319             :     DBState     dbstate_at_startup;
    6320             :     XLogReaderState *xlogreader;
    6321             :     XLogPageReadPrivate private;
    6322        1390 :     bool        fast_promoted = false;
    6323             :     struct stat st;
    6324             : 
    6325             :     /*
    6326             :      * We should have an aux process resource owner to use, and we should not
    6327             :      * be in a transaction that's installed some other resowner.
    6328             :      */
    6329             :     Assert(AuxProcessResourceOwner != NULL);
    6330             :     Assert(CurrentResourceOwner == NULL ||
    6331             :            CurrentResourceOwner == AuxProcessResourceOwner);
    6332        1390 :     CurrentResourceOwner = AuxProcessResourceOwner;
    6333             : 
    6334             :     /*
    6335             :      * Check that contents look valid.
    6336             :      */
    6337        1390 :     if (!XRecOffIsValid(ControlFile->checkPoint))
    6338           0 :         ereport(FATAL,
    6339             :                 (errmsg("control file contains invalid checkpoint location")));
    6340             : 
    6341        1390 :     switch (ControlFile->state)
    6342             :     {
    6343        1198 :         case DB_SHUTDOWNED:
    6344             : 
    6345             :             /*
    6346             :              * This is the expected case, so don't be chatty in standalone
    6347             :              * mode
    6348             :              */
    6349        1198 :             ereport(IsPostmasterEnvironment ? LOG : NOTICE,
    6350             :                     (errmsg("database system was shut down at %s",
    6351             :                             str_time(ControlFile->time))));
    6352        1198 :             break;
    6353             : 
    6354          16 :         case DB_SHUTDOWNED_IN_RECOVERY:
    6355          16 :             ereport(LOG,
    6356             :                     (errmsg("database system was shut down in recovery at %s",
    6357             :                             str_time(ControlFile->time))));
    6358          16 :             break;
    6359             : 
    6360           0 :         case DB_SHUTDOWNING:
    6361           0 :             ereport(LOG,
    6362             :                     (errmsg("database system shutdown was interrupted; last known up at %s",
    6363             :                             str_time(ControlFile->time))));
    6364           0 :             break;
    6365             : 
    6366           0 :         case DB_IN_CRASH_RECOVERY:
    6367           0 :             ereport(LOG,
    6368             :                     (errmsg("database system was interrupted while in recovery at %s",
    6369             :                             str_time(ControlFile->time)),
    6370             :                      errhint("This probably means that some data is corrupted and"
    6371             :                              " you will have to use the last backup for recovery.")));
    6372           0 :             break;
    6373             : 
    6374           2 :         case DB_IN_ARCHIVE_RECOVERY:
    6375           2 :             ereport(LOG,
    6376             :                     (errmsg("database system was interrupted while in recovery at log time %s",
    6377             :                             str_time(ControlFile->checkPointCopy.time)),
    6378             :                      errhint("If this has occurred more than once some data might be corrupted"
    6379             :                              " and you might need to choose an earlier recovery target.")));
    6380           2 :             break;
    6381             : 
    6382         174 :         case DB_IN_PRODUCTION:
    6383         174 :             ereport(LOG,
    6384             :                     (errmsg("database system was interrupted; last known up at %s",
    6385             :                             str_time(ControlFile->time))));
    6386         174 :             break;
    6387             : 
    6388           0 :         default:
    6389           0 :             ereport(FATAL,
    6390             :                     (errmsg("control file contains invalid database cluster state")));
    6391             :     }
    6392             : 
    6393             :     /* This is just to allow attaching to startup process with a debugger */
    6394             : #ifdef XLOG_REPLAY_DELAY
    6395             :     if (ControlFile->state != DB_SHUTDOWNED)
    6396             :         pg_usleep(60000000L);
    6397             : #endif
    6398             : 
    6399             :     /*
    6400             :      * Verify that pg_wal and pg_wal/archive_status exist.  In cases where
    6401             :      * someone has performed a copy for PITR, these directories may have been
    6402             :      * excluded and need to be re-created.
    6403             :      */
    6404        1390 :     ValidateXLOGDirectoryStructure();
    6405             : 
    6406             :     /*----------
    6407             :      * If we previously crashed, perform a couple of actions:
    6408             :      *
    6409             :      * - The pg_wal directory may still include some temporary WAL segments
    6410             :      *   used when creating a new segment, so perform some clean up to not
    6411             :      *   bloat this path.  This is done first as there is no point to sync
    6412             :      *   this temporary data.
    6413             :      *
    6414             :      * - There might be data which we had written, intending to fsync it, but
    6415             :      *   which we had not actually fsync'd yet.  Therefore, a power failure in
    6416             :      *   the near future might cause earlier unflushed writes to be lost, even
    6417             :      *   though more recent data written to disk from here on would be
    6418             :      *   persisted.  To avoid that, fsync the entire data directory.
    6419             :      */
    6420        1390 :     if (ControlFile->state != DB_SHUTDOWNED &&
    6421         192 :         ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
    6422             :     {
    6423         176 :         RemoveTempXlogFiles();
    6424         176 :         SyncDataDirectory();
    6425             :     }
    6426             : 
    6427             :     /*
    6428             :      * Initialize on the assumption we want to recover to the latest timeline
    6429             :      * that's active according to pg_control.
    6430             :      */
    6431        2780 :     if (ControlFile->minRecoveryPointTLI >
    6432        1390 :         ControlFile->checkPointCopy.ThisTimeLineID)
    6433           2 :         recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
    6434             :     else
    6435        1388 :         recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
    6436             : 
    6437             :     /*
    6438             :      * Check for signal files, and if so set up state for offline recovery
    6439             :      */
    6440        1390 :     readRecoverySignalFile();
    6441        1390 :     validateRecoveryParameters();
    6442             : 
    6443        1390 :     if (ArchiveRecoveryRequested)
    6444             :     {
    6445          82 :         if (StandbyModeRequested)
    6446          80 :             ereport(LOG,
    6447             :                     (errmsg("entering standby mode")));
    6448           2 :         else if (recoveryTarget == RECOVERY_TARGET_XID)
    6449           0 :             ereport(LOG,
    6450             :                     (errmsg("starting point-in-time recovery to XID %u",
    6451             :                             recoveryTargetXid)));
    6452           2 :         else if (recoveryTarget == RECOVERY_TARGET_TIME)
    6453           0 :             ereport(LOG,
    6454             :                     (errmsg("starting point-in-time recovery to %s",
    6455             :                             timestamptz_to_str(recoveryTargetTime))));
    6456           2 :         else if (recoveryTarget == RECOVERY_TARGET_NAME)
    6457           2 :             ereport(LOG,
    6458             :                     (errmsg("starting point-in-time recovery to \"%s\"",
    6459             :                             recoveryTargetName)));
    6460           0 :         else if (recoveryTarget == RECOVERY_TARGET_LSN)
    6461           0 :             ereport(LOG,
    6462             :                     (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
    6463             :                             (uint32) (recoveryTargetLSN >> 32),
    6464             :                             (uint32) recoveryTargetLSN)));
    6465           0 :         else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
    6466           0 :             ereport(LOG,
    6467             :                     (errmsg("starting point-in-time recovery to earliest consistent point")));
    6468             :         else
    6469           0 :             ereport(LOG,
    6470             :                     (errmsg("starting archive recovery")));
    6471             :     }
    6472             : 
    6473             :     /*
    6474             :      * Take ownership of the wakeup latch if we're going to sleep during
    6475             :      * recovery.
    6476             :      */
    6477        1390 :     if (ArchiveRecoveryRequested)
    6478          82 :         OwnLatch(&XLogCtl->recoveryWakeupLatch);
    6479             : 
    6480             :     /* Set up XLOG reader facility */
    6481        1390 :     MemSet(&private, 0, sizeof(XLogPageReadPrivate));
    6482             :     xlogreader =
    6483        1390 :         XLogReaderAllocate(wal_segment_size, NULL,
    6484        1390 :                            XL_ROUTINE(.page_read = &XLogPageRead,
    6485             :                                       .segment_open = NULL,
    6486             :                                       .segment_close = wal_segment_close),
    6487             :                            &private);
    6488        1390 :     if (!xlogreader)
    6489           0 :         ereport(ERROR,
    6490             :                 (errcode(ERRCODE_OUT_OF_MEMORY),
    6491             :                  errmsg("out of memory"),
    6492             :                  errdetail("Failed while allocating a WAL reading processor.")));
    6493        1390 :     xlogreader->system_identifier = ControlFile->system_identifier;
    6494             : 
    6495             :     /*
    6496             :      * Allocate two page buffers dedicated to WAL consistency checks.  We do
    6497             :      * it this way, rather than just making static arrays, for two reasons:
    6498             :      * (1) no need to waste the storage in most instantiations of the backend;
    6499             :      * (2) a static char array isn't guaranteed to have any particular
    6500             :      * alignment, whereas palloc() will provide MAXALIGN'd storage.
    6501             :      */
    6502        1390 :     replay_image_masked = (char *) palloc(BLCKSZ);
    6503        1390 :     master_image_masked = (char *) palloc(BLCKSZ);
    6504             : 
    6505        1390 :     if (read_backup_label(&checkPointLoc, &backupEndRequired,
    6506             :                           &backupFromStandby))
    6507             :     {
    6508          58 :         List       *tablespaces = NIL;
    6509             : 
    6510             :         /*
    6511             :          * Archive recovery was requested, and thanks to the backup label
    6512             :          * file, we know how far we need to replay to reach consistency. Enter
    6513             :          * archive recovery directly.
    6514             :          */
    6515          58 :         InArchiveRecovery = true;
    6516          58 :         if (StandbyModeRequested)
    6517          56 :             StandbyMode = true;
    6518             : 
    6519             :         /*
    6520             :          * When a backup_label file is present, we want to roll forward from
    6521             :          * the checkpoint it identifies, rather than using pg_control.
    6522             :          */
    6523          58 :         record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
    6524          58 :         if (record != NULL)
    6525             :         {
    6526          58 :             memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
    6527          58 :             wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
    6528          58 :             ereport(DEBUG1,
    6529             :                     (errmsg("checkpoint record is at %X/%X",
    6530             :                             (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
    6531          58 :             InRecovery = true;  /* force recovery even if SHUTDOWNED */
    6532             : 
    6533             :             /*
    6534             :              * Make sure that REDO location exists. This may not be the case
    6535             :              * if there was a crash during an online backup, which left a
    6536             :              * backup_label around that references a WAL segment that's
    6537             :              * already been archived.
    6538             :              */
    6539          58 :             if (checkPoint.redo < checkPointLoc)
    6540             :             {
    6541          58 :                 XLogBeginRead(xlogreader, checkPoint.redo);
    6542          58 :                 if (!ReadRecord(xlogreader, LOG, false))
    6543           0 :                     ereport(FATAL,
    6544             :                             (errmsg("could not find redo location referenced by checkpoint record"),
    6545             :                              errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
    6546             :                                      "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
    6547             :                                      "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
    6548             :                                      DataDir, DataDir, DataDir)));
    6549             :             }
    6550             :         }
    6551             :         else
    6552             :         {
    6553           0 :             ereport(FATAL,
    6554             :                     (errmsg("could not locate required checkpoint record"),
    6555             :                      errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
    6556             :                              "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
    6557             :                              "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
    6558             :                              DataDir, DataDir, DataDir)));
    6559             :             wasShutdown = false;    /* keep compiler quiet */
    6560             :         }
    6561             : 
    6562             :         /* read the tablespace_map file if present and create symlinks. */
    6563          58 :         if (read_tablespace_map(&tablespaces))
    6564             :         {
    6565             :             ListCell   *lc;
    6566             : 
    6567           0 :             foreach(lc, tablespaces)
    6568             :             {
    6569           0 :                 tablespaceinfo *ti = lfirst(lc);
    6570             :                 char       *linkloc;
    6571             : 
    6572           0 :                 linkloc = psprintf("pg_tblspc/%s", ti->oid);
    6573             : 
    6574             :                 /*
    6575             :                  * Remove the existing symlink, if any, and create the symlink
    6576             :                  * under PGDATA.
    6577             :                  */
    6578           0 :                 remove_tablespace_symlink(linkloc);
    6579             : 
    6580           0 :                 if (symlink(ti->path, linkloc) < 0)
    6581           0 :                     ereport(ERROR,
    6582             :                             (errcode_for_file_access(),
    6583             :                              errmsg("could not create symbolic link \"%s\": %m",
    6584             :                                     linkloc)));
    6585             : 
    6586           0 :                 pfree(ti->oid);
    6587           0 :                 pfree(ti->path);
    6588           0 :                 pfree(ti);
    6589             :             }
    6590             : 
    6591             :             /* set flag to delete it later */
    6592           0 :             haveTblspcMap = true;
    6593             :         }
    6594             : 
    6595             :         /* set flag to delete it later */
    6596          58 :         haveBackupLabel = true;
    6597             :     }
    6598             :     else
    6599             :     {
    6600             :         /*
    6601             :          * If tablespace_map file is present without backup_label file, there
    6602             :          * is no use of such file.  There is no harm in retaining it, but it
    6603             :          * is better to get rid of the map file so that we don't have any
    6604             :          * redundant file in data directory and it will avoid any sort of
    6605             :          * confusion.  It seems prudent though to just rename the file out of
    6606             :          * the way rather than delete it completely, also we ignore any error
    6607             :          * that occurs in rename operation as even if map file is present
    6608             :          * without backup_label file, it is harmless.
    6609             :          */
    6610        1332 :         if (stat(TABLESPACE_MAP, &st) == 0)
    6611             :         {
    6612           0 :             unlink(TABLESPACE_MAP_OLD);
    6613           0 :             if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
    6614           0 :                 ereport(LOG,
    6615             :                         (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
    6616             :                                 TABLESPACE_MAP, BACKUP_LABEL_FILE),
    6617             :                          errdetail("File \"%s\" was renamed to \"%s\".",
    6618             :                                    TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
    6619             :             else
    6620           0 :                 ereport(LOG,
    6621             :                         (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
    6622             :                                 TABLESPACE_MAP, BACKUP_LABEL_FILE),
    6623             :                          errdetail("Could not rename file \"%s\" to \"%s\": %m.",
    6624             :                                    TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
    6625             :         }
    6626             : 
    6627             :         /*
    6628             :          * It's possible that archive recovery was requested, but we don't
    6629             :          * know how far we need to replay the WAL before we reach consistency.
    6630             :          * This can happen for example if a base backup is taken from a
    6631             :          * running server using an atomic filesystem snapshot, without calling
    6632             :          * pg_start/stop_backup. Or if you just kill a running master server
    6633             :          * and put it into archive recovery by creating a recovery signal
    6634             :          * file.
    6635             :          *
    6636             :          * Our strategy in that case is to perform crash recovery first,
    6637             :          * replaying all the WAL present in pg_wal, and only enter archive
    6638             :          * recovery after that.
    6639             :          *
    6640             :          * But usually we already know how far we need to replay the WAL (up
    6641             :          * to minRecoveryPoint, up to backupEndPoint, or until we see an
    6642             :          * end-of-backup record), and we can enter archive recovery directly.
    6643             :          */
    6644        1332 :         if (ArchiveRecoveryRequested &&
    6645          24 :             (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
    6646           6 :              ControlFile->backupEndRequired ||
    6647           6 :              ControlFile->backupEndPoint != InvalidXLogRecPtr ||
    6648           6 :              ControlFile->state == DB_SHUTDOWNED))
    6649             :         {
    6650          22 :             InArchiveRecovery = true;
    6651          22 :             if (StandbyModeRequested)
    6652          22 :                 StandbyMode = true;
    6653             :         }
    6654             : 
    6655             :         /* Get the last valid checkpoint record. */
    6656        1332 :         checkPointLoc = ControlFile->checkPoint;
    6657        1332 :         RedoStartLSN = ControlFile->checkPointCopy.redo;
    6658        1332 :         record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
    6659        1332 :         if (record != NULL)
    6660             :         {
    6661        1332 :             ereport(DEBUG1,
    6662             :                     (errmsg("checkpoint record is at %X/%X",
    6663             :                             (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
    6664             :         }
    6665             :         else
    6666             :         {
    6667             :             /*
    6668             :              * We used to attempt to go back to a secondary checkpoint record
    6669             :              * here, but only when not in standby mode. We now just fail if we
    6670             :              * can't read the last checkpoint because this allows us to
    6671             :              * simplify processing around checkpoints.
    6672             :              */
    6673           0 :             ereport(PANIC,
    6674             :                     (errmsg("could not locate a valid checkpoint record")));
    6675             :         }
    6676        1332 :         memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
    6677        1332 :         wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
    6678             :     }
    6679             : 
    6680             :     /*
    6681             :      * Clear out any old relcache cache files.  This is *necessary* if we do
    6682             :      * any WAL replay, since that would probably result in the cache files
    6683             :      * being out of sync with database reality.  In theory we could leave them
    6684             :      * in place if the database had been cleanly shut down, but it seems
    6685             :      * safest to just remove them always and let them be rebuilt during the
    6686             :      * first backend startup.  These files need to be removed from all
    6687             :      * directories including pg_tblspc; however, the symlinks are created
    6688             :      * only after reading the tablespace_map file in case of archive recovery
    6689             :      * from backup, so we need to clear old relcache files here, after
    6690             :      * creating the symlinks.
    6691             :      */
    6692        1390 :     RelationCacheInitFileRemove();
    6693             : 
    6694             :     /*
    6695             :      * If the location of the checkpoint record is not on the expected
    6696             :      * timeline in the history of the requested timeline, we cannot proceed:
    6697             :      * the backup is not part of the history of the requested timeline.
    6698             :      */
    6699             :     Assert(expectedTLEs);       /* was initialized by reading checkpoint
    6700             :                                  * record */
    6701        1390 :     if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
    6702        1390 :         checkPoint.ThisTimeLineID)
    6703             :     {
    6704             :         XLogRecPtr  switchpoint;
    6705             : 
    6706             :         /*
    6707             :          * tliSwitchPoint will throw an error if the checkpoint's timeline is
    6708             :          * not in expectedTLEs at all.
    6709             :          */
    6710           0 :         switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
    6711           0 :         ereport(FATAL,
    6712             :                 (errmsg("requested timeline %u is not a child of this server's history",
    6713             :                         recoveryTargetTLI),
    6714             :                  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
    6715             :                            (uint32) (ControlFile->checkPoint >> 32),
    6716             :                            (uint32) ControlFile->checkPoint,
    6717             :                            ControlFile->checkPointCopy.ThisTimeLineID,
    6718             :                            (uint32) (switchpoint >> 32),
    6719             :                            (uint32) switchpoint)));
    6720             :     }
    6721             : 
    6722             :     /*
    6723             :      * The min recovery point should be part of the requested timeline's
    6724             :      * history, too.
    6725             :      */
    6726        1390 :     if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
    6727          20 :         tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
    6728          20 :         ControlFile->minRecoveryPointTLI)
    6729           0 :         ereport(FATAL,
    6730             :                 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
    6731             :                         recoveryTargetTLI,
    6732             :                         (uint32) (ControlFile->minRecoveryPoint >> 32),
    6733             :                         (uint32) ControlFile->minRecoveryPoint,
    6734             :                         ControlFile->minRecoveryPointTLI)));
    6735             : 
    6736        1390 :     LastRec = RecPtr = checkPointLoc;
    6737             : 
    6738        1390 :     ereport(DEBUG1,
    6739             :             (errmsg_internal("redo record is at %X/%X; shutdown %s",
    6740             :                              (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
    6741             :                              wasShutdown ? "true" : "false")));
    6742        1390 :     ereport(DEBUG1,
    6743             :             (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
    6744             :                              U64FromFullTransactionId(checkPoint.nextFullXid),
    6745             :                              checkPoint.nextOid)));
    6746        1390 :     ereport(DEBUG1,
    6747             :             (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
    6748             :                              checkPoint.nextMulti, checkPoint.nextMultiOffset)));
    6749        1390 :     ereport(DEBUG1,
    6750             :             (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
    6751             :                              checkPoint.oldestXid, checkPoint.oldestXidDB)));
    6752        1390 :     ereport(DEBUG1,
    6753             :             (errmsg_internal("oldest MultiXactId: %u, in database %u",
    6754             :                              checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
    6755        1390 :     ereport(DEBUG1,
    6756             :             (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
    6757             :                              checkPoint.oldestCommitTsXid,
    6758             :                              checkPoint.newestCommitTsXid)));
    6759        1390 :     if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextFullXid)))
    6760           0 :         ereport(PANIC,
    6761             :                 (errmsg("invalid next transaction ID")));
    6762             : 
    6763             :     /* initialize shared memory variables from the checkpoint record */
    6764        1390 :     ShmemVariableCache->nextFullXid = checkPoint.nextFullXid;
    6765        1390 :     ShmemVariableCache->nextOid = checkPoint.nextOid;
    6766        1390 :     ShmemVariableCache->oidCount = 0;
    6767        1390 :     MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
    6768        1390 :     AdvanceOldestClogXid(checkPoint.oldestXid);
    6769        1390 :     SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
    6770        1390 :     SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
    6771        1390 :     SetCommitTsLimit(checkPoint.oldestCommitTsXid,
    6772             :                      checkPoint.newestCommitTsXid);
    6773        1390 :     XLogCtl->ckptFullXid = checkPoint.nextFullXid;
    6774             : 
    6775             :     /*
    6776             :      * Initialize replication slots, before there's a chance to remove
    6777             :      * required resources.
    6778             :      */
    6779        1390 :     StartupReplicationSlots();
    6780             : 
    6781             :     /*
    6782             :      * Startup logical state, needs to be setup now so we have proper data
    6783             :      * during crash recovery.
    6784             :      */
    6785        1390 :     StartupReorderBuffer();
    6786             : 
    6787             :     /*
    6788             :      * Startup MultiXact. We need to do this early to be able to replay
    6789             :      * truncations.
    6790             :      */
    6791        1390 :     StartupMultiXact();
    6792             : 
    6793             :     /*
    6794             :      * Ditto for commit timestamps.  Activate the facility if the setting is
    6795             :      * enabled in the control file, as there should be no tracking of commit
    6796             :      * timestamps done when the setting was disabled.  This facility can be
    6797             :      * started or stopped when replaying a XLOG_PARAMETER_CHANGE record.
    6798             :      */
    6799        1390 :     if (ControlFile->track_commit_timestamp)
    6800          16 :         StartupCommitTs();
    6801             : 
    6802             :     /*
    6803             :      * Recover knowledge about replay progress of known replication partners.
    6804             :      */
    6805        1390 :     StartupReplicationOrigin();
    6806             : 
    6807             :     /*
    6808             :      * Initialize unlogged LSN. On a clean shutdown, it's restored from the
    6809             :      * control file. On recovery, all unlogged relations are blown away, so
    6810             :      * the unlogged LSN counter can be reset too.
    6811             :      */
    6812        1390 :     if (ControlFile->state == DB_SHUTDOWNED)
    6813        1198 :         XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
    6814             :     else
    6815         192 :         XLogCtl->unloggedLSN = FirstNormalUnloggedLSN;
    6816             : 
    6817             :     /*
    6818             :      * We must replay WAL entries using the same TimeLineID they were created
    6819             :      * under, so temporarily adopt the TLI indicated by the checkpoint (see
    6820             :      * also xlog_redo()).
    6821             :      */
    6822        1390 :     ThisTimeLineID = checkPoint.ThisTimeLineID;
    6823             : 
    6824             :     /*
    6825             :      * Copy any missing timeline history files between 'now' and the recovery
    6826             :      * target timeline from archive to pg_wal. While we don't need those files
    6827             :      * ourselves - the history file of the recovery target timeline covers all
    6828             :      * the previous timelines in the history too - a cascading standby server
    6829             :      * might be interested in them. Or, if you archive the WAL from this
    6830             :      * server to a different archive than the master, it'd be good for all the
    6831             :      * history files to get archived there after failover, so that you can use
    6832             :      * one of the old timelines as a PITR target. Timeline history files are
    6833             :      * small, so it's better to copy them unnecessarily than not copy them and
    6834             :      * regret it later.
    6835             :      */
    6836        1390 :     restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
    6837             : 
    6838             :     /*
    6839             :      * Before running in recovery, scan pg_twophase and fill in its status to
    6840             :      * be able to work on entries generated by redo.  Doing a scan before
    6841             :      * taking any recovery action has the merit to discard any 2PC files that
    6842             :      * are newer than the first record to replay, saving from any conflicts at
    6843             :      * replay.  This avoids as well any subsequent scans when doing recovery
    6844             :      * of the on-disk two-phase data.
    6845             :      */
    6846        1390 :     restoreTwoPhaseData();
    6847             : 
    6848        1390 :     lastFullPageWrites = checkPoint.fullPageWrites;
    6849             : 
    6850        1390 :     RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
    6851        1390 :     doPageWrites = lastFullPageWrites;
    6852             : 
    6853        1390 :     if (RecPtr < checkPoint.redo)
    6854           0 :         ereport(PANIC,
    6855             :                 (errmsg("invalid redo in checkpoint record")));
    6856             : 
    6857             :     /*
    6858             :      * Check whether we need to force recovery from WAL.  If it appears to
    6859             :      * have been a clean shutdown and we did not have a recovery signal file,
    6860             :      * then assume no recovery needed.
    6861             :      */
    6862        1390 :     if (checkPoint.redo < RecPtr)
    6863             :     {
    6864         100 :         if (wasShutdown)
    6865           0 :             ereport(PANIC,
    6866             :                     (errmsg("invalid redo record in shutdown checkpoint")));
    6867         100 :         InRecovery = true;
    6868             :     }
    6869        1290 :     else if (ControlFile->state != DB_SHUTDOWNED)
    6870          92 :         InRecovery = true;
    6871        1198 :     else if (ArchiveRecoveryRequested)
    6872             :     {
    6873             :         /* force recovery due to presence of recovery signal file */
    6874           4 :         InRecovery = true;
    6875             :     }
    6876             : 
    6877             :     /* REDO */
    6878        1390 :     if (InRecovery)
    6879             :     {
    6880             :         int         rmid;
    6881             : 
    6882             :         /*
    6883             :          * Update pg_control to show that we are recovering and to show the
    6884             :          * selected checkpoint as the place we are starting from. We also mark
    6885             :          * pg_control with any minimum recovery stop point obtained from a
    6886             :          * backup history file.
    6887             :          */
    6888         196 :         dbstate_at_startup = ControlFile->state;
    6889         196 :         if (InArchiveRecovery)
    6890             :         {
    6891          80 :             ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
    6892             : 
    6893          80 :             SpinLockAcquire(&XLogCtl->info_lck);
    6894          80 :             XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
    6895          80 :             SpinLockRelease(&XLogCtl->info_lck);
    6896             :         }
    6897             :         else
    6898             :         {
    6899         116 :             ereport(LOG,
    6900             :                     (errmsg("database system was not properly shut down; "
    6901             :                             "automatic recovery in progress")));
    6902         116 :             if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
    6903           2 :                 ereport(LOG,
    6904             :                         (errmsg("crash recovery starts in timeline %u "
    6905             :                                 "and has target timeline %u",
    6906             :                                 ControlFile->checkPointCopy.ThisTimeLineID,
    6907             :                                 recoveryTargetTLI)));
    6908         116 :             ControlFile->state = DB_IN_CRASH_RECOVERY;
    6909             : 
    6910         116 :             SpinLockAcquire(&XLogCtl->info_lck);
    6911         116 :             XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
    6912         116 :             SpinLockRelease(&XLogCtl->info_lck);
    6913             :         }
    6914         196 :         ControlFile->checkPoint = checkPointLoc;
    6915         196 :         ControlFile->checkPointCopy = checkPoint;
    6916         196 :         if (InArchiveRecovery)
    6917             :         {
    6918             :             /* initialize minRecoveryPoint if not set yet */
    6919          80 :             if (ControlFile->minRecoveryPoint < checkPoint.redo)
    6920             :             {
    6921          62 :                 ControlFile->minRecoveryPoint = checkPoint.redo;
    6922          62 :                 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
    6923             :             }
    6924             :         }
    6925             : 
    6926             :         /*
    6927             :          * Set backupStartPoint if we're starting recovery from a base backup.
    6928             :          *
    6929             :          * Also set backupEndPoint and use minRecoveryPoint as the backup end
    6930             :          * location if we're starting recovery from a base backup which was
    6931             :          * taken from a standby. In this case, the database system status in
    6932             :          * pg_control must indicate that the database was already in recovery.
    6933             :          * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
    6934             :          * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
    6935             :          * before reaching this point; e.g. because restore_command or
    6936             :          * primary_conninfo were faulty.
    6937             :          *
    6938             :          * Any other state indicates that the backup somehow became corrupted
    6939             :          * and we can't sensibly continue with recovery.
    6940             :          */
    6941         196 :         if (haveBackupLabel)
    6942             :         {
    6943          58 :             ControlFile->backupStartPoint = checkPoint.redo;
    6944          58 :             ControlFile->backupEndRequired = backupEndRequired;
    6945             : 
    6946          58 :             if (backupFromStandby)
    6947             :             {
    6948           0 :                 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
    6949             :                     dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
    6950           0 :                     ereport(FATAL,
    6951             :                             (errmsg("backup_label contains data inconsistent with control file"),
    6952             :                              errhint("This means that the backup is corrupted and you will "
    6953             :                                      "have to use another backup for recovery.")));
    6954           0 :                 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
    6955             :             }
    6956             :         }
    6957         196 :         ControlFile->time = (pg_time_t) time(NULL);
    6958             :         /* No need to hold ControlFileLock yet, we aren't up far enough */
    6959         196 :         UpdateControlFile();
    6960             : 
    6961             :         /*
    6962             :          * Initialize our local copy of minRecoveryPoint.  When doing crash
    6963             :          * recovery we want to replay up to the end of WAL.  Particularly, in
    6964             :          * the case of a promoted standby minRecoveryPoint value in the
    6965             :          * control file is only updated after the first checkpoint.  However,
    6966             :          * if the instance crashes before the first post-recovery checkpoint
    6967             :          * is completed then recovery will use a stale location causing the
    6968             :          * startup process to think that there are still invalid page
    6969             :          * references when checking for data consistency.
    6970             :          */
    6971         196 :         if (InArchiveRecovery)
    6972             :         {
    6973          80 :             minRecoveryPoint = ControlFile->minRecoveryPoint;
    6974          80 :             minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    6975             :         }
    6976             :         else
    6977             :         {
    6978         116 :             minRecoveryPoint = InvalidXLogRecPtr;
    6979         116 :             minRecoveryPointTLI = 0;
    6980             :         }
    6981             : 
    6982             :         /*
    6983             :          * Reset pgstat data, because it may be invalid after recovery.
    6984             :          */
    6985         196 :         pgstat_reset_all();
    6986             : 
    6987             :         /*
    6988             :          * If there was a backup label file, it's done its job and the info
    6989             :          * has now been propagated into pg_control.  We must get rid of the
    6990             :          * label file so that if we crash during recovery, we'll pick up at
    6991             :          * the latest recovery restartpoint instead of going all the way back
    6992             :          * to the backup start point.  It seems prudent though to just rename
    6993             :          * the file out of the way rather than delete it completely.
    6994             :          */
    6995         196 :         if (haveBackupLabel)
    6996             :         {
    6997          58 :             unlink(BACKUP_LABEL_OLD);
    6998          58 :             durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
    6999             :         }
    7000             : 
    7001             :         /*
    7002             :          * If there was a tablespace_map file, it's done its job and the
    7003             :          * symlinks have been created.  We must get rid of the map file so
    7004             :          * that if we crash during recovery, we don't create symlinks again.
    7005             :          * It seems prudent though to just rename the file out of the way
    7006             :          * rather than delete it completely.
    7007             :          */
    7008         196 :         if (haveTblspcMap)
    7009             :         {
    7010           0 :             unlink(TABLESPACE_MAP_OLD);
    7011           0 :             durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
    7012             :         }
    7013             : 
    7014             :         /* Check that the GUCs used to generate the WAL allow recovery */
    7015         196 :         CheckRequiredParameterValues();
    7016             : 
    7017             :         /*
    7018             :          * We're in recovery, so unlogged relations may be trashed and must be
    7019             :          * reset.  This should be done BEFORE allowing Hot Standby
    7020             :          * connections, so that read-only backends don't try to read whatever
    7021             :          * garbage is left over from before.
    7022             :          */
    7023         196 :         ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
    7024             : 
    7025             :         /*
    7026             :          * Likewise, delete any saved transaction snapshot files that got left
    7027             :          * behind by crashed backends.
    7028             :          */
    7029         196 :         DeleteAllExportedSnapshotFiles();
    7030             : 
    7031             :         /*
    7032             :          * Initialize for Hot Standby, if enabled. We won't let backends in
    7033             :          * yet, not until we've reached the min recovery point specified in
    7034             :          * control file and we've established a recovery snapshot from a
    7035             :          * running-xacts WAL record.
    7036             :          */
    7037         196 :         if (ArchiveRecoveryRequested && EnableHotStandby)
    7038             :         {
    7039             :             TransactionId *xids;
    7040             :             int         nxids;
    7041             : 
    7042          82 :             ereport(DEBUG1,
    7043             :                     (errmsg("initializing for hot standby")));
    7044             : 
    7045          82 :             InitRecoveryTransactionEnvironment();
    7046             : 
    7047          82 :             if (wasShutdown)
    7048          12 :                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
    7049             :             else
    7050          70 :                 oldestActiveXID = checkPoint.oldestActiveXid;
    7051             :             Assert(TransactionIdIsValid(oldestActiveXID));
    7052             : 
    7053             :             /* Tell procarray about the range of xids it has to deal with */
    7054          82 :             ProcArrayInitRecovery(XidFromFullTransactionId(ShmemVariableCache->nextFullXid));
    7055             : 
    7056             :             /*
    7057             :              * Startup commit log and subtrans only.  MultiXact and commit
    7058             :              * timestamp have already been started up and other SLRUs are not
    7059             :              * maintained during recovery and need not be started yet.
    7060             :              */
    7061          82 :             StartupCLOG();
    7062          82 :             StartupSUBTRANS(oldestActiveXID);
    7063             : 
    7064             :             /*
    7065             :              * If we're beginning at a shutdown checkpoint, we know that
    7066             :              * nothing was running on the master at this point. So fake-up an
    7067             :              * empty running-xacts record and use that here and now. Recover
    7068             :              * additional standby state for prepared transactions.
    7069             :              */
    7070          82 :             if (wasShutdown)
    7071             :             {
    7072             :                 RunningTransactionsData running;
    7073             :                 TransactionId latestCompletedXid;
    7074             : 
    7075             :                 /*
    7076             :                  * Construct a RunningTransactions snapshot representing a
    7077             :                  * shut down server, with only prepared transactions still
    7078             :                  * alive. We're never overflowed at this point because all
    7079             :                  * subxids are listed with their parent prepared transactions.
    7080             :                  */
    7081          12 :                 running.xcnt = nxids;
    7082          12 :                 running.subxcnt = 0;
    7083          12 :                 running.subxid_overflow = false;
    7084          12 :                 running.nextXid = XidFromFullTransactionId(checkPoint.nextFullXid);
    7085          12 :                 running.oldestRunningXid = oldestActiveXID;
    7086          12 :                 latestCompletedXid = XidFromFullTransactionId(checkPoint.nextFullXid);
    7087          12 :                 TransactionIdRetreat(latestCompletedXid);
    7088             :                 Assert(TransactionIdIsNormal(latestCompletedXid));
    7089          12 :                 running.latestCompletedXid = latestCompletedXid;
    7090          12 :                 running.xids = xids;
    7091             : 
    7092          12 :                 ProcArrayApplyRecoveryInfo(&running);
    7093             : 
    7094          12 :                 StandbyRecoverPreparedTransactions();
    7095             :             }
    7096             :         }
    7097             : 
    7098             :         /* Initialize resource managers */
    7099        4508 :         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
    7100             :         {
    7101        4312 :             if (RmgrTable[rmid].rm_startup != NULL)
    7102         784 :                 RmgrTable[rmid].rm_startup();
    7103             :         }
    7104             : 
    7105             :         /*
    7106             :          * Initialize shared variables for tracking progress of WAL replay, as
    7107             :          * if we had just replayed the record before the REDO location (or the
    7108             :          * checkpoint record itself, if it's a shutdown checkpoint).
    7109             :          */
    7110         196 :         SpinLockAcquire(&XLogCtl->info_lck);
    7111         196 :         if (checkPoint.redo < RecPtr)
    7112         100 :             XLogCtl->replayEndRecPtr = checkPoint.redo;
    7113             :         else
    7114          96 :             XLogCtl->replayEndRecPtr = EndRecPtr;
    7115         196 :         XLogCtl->replayEndTLI = ThisTimeLineID;
    7116         196 :         XLogCtl->lastReplayedEndRecPtr = XLogCtl->replayEndRecPtr;
    7117         196 :         XLogCtl->lastReplayedTLI = XLogCtl->replayEndTLI;
    7118         196 :         XLogCtl->recoveryLastXTime = 0;
    7119         196 :         XLogCtl->currentChunkStartTime = 0;
    7120         196 :         XLogCtl->recoveryPause = false;
    7121         196 :         SpinLockRelease(&XLogCtl->info_lck);
    7122             : 
    7123             :         /* Also ensure XLogReceiptTime has a sane value */
    7124         196 :         XLogReceiptTime = GetCurrentTimestamp();
    7125             : 
    7126             :         /*
    7127             :          * Let postmaster know we've started redo now, so that it can launch
    7128             :          * checkpointer to perform restartpoints.  We don't bother during
    7129             :          * crash recovery as restartpoints can only be performed during
    7130             :          * archive recovery.  And we'd like to keep crash recovery simple, to
    7131             :          * avoid introducing bugs that could affect you when recovering after
    7132             :          * a crash.
    7133             :          *
    7134             :          * After this point, we can no longer assume that we're the only
    7135             :          * process in addition to postmaster!  Also, fsync requests are
    7136             :          * subsequently to be handled by the checkpointer, not locally.
    7137             :          */
    7138         196 :         if (ArchiveRecoveryRequested && IsUnderPostmaster)
    7139             :         {
    7140          82 :             PublishStartupProcessInformation();
    7141          82 :             EnableSyncRequestForwarding();
    7142          82 :             SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
    7143          82 :             bgwriterLaunched = true;
    7144             :         }
    7145             : 
    7146             :         /*
    7147             :          * Allow read-only connections immediately if we're consistent
    7148             :          * already.
    7149             :          */
    7150         196 :         CheckRecoveryConsistency();
    7151             : 
    7152             :         /*
    7153             :          * Find the first record that logically follows the checkpoint --- it
    7154             :          * might physically precede it, though.
    7155             :          */
    7156         196 :         if (checkPoint.redo < RecPtr)
    7157             :         {
    7158             :             /* back up to find the record */
    7159         100 :             XLogBeginRead(xlogreader, checkPoint.redo);
    7160         100 :             record = ReadRecord(xlogreader, PANIC, false);
    7161             :         }
    7162             :         else
    7163             :         {
    7164             :             /* just have to read next record after CheckPoint */
    7165          96 :             record = ReadRecord(xlogreader, LOG, false);
    7166             :         }
    7167             : 
    7168         196 :         if (record != NULL)
    7169             :         {
    7170             :             ErrorContextCallback errcallback;
    7171             :             TimestampTz xtime;
    7172             : 
    7173         188 :             InRedo = true;
    7174             : 
    7175         188 :             ereport(LOG,
    7176             :                     (errmsg("redo starts at %X/%X",
    7177             :                             (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
    7178             : 
    7179             :             /*
    7180             :              * main redo apply loop
    7181             :              */
    7182             :             do
    7183             :             {
    7184      573454 :                 bool        switchedTLI = false;
    7185             : 
    7186             : #ifdef WAL_DEBUG
    7187             :                 if (XLOG_DEBUG ||
    7188             :                     (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
    7189             :                     (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
    7190             :                 {
    7191             :                     StringInfoData buf;
    7192             : 
    7193             :                     initStringInfo(&buf);
    7194             :                     appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
    7195             :                                      (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
    7196             :                                      (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
    7197             :                     xlog_outrec(&buf, xlogreader);
    7198             :                     appendStringInfoString(&buf, " - ");
    7199             :                     xlog_outdesc(&buf, xlogreader);
    7200             :                     elog(LOG, "%s", buf.data);
    7201             :                     pfree(buf.data);
    7202             :                 }
    7203             : #endif
    7204             : 
    7205             :                 /* Handle interrupt signals of startup process */
    7206      573454 :                 HandleStartupProcInterrupts();
    7207             : 
    7208             :                 /*
    7209             :                  * Pause WAL replay, if requested by a hot-standby session via
    7210             :                  * SetRecoveryPause().
    7211             :                  *
    7212             :                  * Note that we intentionally don't take the info_lck spinlock
    7213             :                  * here.  We might therefore read a slightly stale value of
    7214             :                  * the recoveryPause flag, but it can't be very stale (no
    7215             :                  * worse than the last spinlock we did acquire).  Since a
    7216             :                  * pause request is a pretty asynchronous thing anyway,
    7217             :                  * possibly responding to it one WAL record later than we
    7218             :                  * otherwise would is a minor issue, so it doesn't seem worth
    7219             :                  * adding another spinlock cycle to prevent that.
    7220             :                  */
    7221      573454 :                 if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
    7222           0 :                     recoveryPausesHere(false);
    7223             : 
    7224             :                 /*
    7225             :                  * Have we reached our recovery target?
    7226             :                  */
    7227      573454 :                 if (recoveryStopsBefore(xlogreader))
    7228             :                 {
    7229           0 :                     reachedRecoveryTarget = true;
    7230           0 :                     break;
    7231             :                 }
    7232             : 
    7233             :                 /*
    7234             :                  * If we've been asked to lag the master, wait on latch until
    7235             :                  * enough time has passed.
    7236             :                  */
    7237      573454 :                 if (recoveryApplyDelay(xlogreader))
    7238             :                 {
    7239             :                     /*
    7240             :                      * We test for paused recovery again here. If user sets
    7241             :                      * delayed apply, it may be because they expect to pause
    7242             :                      * recovery in case of problems, so we must test again
    7243             :                      * here otherwise pausing during the delay-wait wouldn't
    7244             :                      * work.
    7245             :                      */
    7246           0 :                     if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
    7247           0 :                         recoveryPausesHere(false);
    7248             :                 }
    7249             : 
    7250             :                 /* Setup error traceback support for ereport() */
    7251      573454 :                 errcallback.callback = rm_redo_error_callback;
    7252      573454 :                 errcallback.arg = (void *) xlogreader;
    7253      573454 :                 errcallback.previous = error_context_stack;
    7254      573454 :                 error_context_stack = &errcallback;
    7255             : 
    7256             :                 /*
    7257             :                  * ShmemVariableCache->nextFullXid must be beyond record's
    7258             :                  * xid.
    7259             :                  */
    7260      573454 :                 AdvanceNextFullTransactionIdPastXid(record->xl_xid);
    7261             : 
    7262             :                 /*
    7263             :                  * Before replaying this record, check if this record causes
    7264             :                  * the current timeline to change. The record is already
    7265             :                  * considered to be part of the new timeline, so we update
    7266             :                  * ThisTimeLineID before replaying it. That's important so
    7267             :                  * that replayEndTLI, which is recorded as the minimum
    7268             :                  * recovery point's TLI if recovery stops after this record,
    7269             :                  * is set correctly.
    7270             :                  */
    7271      573454 :                 if (record->xl_rmid == RM_XLOG_ID)
    7272             :                 {
    7273        3040 :                     TimeLineID  newTLI = ThisTimeLineID;
    7274        3040 :                     TimeLineID  prevTLI = ThisTimeLineID;
    7275        3040 :                     uint8       info = record->xl_info & ~XLR_INFO_MASK;
    7276             : 
    7277        3040 :                     if (info == XLOG_CHECKPOINT_SHUTDOWN)
    7278             :                     {
    7279             :                         CheckPoint  checkPoint;
    7280             : 
    7281          30 :                         memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
    7282          30 :                         newTLI = checkPoint.ThisTimeLineID;
    7283          30 :                         prevTLI = checkPoint.PrevTimeLineID;
    7284             :                     }
    7285        3010 :                     else if (info == XLOG_END_OF_RECOVERY)
    7286             :                     {
    7287             :                         xl_end_of_recovery xlrec;
    7288             : 
    7289          10 :                         memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
    7290          10 :                         newTLI = xlrec.ThisTimeLineID;
    7291          10 :                         prevTLI = xlrec.PrevTimeLineID;
    7292             :                     }
    7293             : 
    7294        3040 :                     if (newTLI != ThisTimeLineID)
    7295             :                     {
    7296             :                         /* Check that it's OK to switch to this TLI */
    7297          10 :                         checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
    7298             : 
    7299             :                         /* Following WAL records should be run with new TLI */
    7300          10 :                         ThisTimeLineID = newTLI;
    7301          10 :                         switchedTLI = true;
    7302             :                     }
    7303             :                 }
    7304             : 
    7305             :                 /*
    7306             :                  * Update shared replayEndRecPtr before replaying this record,
    7307             :                  * so that XLogFlush will update minRecoveryPoint correctly.
    7308             :                  */
    7309      573454 :                 SpinLockAcquire(&XLogCtl->info_lck);
    7310      573454 :                 XLogCtl->replayEndRecPtr = EndRecPtr;
    7311      573454 :                 XLogCtl->replayEndTLI = ThisTimeLineID;
    7312      573454 :                 SpinLockRelease(&XLogCtl->info_lck);
    7313             : 
    7314             :                 /*
    7315             :                  * If we are attempting to enter Hot Standby mode, process
    7316             :                  * XIDs we see
    7317             :                  */
    7318      573454 :                 if (standbyState >= STANDBY_INITIALIZED &&
    7319      202592 :                     TransactionIdIsValid(record->xl_xid))
    7320      201302 :                     RecordKnownAssignedTransactionIds(record->xl_xid);
    7321             : 
    7322             :                 /* Now apply the WAL record itself */
    7323      573454 :                 RmgrTable[record->xl_rmid].rm_redo(xlogreader);
    7324             : 
    7325             :                 /*
    7326             :                  * After redo, check whether the backup pages associated with
    7327             :                  * the WAL record are consistent with the existing pages. This
    7328             :                  * check is done only if consistency check is enabled for this
    7329             :                  * record.
    7330             :                  */
    7331      573454 :                 if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
    7332           0 :                     checkXLogConsistency(xlogreader);
    7333             : 
    7334             :                 /* Pop the error context stack */
    7335      573454 :                 error_context_stack = errcallback.previous;
    7336             : 
    7337             :                 /*
    7338             :                  * Update lastReplayedEndRecPtr after this record has been
    7339             :                  * successfully replayed.
    7340             :                  */
    7341      573454 :                 SpinLockAcquire(&XLogCtl->info_lck);
    7342      573454 :                 XLogCtl->lastReplayedEndRecPtr = EndRecPtr;
    7343      573454 :                 XLogCtl->lastReplayedTLI = ThisTimeLineID;
    7344      573454 :                 SpinLockRelease(&XLogCtl->info_lck);
    7345             : 
    7346             :                 /*
    7347             :                  * If rm_redo called XLogRequestWalReceiverReply, then we wake
    7348             :                  * up the receiver so that it notices the updated
    7349             :                  * lastReplayedEndRecPtr and sends a reply to the master.
    7350             :                  */
    7351      573454 :                 if (doRequestWalReceiverReply)
    7352             :                 {
    7353           0 :                     doRequestWalReceiverReply = false;
    7354           0 :                     WalRcvForceReply();
    7355             :                 }
    7356             : 
    7357             :                 /* Remember this record as the last-applied one */
    7358      573454 :                 LastRec = ReadRecPtr;
    7359             : 
    7360             :                 /* Allow read-only connections if we're consistent now */
    7361      573454 :                 CheckRecoveryConsistency();
    7362             : 
    7363             :                 /* Is this a timeline switch? */
    7364      573454 :                 if (switchedTLI)
    7365             :                 {
    7366             :                     /*
    7367             :                      * Before we continue on the new timeline, clean up any
    7368             :                      * (possibly bogus) future WAL segments on the old
    7369             :                      * timeline.
    7370             :                      */
    7371          10 :                     RemoveNonParentXlogFiles(EndRecPtr, ThisTimeLineID);
    7372             : 
    7373             :                     /*
    7374             :                      * Wake up any walsenders to notice that we are on a new
    7375             :                      * timeline.
    7376             :                      */
    7377          10 :                     if (switchedTLI && AllowCascadeReplication())
    7378          10 :                         WalSndWakeup();
    7379             :                 }
    7380             : 
    7381             :                 /* Exit loop if we reached inclusive recovery target */
    7382      573454 :                 if (recoveryStopsAfter(xlogreader))
    7383             :                 {
    7384           0 :                     reachedRecoveryTarget = true;
    7385           0 :                     break;
    7386             :                 }
    7387             : 
    7388             :                 /* Else, try to fetch the next WAL record */
    7389      573454 :                 record = ReadRecord(xlogreader, LOG, false);
    7390      573422 :             } while (record != NULL);
    7391             : 
    7392             :             /*
    7393             :              * end of main redo apply loop
    7394             :              */
    7395             : 
    7396         156 :             if (reachedRecoveryTarget)
    7397             :             {
    7398           0 :                 if (!reachedConsistency)
    7399           0 :                     ereport(FATAL,
    7400             :                             (errmsg("requested recovery stop point is before consistent recovery point")));
    7401             : 
    7402             :                 /*
    7403             :                  * This is the last point where we can restart recovery with a
    7404             :                  * new recovery target, if we shutdown and begin again. After
    7405             :                  * this, Resource Managers may choose to do permanent
    7406             :                  * corrective actions at end of recovery.
    7407             :                  */
    7408           0 :                 switch (recoveryTargetAction)
    7409             :                 {
    7410           0 :                     case RECOVERY_TARGET_ACTION_SHUTDOWN:
    7411             : 
    7412             :                         /*
    7413             :                          * exit with special return code to request shutdown
    7414             :                          * of postmaster.  Log messages issued from
    7415             :                          * postmaster.
    7416             :                          */
    7417           0 :                         proc_exit(3);
    7418             : 
    7419           0 :                     case RECOVERY_TARGET_ACTION_PAUSE:
    7420           0 :                         SetRecoveryPause(true);
    7421           0 :                         recoveryPausesHere(true);
    7422             : 
    7423             :                         /* drop into promote */
    7424             : 
    7425           0 :                     case RECOVERY_TARGET_ACTION_PROMOTE:
    7426           0 :                         break;
    7427             :                 }
    7428         156 :             }
    7429             : 
    7430             :             /* Allow resource managers to do any required cleanup. */
    7431        3588 :             for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
    7432             :             {
    7433        3432 :                 if (RmgrTable[rmid].rm_cleanup != NULL)
    7434         624 :                     RmgrTable[rmid].rm_cleanup();
    7435             :             }
    7436             : 
    7437         156 :             ereport(LOG,
    7438             :                     (errmsg("redo done at %X/%X",
    7439             :                             (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
    7440         156 :             xtime = GetLatestXTime();
    7441         156 :             if (xtime)
    7442          38 :                 ereport(LOG,
    7443             :                         (errmsg("last completed transaction was at log time %s",
    7444             :                                 timestamptz_to_str(xtime))));
    7445             : 
    7446         156 :             InRedo = false;
    7447             :         }
    7448             :         else
    7449             :         {
    7450             :             /* there are no WAL records following the checkpoint */
    7451           8 :             ereport(LOG,
    7452             :                     (errmsg("redo is not required")));
    7453             : 
    7454             :         }
    7455             : 
    7456             :         /*
    7457             :          * This check is intentionally after the above log messages that
    7458             :          * indicate how far recovery went.
    7459             :          */
    7460         164 :         if (ArchiveRecoveryRequested &&
    7461          50 :             recoveryTarget != RECOVERY_TARGET_UNSET &&
    7462           2 :             !reachedRecoveryTarget)
    7463           2 :             ereport(FATAL,
    7464             :                     (errmsg("recovery ended before configured recovery target was reached")));
    7465             :     }
    7466             : 
    7467             :     /*
    7468             :      * Kill WAL receiver, if it's still running, before we continue to write
    7469             :      * the startup checkpoint record. It will trump over the checkpoint and
    7470             :      * subsequent records if it's still alive when we start writing WAL.
    7471             :      */
    7472        1356 :     ShutdownWalRcv();
    7473             : 
    7474             :     /*
    7475             :      * Reset unlogged relations to the contents of their INIT fork. This is
    7476             :      * done AFTER recovery is complete so as to include any unlogged relations
    7477             :      * created during recovery, but BEFORE recovery is marked as having
    7478             :      * completed successfully. Otherwise we'd not retry if any of the post
    7479             :      * end-of-recovery steps fail.
    7480             :      */
    7481        1356 :     if (InRecovery)
    7482         162 :         ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
    7483             : 
    7484             :     /*
    7485             :      * We don't need the latch anymore. It's not strictly necessary to disown
    7486             :      * it, but let's do it for the sake of tidiness.
    7487             :      */
    7488        1356 :     if (ArchiveRecoveryRequested)
    7489          48 :         DisownLatch(&XLogCtl->recoveryWakeupLatch);
    7490             : 
    7491             :     /*
    7492             :      * We are now done reading the xlog from stream. Turn off streaming
    7493             :      * recovery to force fetching the files (which would be required at end of
    7494             :      * recovery, e.g., timeline history file) from archive or pg_wal.
    7495             :      *
    7496             :      * Note that standby mode must be turned off after killing WAL receiver,
    7497             :      * i.e., calling ShutdownWalRcv().
    7498             :      */
    7499             :     Assert(!WalRcvStreaming());
    7500        1356 :     StandbyMode = false;
    7501             : 
    7502             :     /*
    7503             :      * Re-fetch the last valid or last applied record, so we can identify the
    7504             :      * exact endpoint of what we consider the valid portion of WAL.
    7505             :      */
    7506        1356 :     XLogBeginRead(xlogreader, LastRec);
    7507        1356 :     record = ReadRecord(xlogreader, PANIC, false);
    7508        1356 :     EndOfLog = EndRecPtr;
    7509             : 
    7510             :     /*
    7511             :      * EndOfLogTLI is the TLI in the filename of the XLOG segment containing
    7512             :      * the end-of-log. It could be different from the timeline that EndOfLog
    7513             :      * nominally belongs to, if there was a timeline switch in that segment,
    7514             :      * and we were reading the old WAL from a segment belonging to a higher
    7515             :      * timeline.
    7516             :      */
    7517        1356 :     EndOfLogTLI = xlogreader->seg.ws_tli;
    7518             : 
    7519             :     /*
    7520             :      * Complain if we did not roll forward far enough to render the backup
    7521             :      * dump consistent.  Note: it is indeed okay to look at the local variable
    7522             :      * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
    7523             :      * be further ahead --- ControlFile->minRecoveryPoint cannot have been
    7524             :      * advanced beyond the WAL we processed.
    7525             :      */
    7526        1356 :     if (InRecovery &&
    7527         162 :         (EndOfLog < minRecoveryPoint ||
    7528         162 :          !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
    7529             :     {
    7530             :         /*
    7531             :          * Ran off end of WAL before reaching end-of-backup WAL record, or
    7532             :          * minRecoveryPoint. That's usually a bad sign, indicating that you
    7533             :          * tried to recover from an online backup but never called
    7534             :          * pg_stop_backup(), or you didn't archive all the WAL up to that
    7535             :          * point. However, this also happens in crash recovery, if the system
    7536             :          * crashes while an online backup is in progress. We must not treat
    7537             :          * that as an error, or the database will refuse to start up.
    7538             :          */
    7539           0 :         if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
    7540             :         {
    7541           0 :             if (ControlFile->backupEndRequired)
    7542           0 :                 ereport(FATAL,
    7543             :                         (errmsg("WAL ends before end of online backup"),
    7544             :                          errhint("All WAL generated while online backup was taken must be available at recovery.")));
    7545           0 :             else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
    7546           0 :                 ereport(FATAL,
    7547             :                         (errmsg("WAL ends before end of online backup"),
    7548             :                          errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
    7549             :             else
    7550           0 :                 ereport(FATAL,
    7551             :                         (errmsg("WAL ends before consistent recovery point")));
    7552             :         }
    7553             :     }
    7554             : 
    7555             :     /*
    7556             :      * Pre-scan prepared transactions to find out the range of XIDs present.
    7557             :      * This information is not quite needed yet, but it is positioned here so
    7558             :      * that potential problems are detected before any on-disk change is done.
    7559             :      */
    7560        1356 :     oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
    7561             : 
    7562             :     /*
    7563             :      * Consider whether we need to assign a new timeline ID.
    7564             :      *
    7565             :      * If we are doing an archive recovery, we always assign a new ID.  This
    7566             :      * handles a couple of issues.  If we stopped short of the end of WAL
    7567             :      * during recovery, then we are clearly generating a new timeline and must
    7568             :      * assign it a unique new ID.  Even if we ran to the end, modifying the
    7569             :      * current last segment is problematic because it may result in trying to
    7570             :      * overwrite an already-archived copy of that segment, and we encourage
    7571             :      * DBAs to make their archive_commands reject that.  We can dodge the
    7572             :      * problem by making the new active segment have a new timeline ID.
    7573             :      *
    7574             :      * In a normal crash recovery, we can just extend the timeline we were in.
    7575             :      */
    7576        1356 :     PrevTimeLineID = ThisTimeLineID;
    7577        1356 :     if (ArchiveRecoveryRequested)
    7578             :     {
    7579             :         char        reason[200];
    7580             :         char        recoveryPath[MAXPGPATH];
    7581             : 
    7582             :         Assert(InArchiveRecovery);
    7583             : 
    7584          48 :         ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
    7585          48 :         ereport(LOG,
    7586             :                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
    7587             : 
    7588             :         /*
    7589             :          * Create a comment for the history file to explain why and where
    7590             :          * timeline changed.
    7591             :          */
    7592          48 :         if (recoveryTarget == RECOVERY_TARGET_XID)
    7593           0 :             snprintf(reason, sizeof(reason),
    7594             :                      "%s transaction %u",
    7595           0 :                      recoveryStopAfter ? "after" : "before",
    7596             :                      recoveryStopXid);
    7597          48 :         else if (recoveryTarget == RECOVERY_TARGET_TIME)
    7598           0 :             snprintf(reason, sizeof(reason),
    7599             :                      "%s %s\n",
    7600           0 :                      recoveryStopAfter ? "after" : "before",
    7601             :                      timestamptz_to_str(recoveryStopTime));
    7602          48 :         else if (recoveryTarget == RECOVERY_TARGET_LSN)
    7603           0 :             snprintf(reason, sizeof(reason),
    7604             :                      "%s LSN %X/%X\n",
    7605           0 :                      recoveryStopAfter ? "after" : "before",
    7606           0 :                      (uint32) (recoveryStopLSN >> 32),
    7607             :                      (uint32) recoveryStopLSN);
    7608          48 :         else if (recoveryTarget == RECOVERY_TARGET_NAME)
    7609           0 :             snprintf(reason, sizeof(reason),
    7610             :                      "at restore point \"%s\"",
    7611             :                      recoveryStopName);
    7612          48 :         else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
    7613           0 :             snprintf(reason, sizeof(reason), "reached consistency");
    7614             :         else
    7615          48 :             snprintf(reason, sizeof(reason), "no recovery target specified");
    7616             : 
    7617             :         /*
    7618             :          * We are now done reading the old WAL.  Turn off archive fetching if
    7619             :          * it was active, and make a writable copy of the last WAL segment.
    7620             :          * (Note that we also have a copy of the last block of the old WAL in
    7621             :          * readBuf; we will use that below.)
    7622             :          */
    7623          48 :         exitArchiveRecovery(EndOfLogTLI, EndOfLog);
    7624             : 
    7625             :         /*
    7626             :          * Write the timeline history file, and have it archived. After this
    7627             :          * point (or rather, as soon as the file is archived), the timeline
    7628             :          * will appear as "taken" in the WAL archive and to any standby
    7629             :          * servers.  If we crash before actually switching to the new
    7630             :          * timeline, standby servers will nevertheless think that we switched
    7631             :          * to the new timeline, and will try to connect to the new timeline.
    7632             :          * To minimize the window for that, try to do as little as possible
    7633             :          * between here and writing the end-of-recovery record.
    7634             :          */
    7635          48 :         writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
    7636             :                              EndRecPtr, reason);
    7637             : 
    7638             :         /*
    7639             :          * Since there might be a partial WAL segment named RECOVERYXLOG, get
    7640             :          * rid of it.
    7641             :          */
    7642          48 :         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
    7643          48 :         unlink(recoveryPath);   /* ignore any error */
    7644             : 
    7645             :         /* Get rid of any remaining recovered timeline-history file, too */
    7646          48 :         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
    7647          48 :         unlink(recoveryPath);   /* ignore any error */
    7648             :     }
    7649             : 
    7650             :     /* Save the selected TimeLineID in shared memory, too */
    7651        1356 :     XLogCtl->ThisTimeLineID = ThisTimeLineID;
    7652        1356 :     XLogCtl->PrevTimeLineID = PrevTimeLineID;
    7653             : 
    7654             :     /*
    7655             :      * Prepare to write WAL starting at EndOfLog location, and init xlog
    7656             :      * buffer cache using the block containing the last record from the
    7657             :      * previous incarnation.
    7658             :      */
    7659        1356 :     Insert = &XLogCtl->Insert;
    7660        1356 :     Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
    7661        1356 :     Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
    7662             : 
    7663             :     /*
    7664             :      * Tricky point here: readBuf contains the *last* block that the LastRec
    7665             :      * record spans, not the one it starts in.  The last block is indeed the
    7666             :      * one we want to use.
    7667             :      */
    7668        1356 :     if (EndOfLog % XLOG_BLCKSZ != 0)
    7669             :     {
    7670             :         char       *page;
    7671             :         int         len;
    7672             :         int         firstIdx;
    7673             :         XLogRecPtr  pageBeginPtr;
    7674             : 
    7675        1338 :         pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
    7676             :         Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
    7677             : 
    7678        1338 :         firstIdx = XLogRecPtrToBufIdx(EndOfLog);
    7679             : 
    7680             :         /* Copy the valid part of the last block, and zero the rest */
    7681        1338 :         page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
    7682        1338 :         len = EndOfLog % XLOG_BLCKSZ;
    7683        1338 :         memcpy(page, xlogreader->readBuf, len);
    7684        1338 :         memset(page + len, 0, XLOG_BLCKSZ - len);
    7685             : 
    7686        1338 :         XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
    7687        1338 :         XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
    7688             :     }
    7689             :     else
    7690             :     {
    7691             :         /*
    7692             :          * There is no partial block to copy. Just set InitializedUpTo, and
    7693             :          * let the first attempt to insert a log record initialize the next
    7694             :          * buffer.
    7695             :          */
    7696          18 :         XLogCtl->InitializedUpTo = EndOfLog;
    7697             :     }
    7698             : 
    7699        1356 :     LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
    7700             : 
    7701        1356 :     XLogCtl->LogwrtResult = LogwrtResult;
    7702             : 
    7703        1356 :     XLogCtl->LogwrtRqst.Write = EndOfLog;
    7704        1356 :     XLogCtl->LogwrtRqst.Flush = EndOfLog;
    7705             : 
    7706             :     /*
    7707             :      * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
    7708             :      * record before resource manager writes cleanup WAL records or checkpoint
    7709             :      * record is written.
    7710             :      */
    7711        1356 :     Insert->fullPageWrites = lastFullPageWrites;
    7712        1356 :     LocalSetXLogInsertAllowed();
    7713        1356 :     UpdateFullPageWrites();
    7714        1356 :     LocalXLogInsertAllowed = -1;
    7715             : 
    7716        1356 :     if (InRecovery)
    7717             :     {
    7718             :         /*
    7719             :          * Perform a checkpoint to update all our recovery activity to disk.
    7720             :          *
    7721             :          * Note that we write a shutdown checkpoint rather than an on-line
    7722             :          * one. This is not particularly critical, but since we may be
    7723             :          * assigning a new TLI, using a shutdown checkpoint allows us to have
    7724             :          * the rule that TLI only changes in shutdown checkpoints, which
    7725             :          * allows some extra error checking in xlog_redo.
    7726             :          *
    7727             :          * In fast promotion, only create a lightweight end-of-recovery record
    7728             :          * instead of a full checkpoint. A checkpoint is requested later,
    7729             :          * after we're fully out of recovery mode and already accepting
    7730             :          * queries.
    7731             :          */
    7732         162 :         if (bgwriterLaunched)
    7733             :         {
    7734          48 :             if (fast_promote)
    7735             :             {
    7736          48 :                 checkPointLoc = ControlFile->checkPoint;
    7737             : 
    7738             :                 /*
    7739             :                  * Confirm the last checkpoint is available for us to recover
    7740             :                  * from if we fail.
    7741             :                  */
    7742          48 :                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
    7743          48 :                 if (record != NULL)
    7744             :                 {
    7745          48 :                     fast_promoted = true;
    7746             : 
    7747             :                     /*
    7748             :                      * Insert a special WAL record to mark the end of
    7749             :                      * recovery, since we aren't doing a checkpoint. That
    7750             :                      * means that the checkpointer process may likely be in
    7751             :                      * the middle of a time-smoothed restartpoint and could
    7752             :                      * continue to be for minutes after this. That sounds
    7753             :                      * strange, but the effect is roughly the same and it
    7754             :                      * would be stranger to try to come out of the
    7755             :                      * restartpoint and then checkpoint. We request a
    7756             :                      * checkpoint later anyway, just for safety.
    7757             :                      */
    7758          48 :                     CreateEndOfRecoveryRecord();
    7759             :                 }
    7760             :             }
    7761             : 
    7762          48 :             if (!fast_promoted)
    7763           0 :                 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
    7764             :                                   CHECKPOINT_IMMEDIATE |
    7765             :                                   CHECKPOINT_WAIT);
    7766             :         }
    7767             :         else
    7768         114 :             CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
    7769             :     }
    7770             : 
    7771        1356 :     if (ArchiveRecoveryRequested)
    7772             :     {
    7773             :         /*
    7774             :          * And finally, execute the recovery_end_command, if any.
    7775             :          */
    7776          48 :         if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0)
    7777           0 :             ExecuteRecoveryCommand(recoveryEndCommand,
    7778             :                                    "recovery_end_command",
    7779             :                                    true);
    7780             : 
    7781             :         /*
    7782             :          * We switched to a new timeline. Clean up segments on the old
    7783             :          * timeline.
    7784             :          *
    7785             :          * If there are any higher-numbered segments on the old timeline,
    7786             :          * remove them. They might contain valid WAL, but they might also be
    7787             :          * pre-allocated files containing garbage. In any case, they are not
    7788             :          * part of the new timeline's history so we don't need them.
    7789             :          */
    7790          48 :         RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID);
    7791             : 
    7792             :         /*
    7793             :          * If the switch happened in the middle of a segment, what to do with
    7794             :          * the last, partial segment on the old timeline? If we don't archive
    7795             :          * it, and the server that created the WAL never archives it either
    7796             :          * (e.g. because it was hit by a meteor), it will never make it to the
    7797             :          * archive. That's OK from our point of view, because the new segment
    7798             :          * that we created with the new TLI contains all the WAL from the old
    7799             :          * timeline up to the switch point. But if you later try to do PITR to
    7800             :          * the "missing" WAL on the old timeline, recovery won't find it in
    7801             :          * the archive. It's physically present in the new file with new TLI,
    7802             :          * but recovery won't look there when it's recovering to the older
    7803             :          * timeline. On the other hand, if we archive the partial segment, and
    7804             :          * the original server on that timeline is still running and archives
    7805             :          * the completed version of the same segment later, it will fail. (We
    7806             :          * used to do that in 9.4 and below, and it caused such problems).
    7807             :          *
    7808             :          * As a compromise, we rename the last segment with the .partial
    7809             :          * suffix, and archive it. Archive recovery will never try to read
    7810             :          * .partial segments, so they will normally go unused. But in the odd
    7811             :          * PITR case, the administrator can copy them manually to the pg_wal
    7812             :          * directory (removing the suffix). They can be useful in debugging,
    7813             :          * too.
    7814             :          *
    7815             :          * If a .done or .ready file already exists for the old timeline,
    7816             :          * however, we had already determined that the segment is complete, so
    7817             :          * we can let it be archived normally. (In particular, if it was
    7818             :          * restored from the archive to begin with, it's expected to have a
    7819             :          * .done file).
    7820             :          */
    7821          48 :         if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
    7822             :             XLogArchivingActive())
    7823             :         {
    7824             :             char        origfname[MAXFNAMELEN];
    7825             :             XLogSegNo   endLogSegNo;
    7826             : 
    7827           2 :             XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
    7828           2 :             XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
    7829             : 
    7830           2 :             if (!XLogArchiveIsReadyOrDone(origfname))
    7831             :             {
    7832             :                 char        origpath[MAXPGPATH];
    7833             :                 char        partialfname[MAXFNAMELEN];
    7834             :                 char        partialpath[MAXPGPATH];
    7835             : 
    7836           2 :                 XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
    7837           2 :                 snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
    7838           2 :                 snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
    7839             : 
    7840             :                 /*
    7841             :                  * Make sure there's no .done or .ready file for the .partial
    7842             :                  * file.
    7843             :                  */
    7844           2 :                 XLogArchiveCleanup(partialfname);
    7845             : 
    7846           2 :                 durable_rename(origpath, partialpath, ERROR);
    7847           2 :                 XLogArchiveNotify(partialfname);
    7848             :             }
    7849             :         }
    7850             :     }
    7851             : 
    7852             :     /*
    7853             :      * Preallocate additional log files, if wanted.
    7854             :      */
    7855        1356 :     PreallocXlogFiles(EndOfLog);
    7856             : 
    7857             :     /*
    7858             :      * Okay, we're officially UP.
    7859             :      */
    7860        1356 :     InRecovery = false;
    7861             : 
    7862             :     /* start the archive_timeout timer and LSN running */
    7863        1356 :     XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
    7864        1356 :     XLogCtl->lastSegSwitchLSN = EndOfLog;
    7865             : 
    7866             :     /* also initialize latestCompletedXid, to nextXid - 1 */
    7867        1356 :     LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
    7868        1356 :     ShmemVariableCache->latestCompletedXid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid);
    7869        3504 :     TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
    7870        1356 :     LWLockRelease(ProcArrayLock);
    7871             : 
    7872             :     /*
    7873             :      * Start up the commit log and subtrans, if not already done for hot
    7874             :      * standby.  (commit timestamps are started below, if necessary.)
    7875             :      */
    7876        1356 :     if (standbyState == STANDBY_DISABLED)
    7877             :     {
    7878        1308 :         StartupCLOG();
    7879        1308 :         StartupSUBTRANS(oldestActiveXID);
    7880             :     }
    7881             : 
    7882             :     /*
    7883             :      * Perform end of recovery actions for any SLRUs that need it.
    7884             :      */
    7885        1356 :     TrimCLOG();
    7886        1356 :     TrimMultiXact();
    7887             : 
    7888             :     /* Reload shared-memory state for prepared transactions */
    7889        1356 :     RecoverPreparedTransactions();
    7890             : 
    7891             :     /*
    7892             :      * Shutdown the recovery environment. This must occur after
    7893             :      * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
    7894             :      */
    7895        1356 :     if (standbyState != STANDBY_DISABLED)
    7896          48 :         ShutdownRecoveryTransactionEnvironment();
    7897             : 
    7898             :     /* Shut down xlogreader */
    7899        1356 :     if (readFile >= 0)
    7900             :     {
    7901        1342 :         close(readFile);
    7902        1342 :         readFile = -1;
    7903             :     }
    7904        1356 :     XLogReaderFree(xlogreader);
    7905             : 
    7906             :     /*
    7907             :      * If any of the critical GUCs have changed, log them before we allow
    7908             :      * backends to write WAL.
    7909             :      */
    7910        1356 :     LocalSetXLogInsertAllowed();
    7911        1356 :     XLogReportParameters();
    7912             : 
    7913             :     /*
    7914             :      * Local WAL inserts enabled, so it's time to finish initialization of
    7915             :      * commit timestamp.
    7916             :      */
    7917        1356 :     CompleteCommitTsInitialization();
    7918             : 
    7919             :     /*
    7920             :      * All done with end-of-recovery actions.
    7921             :      *
    7922             :      * Now allow backends to write WAL and update the control file status in
    7923             :      * consequence.  The boolean flag allowing backends to write WAL is
    7924             :      * updated while holding ControlFileLock to prevent other backends from
    7925             :      * looking at an inconsistent state of the control file in shared memory.  There
    7926             :      * is still a small window during which backends can write WAL and the
    7927             :      * control file is still referring to a system not in DB_IN_PRODUCTION
    7928             :      * state while looking at the on-disk control file.
    7929             :      *
    7930             :      * Also, although the boolean flag to allow WAL is probably atomic in
    7931             :      * itself, we use the info_lck here to ensure that there are no race
    7932             :      * conditions concerning visibility of other recent updates to shared
    7933             :      * memory.
    7934             :      */
    7935        1356 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    7936        1356 :     ControlFile->state = DB_IN_PRODUCTION;
    7937        1356 :     ControlFile->time = (pg_time_t) time(NULL);
    7938             : 
    7939        1356 :     SpinLockAcquire(&XLogCtl->info_lck);
    7940        1356 :     XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
    7941        1356 :     SpinLockRelease(&XLogCtl->info_lck);
    7942             : 
    7943        1356 :     UpdateControlFile();
    7944        1356 :     LWLockRelease(ControlFileLock);
    7945             : 
    7946             :     /*
    7947             :      * If there were cascading standby servers connected to us, nudge any wal
    7948             :      * sender processes to notice that we've been promoted.
    7949             :      */
    7950        1356 :     WalSndWakeup();
    7951             : 
    7952             :     /*
    7953             :      * If this was a fast promotion, request an (online) checkpoint now. This
    7954             :      * isn't required for consistency, but the last restartpoint might be far
    7955             :      * back, and in case of a crash, recovering from it might take longer
    7956             :      * than is appropriate now that we're not in standby mode anymore.
    7957             :      */
    7958        1356 :     if (fast_promoted)
    7959          48 :         RequestCheckpoint(CHECKPOINT_FORCE);
    7960        1356 : }
    7961             : 
    7962             : /*
    7963             :  * Checks if recovery has reached a consistent state, resetting the backup markers in
    7964             :  * pg_control once the backup end point has been replayed.  When consistency is reached and we
    7965             :  * have a valid starting standby snapshot, tell postmaster it can accept read-only connections.
    7966             :  */
    7967             : static void
    7968      573652 : CheckRecoveryConsistency(void)
    7969             : {
    7970             :     XLogRecPtr  lastReplayedEndRecPtr;
    7971             : 
    7972             :     /*
    7973             :      * During crash recovery, we don't reach a consistent state until we've replayed
    7974             :      * all the WAL; an invalid minRecoveryPoint is taken to mean that case, so bail out.
    7975             :      */
    7976      573652 :     if (XLogRecPtrIsInvalid(minRecoveryPoint))
    7977      371008 :         return;
    7978             : 
    7979             :     Assert(InArchiveRecovery);
    7980             : 
    7981             :     /*
    7982             :      * We assume that we are called in the startup process, and hence don't
    7983             :      * need a lock to read XLogCtl->lastReplayedEndRecPtr.
    7984             :      */
    7985      202644 :     lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;
    7986             : 
    7987             :     /*
    7988             :      * Have we replayed up to the point where our base backup was completed (backupEndPoint)?
    7989             :      */
    7990      202644 :     if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
    7991           0 :         ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
    7992             :     {
    7993             :         /*
    7994             :          * We have reached the end of base backup, as indicated by pg_control.
    7995             :          * The data on disk is now consistent. Reset backupStartPoint,
    7996             :          * backupEndPoint and backupEndRequired, and advance the control file's
    7997             :          * minRecoveryPoint (never moving it backwards) so we don't allow starting
    7998             :          * up at an earlier point even if recovery is stopped and restarted soon after this.
    7999             :          */
    8000           0 :         elog(DEBUG1, "end of backup reached");
    8001             : 
    8002           0 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    8003             : 
    8004           0 :         if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
    8005           0 :             ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;
    8006             : 
    8007           0 :         ControlFile->backupStartPoint = InvalidXLogRecPtr;
    8008           0 :         ControlFile->backupEndPoint = InvalidXLogRecPtr;
    8009           0 :         ControlFile->backupEndRequired = false;
    8010           0 :         UpdateControlFile();
    8011             : 
    8012           0 :         LWLockRelease(ControlFileLock);
    8013             :     }
    8014             : 
    8015             :     /*
    8016             :      * Have we passed our safe starting point? Note that minRecoveryPoint is
    8017             :      * known to be incorrectly set if ControlFile->backupEndRequired, until
    8018             :      * the XLOG_BACKUP_END arrives to advise us of the correct
    8019             :      * minRecoveryPoint. All we know prior to that is that we're not
    8020             :      * consistent yet.  A still-set backupStartPoint likewise blocks consistency.
    8021             :      */
    8022      202644 :     if (!reachedConsistency && !ControlFile->backupEndRequired &&
    8023         236 :         minRecoveryPoint <= lastReplayedEndRecPtr &&
    8024          88 :         XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
    8025             :     {
    8026             :         /*
    8027             :          * Check to see if the XLOG sequence contained any unresolved
    8028             :          * references to uninitialized pages.
    8029             :          */
    8030          82 :         XLogCheckInvalidPages();
    8031             : 
    8032          82 :         reachedConsistency = true;
    8033          82 :         ereport(LOG,
    8034             :                 (errmsg("consistent recovery state reached at %X/%X",
    8035             :                         (uint32) (lastReplayedEndRecPtr >> 32),
    8036             :                         (uint32) lastReplayedEndRecPtr)));
    8037             :     }
    8038             : 
    8039             :     /*
    8040             :      * Have we got a valid starting snapshot that will allow queries to be
    8041             :      * run? If so, we can tell postmaster that the database is consistent now,
    8042             :      * enabling connections.  LocalHotStandbyActive makes sure this process signals only once.
    8043             :      */
    8044      202644 :     if (standbyState == STANDBY_SNAPSHOT_READY &&
    8045      202460 :         !LocalHotStandbyActive &&
    8046          82 :         reachedConsistency &&
    8047             :         IsUnderPostmaster)
    8048             :     {
    8049          82 :         SpinLockAcquire(&XLogCtl->info_lck);
    8050          82 :         XLogCtl->SharedHotStandbyActive = true;
    8051          82 :         SpinLockRelease(&XLogCtl->info_lck);
    8052             : 
    8053          82 :         LocalHotStandbyActive = true;
    8054             : 
    8055          82 :         SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
    8056             :     }
    8057             : }
    8058             : 
    8059             : /*
    8060             :  * Is the system still in recovery?
    8061             :  *
    8062             :  * Unlike testing InRecovery, this works in any process that's connected to
    8063             :  * shared memory.
    8064             :  *
    8065             :  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
    8066             :  * variables the first time we see that recovery is finished.
    8067             :  */
    8068             : bool
    8069    73156092 : RecoveryInProgress(void)
    8070             : {
    8071             :     /*
    8072             :      * We check shared state each time only until we leave recovery mode. We
    8073             :      * can't re-enter recovery, so there's no need to keep checking after the
    8074             :      * shared variable has once been seen false.
    8075             :      */
    8076    73156092 :     if (!LocalRecoveryInProgress)
    8077    72675956 :         return false;
    8078             :     else
    8079             :     {
    8080             :         /*
    8081             :          * use volatile pointer to make sure we make a fresh read of the
    8082             :          * shared variable.
    8083             :          */
    8084      480136 :         volatile XLogCtlData *xlogctl = XLogCtl;
    8085             : 
    8086      480136 :         LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);
    8087             : 
    8088             :         /*
    8089             :          * Initialize TimeLineID and RedoRecPtr when we discover that recovery
    8090             :          * is finished. InitPostgres() relies upon this behaviour to ensure
    8091             :          * that InitXLOGAccess() is called at backend startup.  (If you change
    8092             :          * this, see also LocalSetXLogInsertAllowed.)
    8093             :          */
    8094      480136 :         if (!LocalRecoveryInProgress)
    8095             :         {
    8096             :             /*
    8097             :              * If we just exited recovery, make sure we read TimeLineID and
    8098             :              * RedoRecPtr after SharedRecoveryState (for machines with weak
    8099             :              * memory ordering).
    8100             :              */
    8101       11796 :             pg_memory_barrier();
    8102       11796 :             InitXLOGAccess();
    8103             :         }
    8104             : 
    8105             :         /*
    8106             :          * Note: We don't need a memory barrier when we're still in recovery.
    8107             :          * We might exit recovery immediately after return, so the caller
    8108             :          * can't rely on 'true' meaning that we're still in recovery anyway.
    8109             :          */
    8110             : 
    8111      480136 :         return LocalRecoveryInProgress;
    8112             :     }
    8113             : }
    8114             : 
    8115             : /*
    8116             :  * Returns current recovery state from shared memory.
    8117             :  *
    8118             :  * This returned state is kept consistent with the contents of the control
    8119             :  * file.  See details about the possible values of RecoveryState in xlog.h.
    8120             :  */
    8121             : RecoveryState
    8122           8 : GetRecoveryState(void)
    8123             : {
    8124             :     RecoveryState retval;
    8125             : 
    8126           8 :     SpinLockAcquire(&XLogCtl->info_lck);
    8127           8 :     retval = XLogCtl->SharedRecoveryState;
    8128           8 :     SpinLockRelease(&XLogCtl->info_lck);
    8129             : 
    8130           8 :     return retval;
    8131             : }
    8132             : 
    8133             : /*
    8134             :  * Is HotStandby active yet? This is only important in special backends
    8135             :  * since normal backends won't ever be able to connect until this returns
    8136             :  * true. Postmaster knows this by way of signal, not via shared memory.
    8137             :  *
    8138             :  * Unlike testing standbyState, this works in any process that's connected to
    8139             :  * shared memory.  (And note that standbyState alone doesn't tell the truth
    8140             :  * anyway.)
    8141             :  */
    8142             : bool
    8143         142 : HotStandbyActive(void)
    8144             : {
    8145             :     /*
    8146             :      * We check shared state each time only until Hot Standby is active. We
    8147             :      * can't de-activate Hot Standby, so there's no need to keep checking
    8148             :      * after the shared variable has once been seen true.
    8149             :      */
    8150         142 :     if (LocalHotStandbyActive)
    8151          22 :         return true;
    8152             :     else
    8153             :     {
    8154             :         /* spinlock is essential on machines with weak memory ordering! */
    8155         120 :         SpinLockAcquire(&XLogCtl->info_lck);
    8156         120 :         LocalHotStandbyActive = XLogCtl->SharedHotStandbyActive;
    8157         120 :         SpinLockRelease(&XLogCtl->info_lck);
    8158             : 
    8159         120 :         return LocalHotStandbyActive;
    8160             :     }
    8161             : }
    8162             : 
    8163             : /*
    8164             :  * Like HotStandbyActive(), but to be used only in WAL replay code,
    8165             :  * where we don't need to ask any other process what the state is.
    8166             :  */
    8167             : bool
    8168           0 : HotStandbyActiveInReplay(void)
    8169             : {
    8170             :     Assert(AmStartupProcess() || !IsPostmasterEnvironment);
    8171           0 :     return LocalHotStandbyActive;
    8172             : }
    8173             : 
    8174             : /*
    8175             :  * Is this process allowed to insert new WAL records?
    8176             :  *
    8177             :  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
    8178             :  * But we also have provisions for forcing the result "true" or "false"
    8179             :  * within specific processes regardless of the global state.
    8180             :  */
    8181             : bool
    8182    63851768 : XLogInsertAllowed(void)
    8183             : {
    8184             :     /*
    8185             :      * If value is "unconditionally true" or "unconditionally false", just
    8186             :      * return it.  This provides the normal fast path once recovery is known
    8187             :      * done.
    8188             :      */
    8189    63851768 :     if (LocalXLogInsertAllowed >= 0)
    8190    63841844 :         return (bool) LocalXLogInsertAllowed;
    8191             : 
    8192             :     /*
    8193             :      * Else, must check to see if we're still in recovery.
    8194             :      */
    8195        9924 :     if (RecoveryInProgress())
    8196        4094 :         return false;
    8197             : 
    8198             :     /*
    8199             :      * On exit from recovery, reset to "unconditionally true", since there is
    8200             :      * no need to keep checking.
    8201             :      */
    8202        5830 :     LocalXLogInsertAllowed = 1;
    8203        5830 :     return true;
    8204             : }
    8205             : 
    8206             : /*
    8207             :  * Make XLogInsertAllowed() return true in the current process only.
    8208             :  *
    8209             :  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
    8210             :  * and even call LocalSetXLogInsertAllowed() again after that.
    8211             :  */
    8212             : static void
    8213        2874 : LocalSetXLogInsertAllowed(void)
    8214             : {
    8215             :     Assert(LocalXLogInsertAllowed == -1);
    8216        2874 :     LocalXLogInsertAllowed = 1;
    8217             : 
    8218             :     /* Initialize as RecoveryInProgress() would do when switching state */
    8219        2874 :     InitXLOGAccess();
    8220        2874 : }
    8221             : 
    8222             : /*
    8223             :  * Subroutine to try to fetch and validate a prior checkpoint record.
    8224             :  *
    8225             :  * whichChkpt identifies the checkpoint (merely for reporting purposes).
    8226             :  * 1 for "primary", 0 for "other" (backup_label)
    8227             :  */
    8228             : static XLogRecord *
    8229        1438 : ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
    8230             :                      int whichChkpt, bool report)
    8231             : {
    8232             :     XLogRecord *record;
    8233             :     uint8       info;
    8234             : 
    8235        1438 :     if (!XRecOffIsValid(RecPtr))
    8236             :     {
    8237           0 :         if (!report)
    8238           0 :             return NULL;
    8239             : 
    8240           0 :         switch (whichChkpt)
    8241             :         {
    8242           0 :             case 1:
    8243           0 :                 ereport(LOG,
    8244             :                         (errmsg("invalid primary checkpoint link in control file")));
    8245           0 :                 break;
    8246           0 :             default:
    8247           0 :                 ereport(LOG,
    8248             :                         (errmsg("invalid checkpoint link in backup_label file")));
    8249           0 :                 break;
    8250             :         }
    8251           0 :         return NULL;
    8252             :     }
    8253             : 
    8254        1438 :     XLogBeginRead(xlogreader, RecPtr);
    8255        1438 :     record = ReadRecord(xlogreader, LOG, true);
    8256             : 
    8257        1438 :     if (record == NULL)
    8258             :     {
    8259           0 :         if (!report)
    8260           0 :             return NULL;
    8261             : 
    8262           0 :         switch (whichChkpt)
    8263             :         {
    8264           0 :             case 1:
    8265           0 :                 ereport(LOG,
    8266             :                         (errmsg("invalid primary checkpoint record")));
    8267           0 :                 break;
    8268           0 :             default:
    8269           0 :                 ereport(LOG,
    8270             :                         (errmsg("invalid checkpoint record")));
    8271           0 :                 break;
    8272             :         }
    8273           0 :         return NULL;
    8274             :     }
    8275        1438 :     if (record->xl_rmid != RM_XLOG_ID)
    8276             :     {
    8277           0 :         switch (whichChkpt)
    8278             :         {
    8279           0 :             case 1:
    8280           0 :                 ereport(LOG,
    8281             :                         (errmsg("invalid resource manager ID in primary checkpoint record")));
    8282           0 :                 break;
    8283           0 :             default:
    8284           0 :                 ereport(LOG,
    8285             :                         (errmsg("invalid resource manager ID in checkpoint record")));
    8286           0 :                 break;
    8287             :         }
    8288           0 :         return NULL;
    8289             :     }
    8290        1438 :     info = record->xl_info & ~XLR_INFO_MASK;
    8291        1438 :     if (info != XLOG_CHECKPOINT_SHUTDOWN &&
    8292             :         info != XLOG_CHECKPOINT_ONLINE)
    8293             :     {
    8294           0 :         switch (whichChkpt)
    8295             :         {
    8296           0 :             case 1:
    8297           0 :                 ereport(LOG,
    8298             :                         (errmsg("invalid xl_info in primary checkpoint record")));
    8299           0 :                 break;
    8300           0 :             default:
    8301           0 :                 ereport(LOG,
    8302             :                         (errmsg("invalid xl_info in checkpoint record")));
    8303           0 :                 break;
    8304             :         }
    8305           0 :         return NULL;
    8306             :     }
    8307        1438 :     if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
    8308             :     {
    8309           0 :         switch (whichChkpt)
    8310             :         {
    8311           0 :             case 1:
    8312           0 :                 ereport(LOG,
    8313             :                         (errmsg("invalid length of primary checkpoint record")));
    8314           0 :                 break;
    8315           0 :             default:
    8316           0 :                 ereport(LOG,
    8317             :                         (errmsg("invalid length of checkpoint record")));
    8318           0 :                 break;
    8319             :         }
    8320           0 :         return NULL;
    8321             :     }
    8322        1438 :     return record;
    8323             : }
    8324             : 
    8325             : /*
    8326             :  * This must be called in a backend process before creating WAL records
    8327             :  * (except in a standalone backend, which does StartupXLOG instead).  We need
    8328             :  * to initialize the local copies of ThisTimeLineID and RedoRecPtr.
    8329             :  *
    8330             :  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
    8331             :  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
    8332             :  * unnecessary however, since the postmaster itself never touches XLOG anyway.
    8333             :  */
    8334             : void
    8335       15036 : InitXLOGAccess(void)
    8336             : {
    8337       15036 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    8338             : 
    8339             :     /* ThisTimeLineID doesn't change so we need no lock to copy it */
    8340       15036 :     ThisTimeLineID = XLogCtl->ThisTimeLineID;
    8341             :     Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
    8342             : 
    8343             :     /* set wal_segment_size */
    8344       15036 :     wal_segment_size = ControlFile->xlog_seg_size;
    8345             : 
    8346             :     /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
    8347       15036 :     (void) GetRedoRecPtr();
    8348             :     /* Also update our copy of doPageWrites. */
    8349       15036 :     doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
    8350             : 
    8351             :     /* Also initialize the working areas for constructing WAL records */
    8352       15036 :     InitXLogInsert();
    8353       15036 : }
    8354             : 
    8355             : /*
    8356             :  * Return the current Redo pointer from shared memory.
    8357             :  *
    8358             :  * As a side-effect, the local RedoRecPtr copy is updated.
    8359             :  */
    8360             : XLogRecPtr
    8361      165034 : GetRedoRecPtr(void)
    8362             : {
    8363             :     XLogRecPtr  ptr;
    8364             : 
    8365             :     /*
    8366             :      * The possibly not up-to-date copy in XLogCtl is enough. Even if we
    8367             :      * grabbed a WAL insertion lock to read the master copy, someone might
    8368             :      * update it just after we've released the lock.
    8369             :      */
    8370      165034 :     SpinLockAcquire(&XLogCtl->info_lck);
    8371      165034 :     ptr = XLogCtl->RedoRecPtr;
    8372      165034 :     SpinLockRelease(&XLogCtl->info_lck);
    8373             : 
    8374      165034 :     if (RedoRecPtr < ptr)
    8375       11086 :         RedoRecPtr = ptr;
    8376             : 
    8377      165034 :     return RedoRecPtr;
    8378             : }
    8379             : 
    8380             : /*
    8381             :  * Return information needed to decide whether a modified block needs a
    8382             :  * full-page image to be included in the WAL record.
    8383             :  *
    8384             :  * The returned values are cached copies from backend-private memory, and
    8385             :  * possibly out-of-date.  XLogInsertRecord will re-check them against
    8386             :  * up-to-date values, while holding the WAL insert lock.
    8387             :  */
    8388             : void
    8389    29608684 : GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
    8390             : {
    8391    29608684 :     *RedoRecPtr_p = RedoRecPtr;
    8392    29608684 :     *doPageWrites_p = doPageWrites;
    8393    29608684 : }
    8394             : 
    8395             : /*
    8396             :  * GetInsertRecPtr -- Returns the current insert position.
    8397             :  *
    8398             :  * NOTE: The value *actually* returned is the position of the last full
    8399             :  * xlog page. It lags behind the real insert position by at most 1 page.
    8400             :  * For that, we don't need to scan through WAL insertion locks, and an
    8401             :  * approximation is enough for the current usage of this function.
    8402             :  */
    8403             : XLogRecPtr
    8404        2102 : GetInsertRecPtr(void)
    8405             : {
    8406             :     XLogRecPtr  recptr;
    8407             : 
    8408        2102 :     SpinLockAcquire(&XLogCtl->info_lck);
    8409        2102 :     recptr = XLogCtl->LogwrtRqst.Write;
    8410        2102 :     SpinLockRelease(&XLogCtl->info_lck);
    8411             : 
    8412        2102 :     return recptr;
    8413             : }
    8414             : 
    8415             : /*
    8416             :  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
    8417             :  * position known to be fsync'd to disk.
    8418             :  */
    8419             : XLogRecPtr
    8420      159340 : GetFlushRecPtr(void)
    8421             : {
    8422      159340 :     SpinLockAcquire(&XLogCtl->info_lck);
    8423      159340 :     LogwrtResult = XLogCtl->LogwrtResult;
    8424      159340 :     SpinLockRelease(&XLogCtl->info_lck);
    8425             : 
    8426      159340 :     return LogwrtResult.Flush;
    8427             : }
    8428             : 
    8429             : /*
    8430             :  * GetLastImportantRecPtr -- Returns the LSN of the last important record
    8431             :  * inserted. All records not explicitly marked as unimportant are considered
    8432             :  * important.
    8433             :  *
    8434             :  * The LSN is determined by computing the maximum of
    8435             :  * WALInsertLocks[i].lastImportantAt.
    8436             :  */
    8437             : XLogRecPtr
    8438        3204 : GetLastImportantRecPtr(void)
    8439             : {
    8440        3204 :     XLogRecPtr  res = InvalidXLogRecPtr;
    8441             :     int         i;
    8442             : 
    8443       28836 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    8444             :     {
    8445             :         XLogRecPtr  last_important;
    8446             : 
    8447             :         /*
    8448             :          * Need to take a lock to prevent torn reads of the LSN, which are
    8449             :          * possible on some of the supported platforms. WAL insert locks only
    8450             :          * support exclusive mode, so we have to use that.
    8451             :          */
    8452       25632 :         LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
    8453       25632 :         last_important = WALInsertLocks[i].l.lastImportantAt;
    8454       25632 :         LWLockRelease(&WALInsertLocks[i].l.lock);
    8455             : 
    8456       25632 :         if (res < last_important)
    8457        3748 :             res = last_important;
    8458             :     }
    8459             : 
    8460        3204 :     return res;
    8461             : }
    8462             : 
    8463             : /*
    8464             :  * Get the time and LSN of the last xlog segment switch
    8465             :  */
    8466             : pg_time_t
    8467           0 : GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
    8468             : {
    8469             :     pg_time_t   result;
    8470             : 
    8471             :     /* Need WALWriteLock, but shared lock is sufficient */
    8472           0 :     LWLockAcquire(WALWriteLock, LW_SHARED);
    8473           0 :     result = XLogCtl->lastSegSwitchTime;
    8474           0 :     *lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
    8475           0 :     LWLockRelease(WALWriteLock);
    8476             : 
    8477           0 :     return result;
    8478             : }
    8479             : 
    8480             : /*
    8481             :  * This must be called ONCE during postmaster or standalone-backend shutdown
    8482             :  */
    8483             : void
    8484        1126 : ShutdownXLOG(int code, Datum arg)
    8485             : {
    8486             :     /*
    8487             :      * We should have an aux process resource owner to use, and we should not
    8488             :      * be in a transaction that's installed some other resowner.
    8489             :      */
    8490             :     Assert(AuxProcessResourceOwner != NULL);
    8491             :     Assert(CurrentResourceOwner == NULL ||
    8492             :            CurrentResourceOwner == AuxProcessResourceOwner);
    8493        1126 :     CurrentResourceOwner = AuxProcessResourceOwner;
    8494             : 
    8495             :     /* Don't be chatty in standalone mode */
    8496        1126 :     ereport(IsPostmasterEnvironment ? LOG : NOTICE,
    8497             :             (errmsg("shutting down")));
    8498             : 
    8499             :     /*
    8500             :      * Signal walsenders to move to stopping state.
    8501             :      */
    8502        1126 :     WalSndInitStopping();
    8503             : 
    8504             :     /*
    8505             :      * Wait for WAL senders to be in stopping state.  This prevents commands
    8506             :      * from writing new WAL.
    8507             :      */
    8508        1126 :     WalSndWaitStopping();
    8509             : 
    8510        1126 :     if (RecoveryInProgress())
    8511          32 :         CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
    8512             :     else
    8513             :     {
    8514             :         /*
    8515             :          * If archiving is enabled, rotate the last XLOG file so that all the
    8516             :          * remaining records are archived (postmaster wakes up the archiver
    8517             :          * process one more time at the end of shutdown). The checkpoint
    8518             :          * record will go to the next XLOG file and won't be archived (yet).
    8519             :          */
    8520        1094 :         if (XLogArchivingActive() && XLogArchiveCommandSet())
    8521           0 :             RequestXLogSwitch(false);
    8522             : 
    8523        1094 :         CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
    8524             :     }
    8525        1126 :     ShutdownCLOG();
    8526        1126 :     ShutdownCommitTs();
    8527        1126 :     ShutdownSUBTRANS();
    8528        1126 :     ShutdownMultiXact();
    8529        1126 : }
    8530             : 
    8531             : /*
    8532             :  * Log start of a checkpoint.
    8533             :  */
    8534             : static void
    8535         480 : LogCheckpointStart(int flags, bool restartpoint)
    8536             : {
    8537         480 :     elog(LOG, "%s starting:%s%s%s%s%s%s%s%s",
    8538             :          restartpoint ? "restartpoint" : "checkpoint",
    8539             :          (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
    8540             :          (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
    8541             :          (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
    8542             :          (flags & CHECKPOINT_FORCE) ? " force" : "",
    8543             :          (flags & CHECKPOINT_WAIT) ? " wait" : "",
    8544             :          (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
    8545             :          (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
    8546             :          (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "");
    8547         480 : }
    8548             : 
    8549             : /*
    8550             :  * Log end of a checkpoint.
    8551             :  */
    8552             : static void
    8553        3172 : LogCheckpointEnd(bool restartpoint)
    8554             : {
    8555             :     long        write_secs,
    8556             :                 sync_secs,
    8557             :                 total_secs,
    8558             :                 longest_secs,
    8559             :                 average_secs;
    8560             :     int         write_usecs,
    8561             :                 sync_usecs,
    8562             :                 total_usecs,
    8563             :                 longest_usecs,
    8564             :                 average_usecs;
    8565             :     uint64      average_sync_time;
    8566             : 
    8567        3172 :     CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
    8568             : 
    8569        3172 :     TimestampDifference(CheckpointStats.ckpt_write_t,
    8570             :                         CheckpointStats.ckpt_sync_t,
    8571             :                         &write_secs, &write_usecs);
    8572             : 
    8573        3172 :     TimestampDifference(CheckpointStats.ckpt_sync_t,
    8574             :                         CheckpointStats.ckpt_sync_end_t,
    8575             :                         &sync_secs, &sync_usecs);
    8576             : 
    8577             :     /* Accumulate checkpoint timing summary data, in milliseconds. */
    8578        6344 :     BgWriterStats.m_checkpoint_write_time +=
    8579        3172 :         write_secs * 1000 + write_usecs / 1000;
    8580        6344 :     BgWriterStats.m_checkpoint_sync_time +=
    8581        3172 :         sync_secs * 1000 + sync_usecs / 1000;
    8582             : 
    8583             :     /*
    8584             :      * All of the published timing statistics are accounted for.  Only
    8585             :      * continue if a log message is to be written.
    8586             :      */
    8587        3172 :     if (!log_checkpoints)
    8588        2692 :         return;
    8589             : 
    8590         480 :     TimestampDifference(CheckpointStats.ckpt_start_t,
    8591             :                         CheckpointStats.ckpt_end_t,
    8592             :                         &total_secs, &total_usecs);
    8593             : 
    8594             :     /*
    8595             :      * Timing values returned from CheckpointStats are in microseconds.
    8596             :      * Convert to the second plus microsecond form that TimestampDifference
    8597             :      * returns for homogeneous printing.
    8598             :      */
    8599         480 :     longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
    8600         480 :     longest_usecs = CheckpointStats.ckpt_longest_sync -
    8601             :         (uint64) longest_secs * 1000000;
    8602             : 
    8603         480 :     average_sync_time = 0;
    8604         480 :     if (CheckpointStats.ckpt_sync_rels > 0)
    8605           0 :         average_sync_time = CheckpointStats.ckpt_agg_sync_time /
    8606           0 :             CheckpointStats.ckpt_sync_rels;
    8607         480 :     average_secs = (long) (average_sync_time / 1000000);
    8608         480 :     average_usecs = average_sync_time - (uint64) average_secs * 1000000;
    8609             : 
    8610         480 :     elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
    8611             :          "%d WAL file(s) added, %d removed, %d recycled; "
    8612             :          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
    8613             :          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
    8614             :          "distance=%d kB, estimate=%d kB",
    8615             :          restartpoint ? "restartpoint" : "checkpoint",
    8616             :          CheckpointStats.ckpt_bufs_written,
    8617             :          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
    8618             :          CheckpointStats.ckpt_segs_added,
    8619             :          CheckpointStats.ckpt_segs_removed,
    8620             :          CheckpointStats.ckpt_segs_recycled,
    8621             :          write_secs, write_usecs / 1000,
    8622             :          sync_secs, sync_usecs / 1000,
    8623             :          total_secs, total_usecs / 1000,
    8624             :          CheckpointStats.ckpt_sync_rels,
    8625             :          longest_secs, longest_usecs / 1000,
    8626             :          average_secs, average_usecs / 1000,
    8627             :          (int) (PrevCheckPointDistance / 1024.0),
    8628             :          (int) (CheckPointDistanceEstimate / 1024.0));
    8629             : }
    8630             : 
    8631             : /*
    8632             :  * Update the estimate of distance between checkpoints.
    8633             :  *
    8634             :  * The estimate is used to calculate the number of WAL segments to keep
    8635             :  * preallocated, see XLOGfileslop().
                     :  *
                     :  * nbytes is the amount of WAL, in bytes, covered by the checkpoint cycle
                     :  * that just finished; CreateCheckPoint() passes the difference between the
                     :  * new redo pointer and the prior checkpoint's redo pointer.
                     :  *
                     :  * Side effects: stores nbytes in PrevCheckPointDistance and updates
                     :  * CheckPointDistanceEstimate.  Both values are reported (in kB) as
                     :  * "distance" and "estimate" in the checkpoint/restartpoint statistics
                     :  * log line.
    8636             :  */
    8637             : static void
    8638        3172 : UpdateCheckPointDistanceEstimate(uint64 nbytes)
    8639             : {
    8640             :     /*
    8641             :      * To estimate the number of segments consumed between checkpoints, keep a
    8642             :      * moving average of the amount of WAL generated in previous checkpoint
    8643             :      * cycles. However, if the load is bursty, with quiet periods and busy
    8644             :      * periods, we want to cater for the peak load. So instead of a plain
    8645             :      * moving average, let the average decline slowly if the previous cycle
    8646             :      * used less WAL than estimated, but bump it up immediately if it used
    8647             :      * more.
    8648             :      *
                     :      * Concretely: if nbytes exceeds the current estimate, the estimate
                     :      * becomes nbytes; otherwise it becomes 0.90 * estimate + 0.10 * nbytes.
                     :      *
    8649             :      * When checkpoints are triggered by max_wal_size, this should converge to
    8650             :      * CheckpointSegments * wal_segment_size.
    8651             :      *
    8652             :      * Note: This doesn't pay any attention to what caused the checkpoint.
    8653             :      * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
    8654             :      * starting a base backup, are counted the same as those created
    8655             :      * automatically. The slow-decline will largely mask them out, if they are
    8656             :      * not frequent. If they are frequent, it seems reasonable to count them
    8657             :      * in as any others; if you issue a manual checkpoint every 5 minutes and
    8658             :      * never let a timed checkpoint happen, it makes sense to base the
    8659             :      * preallocation on that 5 minute interval rather than whatever
    8660             :      * checkpoint_timeout is set to.
    8661             :      */
    8662        3172 :     PrevCheckPointDistance = nbytes;
    8663        3172 :     if (CheckPointDistanceEstimate < nbytes)
    8664        1408 :         CheckPointDistanceEstimate = nbytes;
    8665             :     else
    8666        1764 :         CheckPointDistanceEstimate =
    8667        1764 :             (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
    8668        3172 : }
    8669             : 
    8670             : /*
    8671             :  * Perform a checkpoint --- either during shutdown, or on-the-fly
    8672             :  *
    8673             :  * flags is a bitwise OR of the following:
    8674             :  *  CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
    8675             :  *  CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
    8676             :  *  CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
    8677             :  *      ignoring checkpoint_completion_target parameter.
    8678             :  *  CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
    8679             :  *      since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
    8680             :  *      CHECKPOINT_END_OF_RECOVERY).
    8681             :  *  CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
    8682             :  *
    8683             :  * Note: flags contains other bits, of interest here only for logging purposes.
    8684             :  * In particular note that this routine is synchronous and does not pay
    8685             :  * attention to CHECKPOINT_WAIT.
    8686             :  *
    8687             :  * If !shutdown then we are writing an online checkpoint. This is a very special
    8688             :  * kind of operation and WAL record because the checkpoint action occurs over
    8689             :  * a period of time yet logically occurs at just a single LSN. The logical
    8690             :  * position of the WAL record (redo ptr) is the same or earlier than the
    8691             :  * physical position. When we replay WAL we locate the checkpoint via its
    8692             :  * physical position then read the redo ptr and actually start replay at the
    8693             :  * earlier logical position. Note that we don't write *anything* to WAL at
    8694             :  * the logical position, so that location could be any other kind of WAL record.
    8695             :  * All of this mechanism allows us to continue working while we checkpoint.
    8696             :  * As a result, the timing of actions is critical here; bear in mind that
    8697             :  * this function will likely take minutes to execute on a busy system.
                     :  *
                     :  * Outline of the steps performed below:
                     :  *  1. Acquire CheckpointLock and reset CheckpointStats.
                     :  *  2. In a critical section: for a shutdown checkpoint mark the control
                     :  *     file DB_SHUTDOWNING; call SyncPreCheckpoint(); then, holding all WAL
                     :  *     insertion locks, choose the REDO pointer and publish it in
                     :  *     RedoRecPtr.
                     :  *  3. Collect the XID, commit-timestamp, OID and multixact counters for
                     :  *     the checkpoint record, then leave the critical section.
                     :  *  4. Wait for backends that are delaying the checkpoint, flush everything
                     :  *     via CheckPointGuts(), and (online checkpoints with standby info
                     :  *     active only) log a standby snapshot.
                     :  *  5. In a second critical section: insert and flush the checkpoint
                     :  *     record and update the control file.
                     :  *  6. Post-checkpoint work: SyncPostCheckpoint(), distance estimate
                     :  *     update, replication slot invalidation, removal of old WAL segments,
                     :  *     preallocation of new ones (not at shutdown), pg_subtrans truncation
                     :  *     (not during recovery), logging, and release of CheckpointLock.
                     :  *
                     :  * Returns early, without writing a checkpoint record, if none of
                     :  * CHECKPOINT_IS_SHUTDOWN, CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FORCE is
                     :  * set and GetLastImportantRecPtr() still equals the checkpoint location
                     :  * stored in the control file.  Raises ERROR if called while recovery is
                     :  * in progress without CHECKPOINT_END_OF_RECOVERY.
    8698             :  */
    8699             : void
    8700        3142 : CreateCheckPoint(int flags)
    8701             : {
    8702             :     bool        shutdown;
    8703             :     CheckPoint  checkPoint;
    8704             :     XLogRecPtr  recptr;
    8705             :     XLogSegNo   _logSegNo;
    8706        3142 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    8707             :     uint32      freespace;
    8708             :     XLogRecPtr  PriorRedoPtr;
    8709             :     XLogRecPtr  curInsert;
    8710             :     XLogRecPtr  last_important_lsn;
    8711             :     VirtualTransactionId *vxids;
    8712             :     int         nvxids;
    8713             : 
    8714             :     /*
    8715             :      * An end-of-recovery checkpoint is really a shutdown checkpoint, just
    8716             :      * issued at a different time.
    8717             :      */
    8718        3142 :     if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
    8719        1208 :         shutdown = true;
    8720             :     else
    8721        1934 :         shutdown = false;
    8722             : 
    8723             :     /* sanity check */
    8724        3142 :     if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
    8725           0 :         elog(ERROR, "can't create a checkpoint during recovery");
    8726             : 
    8727             :     /*
    8728             :      * Initialize InitXLogInsert working areas before entering the critical
    8729             :      * section.  Normally, this is done by the first call to
    8730             :      * RecoveryInProgress() or LocalSetXLogInsertAllowed(), but when creating
    8731             :      * an end-of-recovery checkpoint, the LocalSetXLogInsertAllowed call is
    8732             :      * done below in a critical section, and InitXLogInsert cannot be called
    8733             :      * in a critical section.
    8734             :      */
    8735        3142 :     InitXLogInsert();
    8736             : 
    8737             :     /*
    8738             :      * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
    8739             :      * (This is just pro forma, since in the present system structure there is
    8740             :      * only one process that is allowed to issue checkpoints at any given
    8741             :      * time.)
    8742             :      */
    8743        3142 :     LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
    8744             : 
    8745             :     /*
    8746             :      * Prepare to accumulate statistics.
    8747             :      *
    8748             :      * Note: because it is possible for log_checkpoints to change while a
    8749             :      * checkpoint proceeds, we always accumulate stats, even if
    8750             :      * log_checkpoints is currently off.
    8751             :      */
    8752       34562 :     MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
    8753        3142 :     CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
    8754             : 
    8755             :     /*
    8756             :      * Use a critical section to force system panic if we have trouble.
    8757             :      */
    8758        3142 :     START_CRIT_SECTION();
    8759             : 
    8760        3142 :     if (shutdown)
    8761             :     {
    8762        1208 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    8763        1208 :         ControlFile->state = DB_SHUTDOWNING;
    8764        1208 :         ControlFile->time = (pg_time_t) time(NULL);
    8765        1208 :         UpdateControlFile();
    8766        1208 :         LWLockRelease(ControlFileLock);
    8767             :     }
    8768             : 
    8769             :     /*
    8770             :      * Let smgr prepare for checkpoint; this has to happen before we determine
    8771             :      * the REDO pointer.  Note that smgr must not do anything that'd have to
    8772             :      * be undone if we decide no checkpoint is needed.
    8773             :      */
    8774        3142 :     SyncPreCheckpoint();
    8775             : 
    8776             :     /* Begin filling in the checkpoint WAL record */
    8777       37704 :     MemSet(&checkPoint, 0, sizeof(checkPoint));
    8778        3142 :     checkPoint.time = (pg_time_t) time(NULL);
    8779             : 
    8780             :     /*
    8781             :      * For Hot Standby, derive the oldestActiveXid before we fix the redo
    8782             :      * pointer. This allows us to begin accumulating changes to assemble our
    8783             :      * starting snapshot of locks and transactions.
    8784             :      */
    8785        3142 :     if (!shutdown && XLogStandbyInfoActive())
    8786        1896 :         checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
    8787             :     else
    8788        1246 :         checkPoint.oldestActiveXid = InvalidTransactionId;
    8789             : 
    8790             :     /*
    8791             :      * Get location of last important record before acquiring insert locks (as
    8792             :      * GetLastImportantRecPtr() also locks WAL locks).
    8793             :      */
    8794        3142 :     last_important_lsn = GetLastImportantRecPtr();
    8795             : 
    8796             :     /*
    8797             :      * We must block concurrent insertions while examining insert state to
    8798             :      * determine the checkpoint REDO pointer.
    8799             :      */
    8800        3142 :     WALInsertLockAcquireExclusive();
    8801        3142 :     curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
    8802             : 
    8803             :     /*
    8804             :      * If this isn't a shutdown or forced checkpoint, and if there has been no
    8805             :      * WAL activity requiring a checkpoint, skip it.  The idea here is to
    8806             :      * avoid inserting duplicate checkpoints when the system is idle.
    8807             :      */
    8808        3142 :     if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
    8809             :                   CHECKPOINT_FORCE)) == 0)
    8810             :     {
    8811          12 :         if (last_important_lsn == ControlFile->checkPoint)
    8812             :         {
    8813           0 :             WALInsertLockRelease();
    8814           0 :             LWLockRelease(CheckpointLock);
    8815           0 :             END_CRIT_SECTION();
    8816           0 :             ereport(DEBUG1,
    8817             :                     (errmsg("checkpoint skipped because system is idle")));
    8818           0 :             return;
    8819             :         }
    8820             :     }
    8821             : 
    8822             :     /*
    8823             :      * An end-of-recovery checkpoint is created before anyone is allowed to
    8824             :      * write WAL. To allow us to write the checkpoint record, temporarily
    8825             :      * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
    8826             :      * initialized, which we need here and in AdvanceXLInsertBuffer.)
    8827             :      */
    8828        3142 :     if (flags & CHECKPOINT_END_OF_RECOVERY)
    8829         114 :         LocalSetXLogInsertAllowed();
    8830             : 
    8831        3142 :     checkPoint.ThisTimeLineID = ThisTimeLineID;
    8832        3142 :     if (flags & CHECKPOINT_END_OF_RECOVERY)
    8833         114 :         checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
    8834             :     else
    8835        3028 :         checkPoint.PrevTimeLineID = ThisTimeLineID;
    8836             : 
    8837        3142 :     checkPoint.fullPageWrites = Insert->fullPageWrites;
    8838             : 
    8839             :     /*
    8840             :      * Compute new REDO record ptr = location of next XLOG record.
    8841             :      *
    8842             :      * NB: this is NOT necessarily where the checkpoint record itself will be,
    8843             :      * since other backends may insert more XLOG records while we're off doing
    8844             :      * the buffer flush work.  Those XLOG records are logically after the
    8845             :      * checkpoint, even though physically before it.  Got that?
                     :      *
                     :      * If the current page has no free space left, the next record will
                     :      * start after the header of the following page, so skip over a long
                     :      * page header at a segment boundary or a short one otherwise.
    8846             :      */
    8847        3142 :     freespace = INSERT_FREESPACE(curInsert);
    8848        3142 :     if (freespace == 0)
    8849             :     {
    8850           0 :         if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
    8851           0 :             curInsert += SizeOfXLogLongPHD;
    8852             :         else
    8853           0 :             curInsert += SizeOfXLogShortPHD;
    8854             :     }
    8855        3142 :     checkPoint.redo = curInsert;
    8856             : 
    8857             :     /*
    8858             :      * Here we update the shared RedoRecPtr for future XLogInsert calls; this
    8859             :      * must be done while holding all the insertion locks.
    8860             :      *
    8861             :      * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
    8862             :      * pointing past where it really needs to point.  This is okay; the only
    8863             :      * consequence is that XLogInsert might back up whole buffers that it
    8864             :      * didn't really need to.  We can't postpone advancing RedoRecPtr because
    8865             :      * XLogInserts that happen while we are dumping buffers must assume that
    8866             :      * their buffer changes are not included in the checkpoint.
    8867             :      */
    8868        3142 :     RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
    8869             : 
    8870             :     /*
    8871             :      * Now we can release the WAL insertion locks, allowing other xacts to
    8872             :      * proceed while we are flushing disk buffers.
    8873             :      */
    8874        3142 :     WALInsertLockRelease();
    8875             : 
    8876             :     /* Update the info_lck-protected copy of RedoRecPtr as well */
    8877        3142 :     SpinLockAcquire(&XLogCtl->info_lck);
    8878        3142 :     XLogCtl->RedoRecPtr = checkPoint.redo;
    8879        3142 :     SpinLockRelease(&XLogCtl->info_lck);
    8880             : 
    8881             :     /*
    8882             :      * If enabled, log checkpoint start.  We postpone this until now so as not
    8883             :      * to log anything if we decided to skip the checkpoint.
    8884             :      */
    8885        3142 :     if (log_checkpoints)
    8886         462 :         LogCheckpointStart(flags, false);
    8887             : 
    8888             :     TRACE_POSTGRESQL_CHECKPOINT_START(flags);
    8889             : 
    8890             :     /*
    8891             :      * Get the other info we need for the checkpoint record.
    8892             :      *
    8893             :      * We don't need to save oldestClogXid in the checkpoint, it only matters
    8894             :      * for the short period in which clog is being truncated, and if we crash
    8895             :      * during that we'll redo the clog truncation and fix up oldestClogXid
    8896             :      * there.
    8897             :      */
    8898        3142 :     LWLockAcquire(XidGenLock, LW_SHARED);
    8899        3142 :     checkPoint.nextFullXid = ShmemVariableCache->nextFullXid;
    8900        3142 :     checkPoint.oldestXid = ShmemVariableCache->oldestXid;
    8901        3142 :     checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
    8902        3142 :     LWLockRelease(XidGenLock);
    8903             : 
    8904        3142 :     LWLockAcquire(CommitTsLock, LW_SHARED);
    8905        3142 :     checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid;
    8906        3142 :     checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid;
    8907        3142 :     LWLockRelease(CommitTsLock);
    8908             : 
    8909        3142 :     LWLockAcquire(OidGenLock, LW_SHARED);
    8910        3142 :     checkPoint.nextOid = ShmemVariableCache->nextOid;
    8911        3142 :     if (!shutdown)
    8912        1934 :         checkPoint.nextOid += ShmemVariableCache->oidCount;
    8913        3142 :     LWLockRelease(OidGenLock);
    8914             : 
    8915        3142 :     MultiXactGetCheckptMulti(shutdown,
    8916             :                              &checkPoint.nextMulti,
    8917             :                              &checkPoint.nextMultiOffset,
    8918             :                              &checkPoint.oldestMulti,
    8919             :                              &checkPoint.oldestMultiDB);
    8920             : 
    8921             :     /*
    8922             :      * Having constructed the checkpoint record, ensure all shmem disk buffers
    8923             :      * and commit-log buffers are flushed to disk.
    8924             :      *
    8925             :      * This I/O could fail for various reasons.  If so, we will fail to
    8926             :      * complete the checkpoint, but there is no reason to force a system
    8927             :      * panic. Accordingly, exit critical section while doing it.
    8928             :      */
    8929        3142 :     END_CRIT_SECTION();
    8930             : 
    8931             :     /*
    8932             :      * In some cases there are groups of actions that must all occur on one
    8933             :      * side or the other of a checkpoint record. Before flushing the
    8934             :      * checkpoint record we must explicitly wait for any backend currently
    8935             :      * performing those groups of actions.
    8936             :      *
    8937             :      * One example is end of transaction, so we must wait for any transactions
    8938             :      * that are currently in commit critical sections.  If an xact inserted
    8939             :      * its commit record into XLOG just before the REDO point, then a crash
    8940             :      * restart from the REDO point would not replay that record, which means
    8941             :      * that our flushing had better include the xact's update of pg_xact.  So
    8942             :      * we wait till he's out of his commit critical section before proceeding.
    8943             :      * See notes in RecordTransactionCommit().
    8944             :      *
    8945             :      * Because we've already released the insertion locks, this test is a bit
    8946             :      * fuzzy: it is possible that we will wait for xacts we didn't really need
    8947             :      * to wait for.  But the delay should be short and it seems better to make
    8948             :      * checkpoint take a bit longer than to hold off insertions longer than
    8949             :      * necessary. (In fact, the whole reason we have this issue is that xact.c
    8950             :      * does commit record XLOG insertion and clog update as two separate steps
    8951             :      * protected by different locks, but again that seems best on grounds of
    8952             :      * minimizing lock contention.)
    8953             :      *
    8954             :      * A transaction that has not yet set delayChkpt when we look cannot be at
    8955             :      * risk, since he's not inserted his commit record yet; and one that's
    8956             :      * already cleared it is not at risk either, since he's done fixing clog
    8957             :      * and we will correctly flush the update below.  So we cannot miss any
    8958             :      * xacts we need to wait for.
    8959             :      */
    8960        3142 :     vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
    8961        3142 :     if (nvxids > 0)
    8962             :     {
    8963             :         do
    8964             :         {
    8965           4 :             pg_usleep(10000L);  /* wait for 10 msec */
    8966           4 :         } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
    8967             :     }
    8968        3142 :     pfree(vxids);
    8969             : 
    8970        3142 :     CheckPointGuts(checkPoint.redo, flags);
    8971             : 
    8972             :     /*
    8973             :      * Take a snapshot of running transactions and write this to WAL. This
    8974             :      * allows us to reconstruct the state of running transactions during
    8975             :      * archive recovery, if required. Skip, if this info disabled.
    8976             :      *
    8977             :      * If we are shutting down, or Startup process is completing crash
    8978             :      * recovery we don't need to write running xact data.
    8979             :      */
    8980        3142 :     if (!shutdown && XLogStandbyInfoActive())
    8981        1896 :         LogStandbySnapshot();
    8982             : 
    8983        3142 :     START_CRIT_SECTION();
    8984             : 
    8985             :     /*
    8986             :      * Now insert the checkpoint record into XLOG.
    8987             :      */
    8988        3142 :     XLogBeginInsert();
    8989        3142 :     XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
    8990        3142 :     recptr = XLogInsert(RM_XLOG_ID,
    8991             :                         shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
    8992             :                         XLOG_CHECKPOINT_ONLINE);
    8993             : 
    8994        3142 :     XLogFlush(recptr);
    8995             : 
    8996             :     /*
    8997             :      * We mustn't write any new WAL after a shutdown checkpoint, or it will be
    8998             :      * overwritten at next startup.  No-one should even try, this just allows
    8999             :      * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
    9000             :      * to just temporarily disable writing until the system has exited
    9001             :      * recovery.
    9002             :      */
    9003        3142 :     if (shutdown)
    9004             :     {
    9005        1208 :         if (flags & CHECKPOINT_END_OF_RECOVERY)
    9006         114 :             LocalXLogInsertAllowed = -1;    /* return to "check" state */
    9007             :         else
    9008        1094 :             LocalXLogInsertAllowed = 0; /* never again write WAL */
    9009             :     }
    9010             : 
    9011             :     /*
    9012             :      * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
    9013             :      * = end of actual checkpoint record.
                     :      *
                     :      * For a shutdown checkpoint the record must start exactly at the REDO
                     :      * pointer chosen above; anything else means some other WAL was inserted
                     :      * in between, which is treated as fatal.
    9014             :      */
    9015        3142 :     if (shutdown && checkPoint.redo != ProcLastRecPtr)
    9016           0 :         ereport(PANIC,
    9017             :                 (errmsg("concurrent write-ahead log activity while database system is shutting down")));
    9018             : 
    9019             :     /*
    9020             :      * Remember the prior checkpoint's redo ptr for
    9021             :      * UpdateCheckPointDistanceEstimate()
                     :      * (it must be read here, before checkPointCopy is overwritten below).
    9022             :      */
    9023        3142 :     PriorRedoPtr = ControlFile->checkPointCopy.redo;
    9024             : 
    9025             :     /*
    9026             :      * Update the control file.
    9027             :      */
    9028        3142 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    9029        3142 :     if (shutdown)
    9030        1208 :         ControlFile->state = DB_SHUTDOWNED;
    9031        3142 :     ControlFile->checkPoint = ProcLastRecPtr;
    9032        3142 :     ControlFile->checkPointCopy = checkPoint;
    9033        3142 :     ControlFile->time = (pg_time_t) time(NULL);
    9034             :     /* crash recovery should always recover to the end of WAL */
    9035        3142 :     ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
    9036        3142 :     ControlFile->minRecoveryPointTLI = 0;
    9037             : 
    9038             :     /*
    9039             :      * Persist unloggedLSN value. It's reset on crash recovery, so this goes
    9040             :      * unused on non-shutdown checkpoints, but seems useful to store it always
    9041             :      * for debugging purposes.
    9042             :      */
    9043        3142 :     SpinLockAcquire(&XLogCtl->ulsn_lck);
    9044        3142 :     ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
    9045        3142 :     SpinLockRelease(&XLogCtl->ulsn_lck);
    9046             : 
    9047        3142 :     UpdateControlFile();
    9048        3142 :     LWLockRelease(ControlFileLock);
    9049             : 
    9050             :     /* Update shared-memory copy of checkpoint XID/epoch */
    9051        3142 :     SpinLockAcquire(&XLogCtl->info_lck);
    9052        3142 :     XLogCtl->ckptFullXid = checkPoint.nextFullXid;
    9053        3142 :     SpinLockRelease(&XLogCtl->info_lck);
    9054             : 
    9055             :     /*
    9056             :      * We are now done with critical updates; no need for system panic if we
    9057             :      * have trouble while fooling with old log segments.
    9058             :      */
    9059        3142 :     END_CRIT_SECTION();
    9060             : 
    9061             :     /*
    9062             :      * Let smgr do post-checkpoint cleanup (eg, deleting old files).
    9063             :      */
    9064        3142 :     SyncPostCheckpoint();
    9065             : 
    9066             :     /*
    9067             :      * Update the average distance between checkpoints if the prior checkpoint
    9068             :      * exists.
    9069             :      */
    9070        3142 :     if (PriorRedoPtr != InvalidXLogRecPtr)
    9071        3142 :         UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
    9072             : 
    9073             :     /*
    9074             :      * Delete old log files, those no longer needed for last checkpoint to
    9075             :      * prevent the disk holding the xlog from growing full.
    9076             :      */
    9077        3142 :     XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
    9078        3142 :     KeepLogSeg(recptr, &_logSegNo);
    9079        3142 :     InvalidateObsoleteReplicationSlots(_logSegNo);
    9080        3142 :     _logSegNo--;    /* NOTE(review): presumably steps back to the last removable segment -- confirm */
    9081        3142 :     RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr);
    9082             : 
    9083             :     /*
    9084             :      * Make more log segments if needed.  (Do this after recycling old log
    9085             :      * segments, since that may supply some of the needed files.)
    9086             :      */
    9087        3142 :     if (!shutdown)
    9088        1934 :         PreallocXlogFiles(recptr);
    9089             : 
    9090             :     /*
    9091             :      * Truncate pg_subtrans if possible.  We can throw away all data before
    9092             :      * the oldest XMIN of any running transaction.  No future transaction will
    9093             :      * attempt to reference any pg_subtrans entry older than that (see Asserts
    9094             :      * in subtrans.c).  During recovery, though, we mustn't do this because
    9095             :      * StartupSUBTRANS hasn't been called yet.
    9096             :      */
    9097        3142 :     if (!RecoveryInProgress())
    9098        3028 :         TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
    9099             : 
    9100             :     /* Real work is done, but log and update stats before releasing lock. */
    9101        3142 :     LogCheckpointEnd(false);
    9102             : 
    9103             :     TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
    9104             :                                      NBuffers,
    9105             :                                      CheckpointStats.ckpt_segs_added,
    9106             :                                      CheckpointStats.ckpt_segs_removed,
    9107             :                                      CheckpointStats.ckpt_segs_recycled);
    9108             : 
    9109        3142 :     LWLockRelease(CheckpointLock);
    9110             : }
    9111             : 
    9112             : /*
    9113             :  * Mark the end of recovery in WAL though without running a full checkpoint.
    9114             :  * We can expect that a restartpoint is likely to be in progress as we
    9115             :  * do this, though we are unwilling to wait for it to complete. So be
    9116             :  * careful to avoid taking the CheckpointLock anywhere here.
    9117             :  *
    9118             :  * CreateRestartPoint() allows for the case where recovery may end before
    9119             :  * the restartpoint completes so there is no concern of concurrent behaviour.
                     :  *
                     :  * The XLOG_END_OF_RECOVERY record carries the current timestamp plus
                     :  * ThisTimeLineID and XLogCtl->PrevTimeLineID, the latter two read while
                     :  * holding the WAL insertion locks exclusively.  After the record has been
                     :  * inserted and flushed, the control file's minRecoveryPoint is set to the
                     :  * end of the record (and minRecoveryPointTLI to ThisTimeLineID), all within
                     :  * one critical section.
                     :  *
                     :  * WAL insertion is enabled locally for the duration of the call and put
                     :  * back into the "check" state (-1) before returning.  Raises ERROR if
                     :  * recovery is not in progress.
    9120             :  */
    9121             : static void
    9122          48 : CreateEndOfRecoveryRecord(void)
    9123             : {
    9124             :     xl_end_of_recovery xlrec;
    9125             :     XLogRecPtr  recptr;
    9126             : 
    9127             :     /* sanity check */
    9128          48 :     if (!RecoveryInProgress())
    9129           0 :         elog(ERROR, "can only be used to end recovery");
    9130             : 
    9131          48 :     xlrec.end_time = GetCurrentTimestamp();
    9132             : 
    9133          48 :     WALInsertLockAcquireExclusive();
    9134          48 :     xlrec.ThisTimeLineID = ThisTimeLineID;
    9135          48 :     xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
    9136          48 :     WALInsertLockRelease();
    9137             : 
    9138          48 :     LocalSetXLogInsertAllowed();
    9139             : 
    9140          48 :     START_CRIT_SECTION();
    9141             : 
    9142          48 :     XLogBeginInsert();
    9143          48 :     XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery));
    9144          48 :     recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);
    9145             : 
    9146          48 :     XLogFlush(recptr);
    9147             : 
    9148             :     /*
    9149             :      * Update the control file so that crash recovery can follow the timeline
    9150             :      * changes to this point.
    9151             :      */
    9152          48 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    9153          48 :     ControlFile->time = (pg_time_t) time(NULL);
    9154          48 :     ControlFile->minRecoveryPoint = recptr;
    9155          48 :     ControlFile->minRecoveryPointTLI = ThisTimeLineID;
    9156          48 :     UpdateControlFile();
    9157          48 :     LWLockRelease(ControlFileLock);
    9158             : 
    9159          48 :     END_CRIT_SECTION();
    9160             : 
    9161          48 :     LocalXLogInsertAllowed = -1;    /* return to "check" state */
    9162          48 : }
    9163             : 
    9164             : /*
    9165             :  * Flush all data in shared memory to disk, and fsync
    9166             :  *
    9167             :  * This is the common code shared between regular checkpoints and
    9168             :  * recovery restartpoints.
                     :  *
                     :  * checkPointRedo is the checkpoint's REDO pointer; within this function it
                     :  * is handed only to CheckPointTwoPhase().  flags is the CHECKPOINT_* bitmask
                     :  * of the checkpoint; within this function it is handed only to
                     :  * CheckPointBuffers().
                     :  *
                     :  * The per-subsystem routines run in the fixed order written below, with
                     :  * two-phase state deliberately handled last.  CreateCheckPoint() calls this
                     :  * outside of its critical sections, so that an I/O failure here fails the
                     :  * checkpoint without forcing a system panic.
    9169             :  */
    9170             : static void
    9171        3172 : CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
    9172             : {
    9173        3172 :     CheckPointCLOG();
    9174        3172 :     CheckPointCommitTs();
    9175        3172 :     CheckPointSUBTRANS();
    9176        3172 :     CheckPointMultiXact();
    9177        3172 :     CheckPointPredicate();
    9178        3172 :     CheckPointRelationMap();
    9179        3172 :     CheckPointReplicationSlots();
    9180        3172 :     CheckPointSnapBuild();
    9181        3172 :     CheckPointLogicalRewriteHeap();
    9182        3172 :     CheckPointBuffers(flags);   /* performs all required fsyncs */
    9183        3172 :     CheckPointReplicationOrigin();
    9184             :     /* We deliberately delay 2PC checkpointing as long as possible */
    9185        3172 :     CheckPointTwoPhase(checkPointRedo);
    9186        3172 : }
    9187             : 
    9188             : /*
    9189             :  * Save a checkpoint for recovery restart if appropriate
    9190             :  *
    9191             :  * This function is called each time a checkpoint record is read from XLOG.
    9192             :  * It must determine whether the checkpoint represents a safe restartpoint or
    9193             :  * not.  If so, the checkpoint record is stashed in shared memory so that
    9194             :  * CreateRestartPoint can consult it.  (Note that the latter function is
    9195             :  * executed by the checkpointer, while this one will be executed by the
    9196             :  * startup process.)
    9197             :  */
    9198             : static void
    9199         202 : RecoveryRestartPoint(const CheckPoint *checkPoint)
    9200             : {
    9201             :     /*
    9202             :      * Refrain from creating a restartpoint if we have seen any
    9203             :      * references to non-existent pages. Restarting recovery from the
    9204             :      * restartpoint would not see the references, so we would lose the
    9205             :      * cross-check that the pages belonged to a relation that was dropped
    9206             :      * later.
    9207             :      */
    9208         202 :     if (XLogHaveInvalidPages())
    9209             :     {
    9210           0 :         elog(trace_recovery(DEBUG2),
    9211             :              "could not record restart point at %X/%X because there "
    9212             :              "are unresolved references to invalid pages",
    9213             :              (uint32) (checkPoint->redo >> 32),
    9214             :              (uint32) checkPoint->redo);
    9215           0 :         return;
    9216             :     }
    9217             : 
    9218             :     /*
    9219             :      * Copy the checkpoint record to shared memory, so that checkpointer can
    9220             :      * work out the next time it wants to perform a restartpoint.
    9221             :      */
    9222         202 :     SpinLockAcquire(&XLogCtl->info_lck);
    9223         202 :     XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
    9224         202 :     XLogCtl->lastCheckPointEndPtr = EndRecPtr;
    9225         202 :     XLogCtl->lastCheckPoint = *checkPoint;
    9226         202 :     SpinLockRelease(&XLogCtl->info_lck);
    9227             : }
    9228             : 
    9229             : /*
    9230             :  * Establish a restartpoint if possible.
    9231             :  *
    9232             :  * This is similar to CreateCheckPoint, but is used during WAL recovery
    9233             :  * to establish a point from which recovery can roll forward without
    9234             :  * replaying the entire recovery log.
    9235             :  *
    9236             :  * Returns true if a new restartpoint was established. We can only establish
    9237             :  * a restartpoint if we have replayed a safe checkpoint record since last
    9238             :  * restartpoint.
    9239             :  */
    9240             : bool
    9241          76 : CreateRestartPoint(int flags)
    9242             : {
    9243             :     XLogRecPtr  lastCheckPointRecPtr;
    9244             :     XLogRecPtr  lastCheckPointEndPtr;
    9245             :     CheckPoint  lastCheckPoint;
    9246             :     XLogRecPtr  PriorRedoPtr;
    9247             :     XLogRecPtr  receivePtr;
    9248             :     XLogRecPtr  replayPtr;
    9249             :     TimeLineID  replayTLI;
    9250             :     XLogRecPtr  endptr;
    9251             :     XLogSegNo   _logSegNo;
    9252             :     TimestampTz xtime;
    9253             : 
    9254             :     /*
    9255             :      * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
    9256             :      * happens at a time.
    9257             :      */
    9258          76 :     LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
    9259             : 
    9260             :     /* Get a local copy of the last safe checkpoint record. */
    9261          76 :     SpinLockAcquire(&XLogCtl->info_lck);
    9262          76 :     lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
    9263          76 :     lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
    9264          76 :     lastCheckPoint = XLogCtl->lastCheckPoint;
    9265          76 :     SpinLockRelease(&XLogCtl->info_lck);
    9266             : 
    9267             :     /*
    9268             :      * Check that we're still in recovery mode. It's ok if we exit recovery
    9269             :      * mode after this check, the restart point is valid anyway.
    9270             :      */
    9271          76 :     if (!RecoveryInProgress())
    9272             :     {
    9273           0 :         ereport(DEBUG2,
    9274             :                 (errmsg("skipping restartpoint, recovery has already ended")));
    9275           0 :         LWLockRelease(CheckpointLock);
    9276           0 :         return false;
    9277             :     }
    9278             : 
    9279             :     /*
    9280             :      * If the last checkpoint record we've replayed is already our last
    9281             :      * restartpoint, we can't perform a new restart point. We still update
    9282             :      * minRecoveryPoint in that case, so that if this is a shutdown restart
    9283             :      * point, we won't start up earlier than before. That's not strictly
    9284             :      * necessary, but when hot standby is enabled, it would be rather weird if
    9285             :      * the database opened up for read-only connections at a point-in-time
    9286             :      * before the last shutdown. Such time travel is still possible in case of
    9287             :      * immediate shutdown, though.
    9288             :      *
    9289             :      * We don't explicitly advance minRecoveryPoint when we do create a
    9290             :      * restartpoint. It's assumed that flushing the buffers will do that as a
    9291             :      * side-effect.
    9292             :      */
    9293          76 :     if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
    9294          74 :         lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
    9295             :     {
    9296          46 :         ereport(DEBUG2,
    9297             :                 (errmsg("skipping restartpoint, already performed at %X/%X",
    9298             :                         (uint32) (lastCheckPoint.redo >> 32),
    9299             :                         (uint32) lastCheckPoint.redo)));
    9300             : 
    9301          46 :         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
    9302          46 :         if (flags & CHECKPOINT_IS_SHUTDOWN)
    9303             :         {
    9304          24 :             LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    9305          24 :             ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
    9306          24 :             ControlFile->time = (pg_time_t) time(NULL);
    9307          24 :             UpdateControlFile();
    9308          24 :             LWLockRelease(ControlFileLock);
    9309             :         }
    9310          46 :         LWLockRelease(CheckpointLock);
    9311          46 :         return false;
    9312             :     }
    9313             : 
    9314             :     /*
    9315             :      * Update the shared RedoRecPtr so that the startup process can calculate
    9316             :      * the number of segments replayed since last restartpoint, and request a
    9317             :      * restartpoint if it exceeds CheckPointSegments.
    9318             :      *
    9319             :      * Like in CreateCheckPoint(), hold off insertions to update it, although
    9320             :      * during recovery this is just pro forma, because no WAL insertions are
    9321             :      * happening.
    9322             :      */
    9323          30 :     WALInsertLockAcquireExclusive();
    9324          30 :     RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
    9325          30 :     WALInsertLockRelease();
    9326             : 
    9327             :     /* Also update the info_lck-protected copy */
    9328          30 :     SpinLockAcquire(&XLogCtl->info_lck);
    9329          30 :     XLogCtl->RedoRecPtr = lastCheckPoint.redo;
    9330          30 :     SpinLockRelease(&XLogCtl->info_lck);
    9331             : 
    9332             :     /*
    9333             :      * Prepare to accumulate statistics.
    9334             :      *
    9335             :      * Note: because it is possible for log_checkpoints to change while a
    9336             :      * checkpoint proceeds, we always accumulate stats, even if
    9337             :      * log_checkpoints is currently off.
    9338             :      */
    9339         330 :     MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
    9340          30 :     CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
    9341             : 
    9342          30 :     if (log_checkpoints)
    9343          18 :         LogCheckpointStart(flags, true);
    9344             : 
    9345          30 :     CheckPointGuts(lastCheckPoint.redo, flags);
    9346             : 
    9347             :     /*
    9348             :      * Remember the prior checkpoint's redo ptr for
    9349             :      * UpdateCheckPointDistanceEstimate()
    9350             :      */
    9351          30 :     PriorRedoPtr = ControlFile->checkPointCopy.redo;
    9352             : 
    9353             :     /*
    9354             :      * Update pg_control, using current time.  Check that it still shows
    9355             :      * DB_IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
    9356             :      * this is a quick hack to make sure nothing really bad happens if somehow
    9357             :      * we get here after the end-of-recovery checkpoint.
    9358             :      */
    9359          30 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    9360          30 :     if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
    9361          30 :         ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
    9362             :     {
    9363          30 :         ControlFile->checkPoint = lastCheckPointRecPtr;
    9364          30 :         ControlFile->checkPointCopy = lastCheckPoint;
    9365          30 :         ControlFile->time = (pg_time_t) time(NULL);
    9366             : 
    9367             :         /*
    9368             :          * Ensure minRecoveryPoint is past the checkpoint record.  Normally,
    9369             :          * this will have happened already while writing out dirty buffers,
    9370             :          * but not necessarily - e.g. because no buffers were dirtied.  We do
    9371             :          * this because a non-exclusive base backup uses minRecoveryPoint to
    9372             :          * determine which WAL files must be included in the backup, and the
    9373             :          * file (or files) containing the checkpoint record must be included,
    9374             :          * at a minimum. Note that for an ordinary restart of recovery there's
    9375             :          * no value in having the minimum recovery point any earlier than this
    9376             :          * anyway, because redo will begin just after the checkpoint record.
    9377             :          */
    9378          30 :         if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
    9379             :         {
    9380           8 :             ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
    9381           8 :             ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
    9382             : 
    9383             :             /* update local copy */
    9384           8 :             minRecoveryPoint = ControlFile->minRecoveryPoint;
    9385           8 :             minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    9386             :         }
    9387          30 :         if (flags & CHECKPOINT_IS_SHUTDOWN)
    9388           8 :             ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
    9389          30 :         UpdateControlFile();
    9390             :     }
    9391          30 :     LWLockRelease(ControlFileLock);
    9392             : 
    9393             :     /*
    9394             :      * Update the average distance between checkpoints/restartpoints if the
    9395             :      * prior checkpoint exists.
    9396             :      */
    9397          30 :     if (PriorRedoPtr != InvalidXLogRecPtr)
    9398          30 :         UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
    9399             : 
    9400             :     /*
    9401             :      * Delete old log files that are no longer needed for the last
    9402             :      * restartpoint, to prevent the disk holding the xlog from filling up.
    9403             :      */
    9404          30 :     XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
    9405             : 
    9406             :     /*
    9407             :      * Retreat _logSegNo using the current end of xlog replayed or received,
    9408             :      * whichever is later.
    9409             :      */
    9410          30 :     receivePtr = GetWalRcvFlushRecPtr(NULL, NULL);
    9411          30 :     replayPtr = GetXLogReplayRecPtr(&replayTLI);
    9412          30 :     endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
    9413          30 :     KeepLogSeg(endptr, &_logSegNo);
    9414          30 :     InvalidateObsoleteReplicationSlots(_logSegNo);
    9415          30 :     _logSegNo--;
    9416             : 
    9417             :     /*
    9418             :      * Try to recycle segments on a useful timeline. If we've been promoted
    9419             :      * since the beginning of this restartpoint, use the new timeline chosen
    9420             :      * at end of recovery (RecoveryInProgress() sets ThisTimeLineID in that
    9421             :      * case). If we're still in recovery, use the timeline we're currently
    9422             :      * replaying.
    9423             :      *
    9424             :      * There is no guarantee that the WAL segments will be useful on the
    9425             :      * current timeline; if recovery proceeds to a new timeline right after
    9426             :      * this, the pre-allocated WAL segments on this timeline will not be used,
    9427             :      * and will go wasted until recycled on the next restartpoint. We'll live
    9428             :      * with that.
    9429             :      */
    9430          30 :     if (RecoveryInProgress())
    9431          30 :         ThisTimeLineID = replayTLI;
    9432             : 
    9433          30 :     RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr);
    9434             : 
    9435             :     /*
    9436             :      * Make more log segments if needed.  (Do this after recycling old log
    9437             :      * segments, since that may supply some of the needed files.)
    9438             :      */
    9439          30 :     PreallocXlogFiles(endptr);
    9440             : 
    9441             :     /*
    9442             :      * ThisTimeLineID is normally not set when we're still in recovery.
    9443             :      * However, recycling/preallocating segments above needed ThisTimeLineID
    9444             :      * to determine which timeline to install the segments on. Reset it now,
    9445             :      * to restore the normal state of affairs for debugging purposes.
    9446             :      */
    9447          30 :     if (RecoveryInProgress())
    9448          30 :         ThisTimeLineID = 0;
    9449             : 
    9450             :     /*
    9451             :      * Truncate pg_subtrans if possible.  We can throw away all data before
    9452             :      * the oldest XMIN of any running transaction.  No future transaction will
    9453             :      * attempt to reference any pg_subtrans entry older than that (see Asserts
    9454             :      * in subtrans.c).  When hot standby is disabled, though, we mustn't do
    9455             :      * this because StartupSUBTRANS hasn't been called yet.
    9456             :      */
    9457          30 :     if (EnableHotStandby)
    9458          30 :         TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
    9459             : 
    9460             :     /* Real work is done, but log and update before releasing lock. */
    9461          30 :     LogCheckpointEnd(true);
    9462             : 
    9463          30 :     xtime = GetLatestXTime();
    9464          30 :     ereport((log_checkpoints ? LOG : DEBUG2),
    9465             :             (errmsg("recovery restart point at %X/%X",
    9466             :                     (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
    9467             :              xtime ? errdetail("Last completed transaction was at log time %s.",
    9468             :                                timestamptz_to_str(xtime)) : 0));
    9469             : 
    9470          30 :     LWLockRelease(CheckpointLock);
    9471             : 
    9472             :     /*
    9473             :      * Finally, execute archive_cleanup_command, if any.
    9474             :      */
    9475          30 :     if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != 0)
    9476           0 :         ExecuteRecoveryCommand(archiveCleanupCommand,
    9477             :                                "archive_cleanup_command",
    9478             :                                false);
    9479             : 
    9480          30 :     return true;
    9481             : }
    9482             : 
    9483             : /*
    9484             :  * Report availability of WAL for the given target LSN
    9485             :  *      (typically a slot's restart_lsn)
    9486             :  *
    9487             :  * Returns one of the following enum values:
    9488             :  * * WALAVAIL_NORMAL means targetLSN is available because it is in the range
    9489             :  *   of max_wal_size.
    9490             :  *
    9491             :  * * WALAVAIL_RESERVED means it is still available by preserving extra
    9492             :  *   segments beyond max_wal_size. If max_slot_wal_keep_size is smaller
    9493             :  *   than max_wal_size, this state is not returned.
    9494             :  *
    9495             :  * * WALAVAIL_REMOVED means it is definitely lost. A replication stream on
    9496             :  *   a slot with this LSN cannot continue.
    9497             :  *
    9498             :  * * WALAVAIL_INVALID_LSN means the slot hasn't been set to reserve WAL.
    9499             :  */
    9500             : WALAvailability
    9501         182 : GetWALAvailability(XLogRecPtr targetLSN)
    9502             : {
    9503             :     XLogRecPtr  currpos;        /* current write LSN */
    9504             :     XLogSegNo   currSeg;        /* segid of currpos */
    9505             :     XLogSegNo   targetSeg;      /* segid of targetLSN */
    9506             :     XLogSegNo   oldestSeg;      /* actual oldest segid */
    9507             :     XLogSegNo   oldestSegMaxWalSize;    /* oldest segid kept by max_wal_size */
    9508         182 :     XLogSegNo   oldestSlotSeg = InvalidXLogRecPtr;  /* oldest segid kept by
    9509             :                                                      * slot */
    9510             :     uint64      keepSegs;
    9511             : 
    9512             :     /* slot does not reserve WAL. Either deactivated, or has never been active */
    9513         182 :     if (XLogRecPtrIsInvalid(targetLSN))
    9514          10 :         return WALAVAIL_INVALID_LSN;
    9515             : 
    9516         172 :     currpos = GetXLogWriteRecPtr();
    9517             : 
    9518             :     /* calculate oldest segment currently needed by slots */
    9519         172 :     XLByteToSeg(targetLSN, targetSeg, wal_segment_size);
    9520         172 :     KeepLogSeg(currpos, &oldestSlotSeg);
    9521             : 
    9522             :     /*
    9523             :      * Find the oldest extant segment file. We get 1 until a checkpoint removes
    9524             :      * the first WAL segment file since startup, which can make the reported
    9525             :      * status wrong under certain abnormal conditions, but that does no harm.
    9526             :      */
    9527         172 :     oldestSeg = XLogGetLastRemovedSegno() + 1;
    9528             : 
    9529             :     /* calculate oldest segment by max_wal_size and wal_keep_segments */
    9530         172 :     XLByteToSeg(currpos, currSeg, wal_segment_size);
    9531         344 :     keepSegs = ConvertToXSegs(Max(max_wal_size_mb, wal_keep_segments),
    9532         172 :                               wal_segment_size) + 1;
    9533             : 
    9534         172 :     if (currSeg > keepSegs)
    9535          16 :         oldestSegMaxWalSize = currSeg - keepSegs;
    9536             :     else
    9537         156 :         oldestSegMaxWalSize = 1;
    9538             : 
    9539             :     /*
    9540             :      * If max_slot_wal_keep_size has changed after the last call, the segment
    9541             :      * that would have been kept by the current setting might have been lost
    9542             :      * under the previous setting. No point in showing normal or reserved
    9543             :      * status values if the targetSeg is known to be lost.
    9544             :      */
    9545         172 :     if (targetSeg >= oldestSeg)
    9546             :     {
    9547             :         /*
    9548             :          * show "normal" when targetSeg is within max_wal_size, even if
    9549             :          * max_slot_wal_keep_size is smaller than max_wal_size.
    9550             :          */
    9551         172 :         if ((max_slot_wal_keep_size_mb <= 0 ||
    9552         172 :              max_slot_wal_keep_size_mb >= max_wal_size_mb) &&
    9553             :             oldestSegMaxWalSize <= targetSeg)
    9554         168 :             return WALAVAIL_NORMAL;
    9555             : 
    9556             :         /* being retained by slots */
    9557           4 :         if (oldestSlotSeg <= targetSeg)
    9558           2 :             return WALAVAIL_RESERVED;
    9559             :     }
    9560             : 
    9561             :     /* Definitely lost */
    9562           2 :     return WALAVAIL_REMOVED;
    9563             : }
    9564             : 
    9565             : 
    9566             : /*
    9567             :  * Retreat *logSegNo to the last segment that we need to retain because of
    9568             :  * either wal_keep_segments or replication slots.
    9569             :  *
    9570             :  * This is calculated by subtracting wal_keep_segments from the given xlog
    9571             :  * location, recptr and by making sure that that result is below the
    9572             :  * requirement of replication slots.  For the latter criterion we do consider
    9573             :  * the effects of max_slot_wal_keep_size: reserve at most that much space back
    9574             :  * from recptr.
    9575             :  */
    9576             : static void
    9577        3344 : KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
    9578             : {
    9579             :     XLogSegNo   currSegNo;
    9580             :     XLogSegNo   segno;
    9581             :     XLogRecPtr  keep;
    9582             : 
    9583        3344 :     XLByteToSeg(recptr, currSegNo, wal_segment_size);
    9584        3344 :     segno = currSegNo;
    9585             : 
    9586             :     /*
    9587             :      * Calculate how many segments are kept by slots first, adjusting for
    9588             :      * max_slot_wal_keep_size.
    9589             :      */
    9590        3344 :     keep = XLogGetReplicationSlotMinimumLSN();
    9591        3344 :     if (keep != InvalidXLogRecPtr)
    9592             :     {
    9593         238 :         XLByteToSeg(keep, segno, wal_segment_size);
    9594             : 
    9595             :         /* Cap by max_slot_wal_keep_size ... */
    9596         238 :         if (max_slot_wal_keep_size_mb >= 0)
    9597             :         {
    9598             :             XLogRecPtr  slot_keep_segs;
    9599             : 
    9600          10 :             slot_keep_segs =
    9601          10 :                 ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size);
    9602             : 
    9603          10 :             if (currSegNo - segno > slot_keep_segs)
    9604           2 :                 segno = currSegNo - slot_keep_segs;
    9605             :         }
    9606             :     }
    9607             : 
    9608             :     /* but, keep at least wal_keep_segments if that's set */
    9609        3344 :     if (wal_keep_segments > 0 && currSegNo - segno < wal_keep_segments)
    9610             :     {
    9611             :         /* avoid underflow, don't go below 1 */
    9612          90 :         if (currSegNo <= wal_keep_segments)
    9613          88 :             segno = 1;
    9614             :         else
    9615           2 :             segno = currSegNo - wal_keep_segments;
    9616             :     }
    9617             : 
    9618             :     /* don't delete WAL segments newer than the calculated segment */
    9619        3344 :     if (XLogRecPtrIsInvalid(*logSegNo) || segno < *logSegNo)
    9620         274 :         *logSegNo = segno;
    9621        3344 : }
    9622             : 
    9623             : /*
    9624             :  * Write a NEXTOID log record
    9625             :  */
    9626             : void
    9627        1140 : XLogPutNextOid(Oid nextOid)
    9628             : {
    9629        1140 :     XLogBeginInsert();
    9630        1140 :     XLogRegisterData((char *) (&nextOid), sizeof(Oid));
    9631        1140 :     (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
    9632             : 
    9633             :     /*
    9634             :      * We need not flush the NEXTOID record immediately, because any of the
    9635             :      * just-allocated OIDs could only reach disk as part of a tuple insert or
    9636             :      * update that would have its own XLOG record that must follow the NEXTOID
    9637             :      * record.  Therefore, the standard buffer LSN interlock applied to those
    9638             :      * records will ensure no such OID reaches disk before the NEXTOID record
    9639             :      * does.
    9640             :      *
    9641             :      * Note, however, that the above statement only covers state "within" the
    9642             :      * database.  When we use a generated OID as a file or directory name, we
    9643             :      * are in a sense violating the basic WAL rule, because that filesystem
    9644             :      * change may reach disk before the NEXTOID WAL record does.  The impact
    9645             :      * of this is that if a database crash occurs immediately afterward, we
    9646             :      * might after restart re-generate the same OID and find that it conflicts
    9647             :      * with the leftover file or directory.  But since for safety's sake we
    9648             :      * always loop until finding a nonconflicting filename, this poses no real
    9649             :      * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
    9650             :      */
    9651        1140 : }
    9652             : 
    9653             : /*
    9654             :  * Write an XLOG SWITCH record.
    9655             :  *
    9656             :  * Here we just blindly issue an XLogInsert request for the record.
    9657             :  * All the magic happens inside XLogInsert.
    9658             :  *
    9659             :  * The return value is either the end+1 address of the switch record,
    9660             :  * or the end+1 address of the prior segment if we did not need to
    9661             :  * write a switch record because we are already at segment start.
    9662             :  */
    9663             : XLogRecPtr
    9664         344 : RequestXLogSwitch(bool mark_unimportant)
    9665             : {
    9666             :     XLogRecPtr  RecPtr;
    9667             : 
    9668             :     /* XLOG SWITCH has no data */
    9669         344 :     XLogBeginInsert();
    9670             : 
    9671         344 :     if (mark_unimportant)
    9672           0 :         XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
    9673         344 :     RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
    9674             : 
    9675         344 :     return RecPtr;
    9676             : }
    9677             : 
    9678             : /*
    9679             :  * Write a RESTORE POINT record
    9680             :  */
    9681             : XLogRecPtr
    9682           2 : XLogRestorePoint(const char *rpName)
    9683             : {
    9684             :     XLogRecPtr  RecPtr;
    9685             :     xl_restore_point xlrec;
    9686             : 
    9687           2 :     xlrec.rp_time = GetCurrentTimestamp();
    9688           2 :     strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
    9689             : 
    9690           2 :     XLogBeginInsert();
    9691           2 :     XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
    9692             : 
    9693           2 :     RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
    9694             : 
    9695           2 :     ereport(LOG,
    9696             :             (errmsg("restore point \"%s\" created at %X/%X",
    9697             :                     rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
    9698             : 
    9699           2 :     return RecPtr;
    9700             : }
    9701             : 
    9702             : /*
    9703             :  * Check if any of the GUC parameters that are critical for hot standby
    9704             :  * have changed, and update the value in pg_control file if necessary.
                     :  *
                     :  * The current settings are compared with the copies held in ControlFile.
                     :  * When all of them match, nothing is done.  Otherwise an
                     :  * XLOG_PARAMETER_CHANGE record holding the current settings is inserted
                     :  * and flushed, but only if wal_level itself differs or XLogIsNeeded() is
                     :  * true.  In either case every ControlFile copy is then refreshed and
                     :  * UpdateControlFile() is called.
    9705             :  */
    9706             : static void
    9707        1356 : XLogReportParameters(void)
    9708             : {
    9709        1356 :     if (wal_level != ControlFile->wal_level ||
    9710        1210 :         wal_log_hints != ControlFile->wal_log_hints ||
    9711        1138 :         MaxConnections != ControlFile->MaxConnections ||
    9712        1136 :         max_worker_processes != ControlFile->max_worker_processes ||
    9713        1136 :         max_wal_senders != ControlFile->max_wal_senders ||
    9714        1136 :         max_prepared_xacts != ControlFile->max_prepared_xacts ||
    9715        1008 :         max_locks_per_xact != ControlFile->max_locks_per_xact ||
    9716        1008 :         track_commit_timestamp != ControlFile->track_commit_timestamp)
    9717             :     {
    9718             :         /*
    9719             :          * The change in number of backend slots doesn't need to be WAL-logged
    9720             :          * if archiving is not enabled, as you can't start archive recovery
    9721             :          * with wal_level=minimal anyway. We don't really care about the
    9722             :          * values in pg_control either if wal_level=minimal, but seems better
    9723             :          * to keep them up-to-date to avoid confusion.
    9724             :          */
    9725         360 :         if (wal_level != ControlFile->wal_level || XLogIsNeeded())
    9726             :         {
    9727             :             xl_parameter_change xlrec;
    9728             :             XLogRecPtr  recptr;
    9729             : 
    9730         360 :             xlrec.MaxConnections = MaxConnections;
    9731         360 :             xlrec.max_worker_processes = max_worker_processes;
    9732         360 :             xlrec.max_wal_senders = max_wal_senders;
    9733         360 :             xlrec.max_prepared_xacts = max_prepared_xacts;
    9734         360 :             xlrec.max_locks_per_xact = max_locks_per_xact;
    9735         360 :             xlrec.wal_level = wal_level;
    9736         360 :             xlrec.wal_log_hints = wal_log_hints;
    9737         360 :             xlrec.track_commit_timestamp = track_commit_timestamp;
    9738             : 
    9739         360 :             XLogBeginInsert();
    9740         360 :             XLogRegisterData((char *) &xlrec, sizeof(xlrec));
    9741             : 
    9742         360 :             recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
    9743         360 :             XLogFlush(recptr);
    9744             :         }
    9745             : 
    9746         360 :         ControlFile->MaxConnections = MaxConnections;
    9747         360 :         ControlFile->max_worker_processes = max_worker_processes;
    9748         360 :         ControlFile->max_wal_senders = max_wal_senders;
    9749         360 :         ControlFile->max_prepared_xacts = max_prepared_xacts;
    9750         360 :         ControlFile->max_locks_per_xact = max_locks_per_xact;
    9751         360 :         ControlFile->wal_level = wal_level;
    9752         360 :         ControlFile->wal_log_hints = wal_log_hints;
    9753         360 :         ControlFile->track_commit_timestamp = track_commit_timestamp;
    9754         360 :         UpdateControlFile();
    9755             :     }
    9756        1356 : }
    9757             : 
    9758             : /*
    9759             :  * Update full_page_writes in shared memory, and write an
    9760             :  * XLOG_FPW_CHANGE record if necessary.
    9761             :  *
    9762             :  * Note: this function assumes there is no other process running
    9763             :  * concurrently that could update it.
                     :  *
                     :  * Returns immediately when the local fullPageWrites setting already
                     :  * matches the shared copy in XLogCtl->Insert.  Otherwise the shared copy
                     :  * is changed under the exclusive WAL insert lock, inside a critical
                     :  * section.
    9764             :  */
    9765             : void
    9766        1778 : UpdateFullPageWrites(void)
    9767             : {
    9768        1778 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    9769             :     bool        recoveryInProgress;
    9770             : 
    9771             :     /*
    9772             :      * Do nothing if full_page_writes has not been changed.
    9773             :      *
    9774             :      * It's safe to check the shared full_page_writes without the lock,
    9775             :      * because we assume that there is no concurrently running process which
    9776             :      * can update it.
    9777             :      */
    9778        1778 :     if (fullPageWrites == Insert->fullPageWrites)
    9779        1722 :         return;
    9780             : 
    9781             :     /*
    9782             :      * Perform this outside critical section so that the WAL insert
    9783             :      * initialization done by RecoveryInProgress() doesn't trigger an
    9784             :      * assertion failure.
    9785             :      */
    9786          56 :     recoveryInProgress = RecoveryInProgress();
    9787             : 
    9788          56 :     START_CRIT_SECTION();
    9789             : 
    9790             :     /*
    9791             :      * It's always safe to take full page images, even when not strictly
    9792             :      * required, but not the other way round. So if we're setting
    9793             :      * full_page_writes to true, first set it true and then write the WAL
    9794             :      * record. If we're setting it to false, first write the WAL record and
    9795             :      * then set the global flag.
    9796             :      */
    9797          56 :     if (fullPageWrites)
    9798             :     {
    9799          54 :         WALInsertLockAcquireExclusive();
    9800          54 :         Insert->fullPageWrites = true;
    9801          54 :         WALInsertLockRelease();
    9802             :     }
    9803             : 
    9804             :     /*
    9805             :      * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
    9806             :      * full_page_writes during archive recovery, if required.  The record is
                     :      * written only when XLogStandbyInfoActive() is true and recovery was
                     :      * not in progress when checked above; its payload is the new
                     :      * fullPageWrites value.
    9807             :      */
    9808          56 :     if (XLogStandbyInfoActive() && !recoveryInProgress)
    9809             :     {
    9810           0 :         XLogBeginInsert();
    9811           0 :         XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));
    9812             : 
    9813           0 :         XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
    9814             :     }
    9815             : 
    9816          56 :     if (!fullPageWrites)
    9817             :     {
    9818           2 :         WALInsertLockAcquireExclusive();
    9819           2 :         Insert->fullPageWrites = false;
    9820           2 :         WALInsertLockRelease();
    9821             :     }
    9822          56 :     END_CRIT_SECTION();
    9823             : }
    9824             : 
    9825             : /*
    9826             :  * Check that it's OK to switch to new timeline during recovery.
    9827             :  *
    9828             :  * 'lsn' is the address of the shutdown checkpoint record we're about to
    9829             :  * replay. (Currently, timeline can only change at a shutdown checkpoint).
                     :  *
                     :  * 'prevTLI' must equal ThisTimeLineID.  'newTLI' must not be lower than
                     :  * ThisTimeLineID and must appear in expectedTLEs.  Each failed check
                     :  * raises PANIC; the function returns normally only if all checks pass.
    9830             :  */
    9831             : static void
    9832          10 : checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
    9833             : {
    9834             :     /* Check that the record agrees on what the current (old) timeline is */
    9835          10 :     if (prevTLI != ThisTimeLineID)
    9836           0 :         ereport(PANIC,
    9837             :                 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
    9838             :                         prevTLI, ThisTimeLineID)));
    9839             : 
    9840             :     /*
    9841             :      * The new timeline better be in the list of timelines we expect to see,
    9842             :      * according to the timeline history. It should also not decrease.
    9843             :      */
    9844          10 :     if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
    9845           0 :         ereport(PANIC,
    9846             :                 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
    9847             :                         newTLI, ThisTimeLineID)));
    9848             : 
    9849             :     /*
    9850             :      * If we have not yet reached min recovery point, and we're about to
    9851             :      * switch to a timeline greater than the timeline of the min recovery
    9852             :      * point: trouble. After switching to the new timeline, we could not
    9853             :      * possibly visit the min recovery point on the correct timeline anymore.
    9854             :      * This can happen if there is a newer timeline in the archive that
    9855             :      * branched before the timeline the min recovery point is on, and you
    9856             :      * attempt to do PITR to the new timeline.
    9857             :      */
    9858          10 :     if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
    9859           8 :         lsn < minRecoveryPoint &&
    9860           0 :         newTLI > minRecoveryPointTLI)
    9861           0 :         ereport(PANIC,
    9862             :                 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
    9863             :                         newTLI,
    9864             :                         (uint32) (minRecoveryPoint >> 32),
    9865             :                         (uint32) minRecoveryPoint,
    9866             :                         minRecoveryPointTLI)));
    9867             : 
    9868             :     /* Looks good */
    9869          10 : }
    9870             : 
    9871             : /*
    9872             :  * XLOG resource manager's routines
    9873             :  *
    9874             :  * Definitions of info values are in include/catalog/pg_control.h, though
    9875             :  * not all record types are related to control file updates.
                     :  *
                     :  * xlog_redo() replays one record.  It takes the record's info bits with
                     :  * XLR_INFO_MASK cleared and handles the matching XLOG_* type in an
                     :  * if/else-if chain.  XLOG_NOOP, XLOG_SWITCH and XLOG_RESTORE_POINT need
                     :  * no action, and an info value matching none of the branches is silently
                     :  * ignored because the chain has no final else.  The local 'lsn' is the
                     :  * record's EndRecPtr.
    9876             :  */
    9877             : void
    9878        3040 : xlog_redo(XLogReaderState *record)
    9879             : {
    9880        3040 :     uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    9881        3040 :     XLogRecPtr  lsn = record->EndRecPtr;
    9882             : 
    9883             :     /* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */
    9884             :     Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
    9885             :            !XLogRecHasAnyBlockRefs(record));
    9886             : 
    9887        3040 :     if (info == XLOG_NEXTOID)
    9888             :     {
    9889             :         Oid         nextOid;
    9890             : 
    9891             :         /*
    9892             :          * We used to try to take the maximum of ShmemVariableCache->nextOid
    9893             :          * and the recorded nextOid, but that fails if the OID counter wraps
    9894             :          * around.  Since no OID allocation should be happening during replay
    9895             :          * anyway, better to just believe the record exactly.  We still take
    9896             :          * OidGenLock while setting the variable, just in case.
    9897             :          */
    9898         100 :         memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
    9899         100 :         LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
    9900         100 :         ShmemVariableCache->nextOid = nextOid;
    9901         100 :         ShmemVariableCache->oidCount = 0;
    9902         100 :         LWLockRelease(OidGenLock);
    9903             :     }
    9904        2940 :     else if (info == XLOG_CHECKPOINT_SHUTDOWN)
    9905             :     {
    9906             :         CheckPoint  checkPoint;
    9907             : 
    9908          30 :         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
    9909             :         /* In a SHUTDOWN checkpoint, believe the counters exactly */
    9910          30 :         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
    9911          30 :         ShmemVariableCache->nextFullXid = checkPoint.nextFullXid;
    9912          30 :         LWLockRelease(XidGenLock);
    9913          30 :         LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
    9914          30 :         ShmemVariableCache->nextOid = checkPoint.nextOid;
    9915          30 :         ShmemVariableCache->oidCount = 0;
    9916          30 :         LWLockRelease(OidGenLock);
    9917          30 :         MultiXactSetNextMXact(checkPoint.nextMulti,
    9918             :                               checkPoint.nextMultiOffset);
    9919             : 
    9920          30 :         MultiXactAdvanceOldest(checkPoint.oldestMulti,
    9921             :                                checkPoint.oldestMultiDB);
    9922             : 
    9923             :         /*
    9924             :          * No need to set oldestClogXid here as well; it'll be set when we
    9925             :          * redo an xl_clog_truncate if it changed since initialization.
    9926             :          */
    9927          30 :         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
    9928             : 
    9929             :         /*
    9930             :          * If we see a shutdown checkpoint while waiting for an end-of-backup
    9931             :          * record, the backup was canceled and the end-of-backup record will
    9932             :          * never arrive.
    9933             :          */
    9934          30 :         if (ArchiveRecoveryRequested &&
    9935          30 :             !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
    9936           0 :             XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
    9937           0 :             ereport(PANIC,
    9938             :                     (errmsg("online backup was canceled, recovery cannot continue")));
    9939             : 
    9940             :         /*
    9941             :          * If we see a shutdown checkpoint, we know that nothing was running
    9942             :          * on the master at this point. So fake-up an empty running-xacts
    9943             :          * record and use that here and now. Recover additional standby state
    9944             :          * for prepared transactions.
    9945             :          */
    9946          30 :         if (standbyState >= STANDBY_INITIALIZED)
    9947             :         {
    9948             :             TransactionId *xids;
    9949             :             int         nxids;
    9950             :             TransactionId oldestActiveXID;
    9951             :             TransactionId latestCompletedXid;
    9952             :             RunningTransactionsData running;
    9953             : 
    9954          30 :             oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
    9955             : 
    9956             :             /*
    9957             :              * Construct a RunningTransactions snapshot representing a shut
    9958             :              * down server, with only prepared transactions still alive. We're
    9959             :              * never overflowed at this point because all subxids are listed
    9960             :              * with their parent prepared transactions.
    9961             :              */
    9962          30 :             running.xcnt = nxids;
    9963          30 :             running.subxcnt = 0;
    9964          30 :             running.subxid_overflow = false;
    9965          30 :             running.nextXid = XidFromFullTransactionId(checkPoint.nextFullXid);
    9966          30 :             running.oldestRunningXid = oldestActiveXID;
    9967          30 :             latestCompletedXid = XidFromFullTransactionId(checkPoint.nextFullXid);
    9968          30 :             TransactionIdRetreat(latestCompletedXid);
    9969             :             Assert(TransactionIdIsNormal(latestCompletedXid));
    9970          30 :             running.latestCompletedXid = latestCompletedXid;
    9971          30 :             running.xids = xids;
    9972             : 
    9973          30 :             ProcArrayApplyRecoveryInfo(&running);
    9974             : 
    9975          30 :             StandbyRecoverPreparedTransactions();
    9976             :         }
    9977             : 
    9978             :         /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
    9979          30 :         ControlFile->checkPointCopy.nextFullXid = checkPoint.nextFullXid;
    9980             : 
    9981             :         /* Update shared-memory copy of checkpoint XID/epoch */
    9982          30 :         SpinLockAcquire(&XLogCtl->info_lck);
    9983          30 :         XLogCtl->ckptFullXid = checkPoint.nextFullXid;
    9984          30 :         SpinLockRelease(&XLogCtl->info_lck);
    9985             : 
    9986             :         /*
    9987             :          * We should've already switched to the new TLI before replaying this
    9988             :          * record.
    9989             :          */
    9990          30 :         if (checkPoint.ThisTimeLineID != ThisTimeLineID)
    9991           0 :             ereport(PANIC,
    9992             :                     (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
    9993             :                             checkPoint.ThisTimeLineID, ThisTimeLineID)));
    9994             : 
    9995          30 :         RecoveryRestartPoint(&checkPoint);
    9996             :     }
    9997        2910 :     else if (info == XLOG_CHECKPOINT_ONLINE)
    9998             :     {
    9999             :         CheckPoint  checkPoint;
   10000             : 
   10001         172 :         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
   10002             :         /* In an ONLINE checkpoint, treat the XID counter as a minimum */
   10003         172 :         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
   10004         172 :         if (FullTransactionIdPrecedes(ShmemVariableCache->nextFullXid,
   10005             :                                       checkPoint.nextFullXid))
   10006           0 :             ShmemVariableCache->nextFullXid = checkPoint.nextFullXid;
   10007         172 :         LWLockRelease(XidGenLock);
   10008             : 
   10009             :         /*
   10010             :          * We ignore the nextOid counter in an ONLINE checkpoint, preferring
   10011             :          * to track OID assignment through XLOG_NEXTOID records.  The nextOid
   10012             :          * counter is from the start of the checkpoint and might well be stale
   10013             :          * compared to later XLOG_NEXTOID records.  We could try to take the
   10014             :          * maximum of the nextOid counter and our latest value, but since
   10015             :          * there's no particular guarantee about the speed with which the OID
   10016             :          * counter wraps around, that's a risky thing to do.  In any case,
   10017             :          * users of the nextOid counter are required to avoid assignment of
   10018             :          * duplicates, so that a somewhat out-of-date value should be safe.
   10019             :          */
   10020             : 
   10021             :         /* Handle multixact */
   10022         172 :         MultiXactAdvanceNextMXact(checkPoint.nextMulti,
   10023             :                                   checkPoint.nextMultiOffset);
   10024             : 
   10025             :         /*
   10026             :          * NB: This may perform multixact truncation when replaying WAL
   10027             :          * generated by an older primary.
   10028             :          */
   10029         172 :         MultiXactAdvanceOldest(checkPoint.oldestMulti,
   10030             :                                checkPoint.oldestMultiDB);
   10031         172 :         if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
   10032             :                                   checkPoint.oldestXid))
   10033           0 :             SetTransactionIdLimit(checkPoint.oldestXid,
   10034             :                                   checkPoint.oldestXidDB);
   10035             :         /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
   10036         172 :         ControlFile->checkPointCopy.nextFullXid = checkPoint.nextFullXid;
   10037             : 
   10038             :         /* Update shared-memory copy of checkpoint XID/epoch */
   10039         172 :         SpinLockAcquire(&XLogCtl->info_lck);
   10040         172 :         XLogCtl->ckptFullXid = checkPoint.nextFullXid;
   10041         172 :         SpinLockRelease(&XLogCtl->info_lck);
   10042             : 
   10043             :         /* TLI should not change in an on-line checkpoint */
   10044         172 :         if (checkPoint.ThisTimeLineID != ThisTimeLineID)
   10045           0 :             ereport(PANIC,
   10046             :                     (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
   10047             :                             checkPoint.ThisTimeLineID, ThisTimeLineID)));
   10048             : 
   10049         172 :         RecoveryRestartPoint(&checkPoint);
   10050             :     }
   10051        2738 :     else if (info == XLOG_END_OF_RECOVERY)
   10052             :     {
   10053             :         xl_end_of_recovery xlrec;
   10054             : 
   10055          10 :         memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
   10056             : 
   10057             :         /*
   10058             :          * For Hot Standby, we could treat this like a Shutdown Checkpoint,
   10059             :          * but this case is rarer and harder to test, so the benefit doesn't
   10060             :          * outweigh the potential extra cost of maintenance.
   10061             :          */
   10062             : 
   10063             :         /*
   10064             :          * We should've already switched to the new TLI before replaying this
   10065             :          * record.
                     :          *
                     :          * NOTE(review): the message below says 'checkpoint record' although
                     :          * this branch handles XLOG_END_OF_RECOVERY; confirm whether that
                     :          * wording is intended before changing it.
   10066             :          */
   10067          10 :         if (xlrec.ThisTimeLineID != ThisTimeLineID)
   10068           0 :             ereport(PANIC,
   10069             :                     (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
   10070             :                             xlrec.ThisTimeLineID, ThisTimeLineID)));
   10071             :     }
   10072        2728 :     else if (info == XLOG_NOOP)
   10073             :     {
   10074             :         /* nothing to do here */
   10075             :     }
   10076        2728 :     else if (info == XLOG_SWITCH)
   10077             :     {
   10078             :         /* nothing to do here */
   10079             :     }
   10080        2612 :     else if (info == XLOG_RESTORE_POINT)
   10081             :     {
   10082             :         /* nothing to do here */
   10083             :     }
   10084        2610 :     else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
   10085             :     {
   10086             :         /*
   10087             :          * Full-page image (FPI) records contain nothing else but a backup
   10088             :          * block (or multiple backup blocks). Every block reference must
   10089             :          * include a full-page image - otherwise there would be no point in
   10090             :          * this record.
   10091             :          *
   10092             :          * No recovery conflicts are generated by these generic records - if a
   10093             :          * resource manager needs to generate conflicts, it has to define a
   10094             :          * separate WAL record type and redo routine.
   10095             :          *
   10096             :          * XLOG_FPI_FOR_HINT records are generated when a page needs to be
   10097             :          * WAL-logged because of a hint bit update. They are only generated
   10098             :          * when checksums are enabled. There is no difference in handling
   10099             :          * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
   10100             :          * code just to distinguish them for statistics purposes.
   10101             :          */
   10102        5334 :         for (uint8 block_id = 0; block_id <= record->max_block_id; block_id++)
   10103             :         {
   10104             :             Buffer      buffer;
   10105             : 
   10106        2810 :             if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
   10107           0 :                 elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
   10108        2810 :             UnlockReleaseBuffer(buffer);
   10109             :         }
   10110             :     }
   10111          86 :     else if (info == XLOG_BACKUP_END)
   10112             :     {
   10113             :         XLogRecPtr  startpoint;
   10114             : 
   10115          68 :         memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
   10116             : 
   10117          68 :         if (ControlFile->backupStartPoint == startpoint)
   10118             :         {
   10119             :             /*
   10120             :              * We have reached the end of base backup, the point where
   10121             :              * pg_stop_backup() was done. The data on disk is now consistent.
   10122             :              * Reset backupStartPoint, and update minRecoveryPoint to make
   10123             :              * sure we don't allow starting up at an earlier point even if
   10124             :              * recovery is stopped and restarted soon after this.
   10125             :              */
   10126          58 :             elog(DEBUG1, "end of backup reached");
   10127             : 
   10128          58 :             LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
   10129             : 
   10130          58 :             if (ControlFile->minRecoveryPoint < lsn)
   10131             :             {
   10132          58 :                 ControlFile->minRecoveryPoint = lsn;
   10133          58 :                 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
   10134             :             }
   10135          58 :             ControlFile->backupStartPoint = InvalidXLogRecPtr;
   10136          58 :             ControlFile->backupEndRequired = false;
   10137          58 :             UpdateControlFile();
   10138             : 
   10139          58 :             LWLockRelease(ControlFileLock);
   10140             :         }
   10141             :     }
   10142          18 :     else if (info == XLOG_PARAMETER_CHANGE)
   10143             :     {
   10144             :         xl_parameter_change xlrec;
   10145             : 
   10146             :         /*
                     :          * Update our copy of the parameters in pg_control.
                     :          * track_commit_timestamp is copied further down, after
                     :          * CommitTsParameterChange() has been called with the new value and
                     :          * the value still stored in ControlFile.
                     :          */
   10147          18 :         memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
   10148             : 
   10149          18 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
   10150          18 :         ControlFile->MaxConnections = xlrec.MaxConnections;
   10151          18 :         ControlFile->max_worker_processes = xlrec.max_worker_processes;
   10152          18 :         ControlFile->max_wal_senders = xlrec.max_wal_senders;
   10153          18 :         ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
   10154          18 :         ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
   10155          18 :         ControlFile->wal_level = xlrec.wal_level;
   10156          18 :         ControlFile->wal_log_hints = xlrec.wal_log_hints;
   10157             : 
   10158             :         /*
   10159             :          * Update minRecoveryPoint to ensure that if recovery is aborted, we
   10160             :          * recover back up to this point before allowing hot standby again.
   10161             :          * This is important if the max_* settings are decreased, to ensure
   10162             :          * you don't run queries against the WAL preceding the change. The
   10163             :          * local copies cannot be updated as long as crash recovery is
   10164             :          * happening and we expect all the WAL to be replayed.
   10165             :          */
   10166          18 :         if (InArchiveRecovery)
   10167             :         {
   10168           4 :             minRecoveryPoint = ControlFile->minRecoveryPoint;
   10169           4 :             minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
   10170             :         }
   10171          18 :         if (minRecoveryPoint != InvalidXLogRecPtr && minRecoveryPoint < lsn)
   10172             :         {
   10173           4 :             ControlFile->minRecoveryPoint = lsn;
   10174           4 :             ControlFile->minRecoveryPointTLI = ThisTimeLineID;
   10175             :         }
   10176             : 
   10177          18 :         CommitTsParameterChange(xlrec.track_commit_timestamp,
   10178          18 :                                 ControlFile->track_commit_timestamp);
   10179          18 :         ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
   10180             : 
   10181          18 :         UpdateControlFile();
   10182          18 :         LWLockRelease(ControlFileLock);
   10183             : 
   10184             :         /* Check to see if any parameter change gives a problem on recovery */
   10185          18 :         CheckRequiredParameterValues();
   10186             :     }
   10187           0 :     else if (info == XLOG_FPW_CHANGE)
   10188             :     {
   10189             :         bool        fpw;
   10190             : 
   10191           0 :         memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
   10192             : 
   10193             :         /*
   10194             :          * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
   10195             :          * do_pg_start_backup() and do_pg_stop_backup() can check whether
   10196             :          * full_page_writes has been disabled during online backup.
   10197             :          */
   10198           0 :         if (!fpw)
   10199             :         {
   10200           0 :             SpinLockAcquire(&XLogCtl->info_lck);
   10201           0 :             if (XLogCtl->lastFpwDisableRecPtr < ReadRecPtr)
   10202           0 :                 XLogCtl->lastFpwDisableRecPtr = ReadRecPtr;
   10203           0 :             SpinLockRelease(&XLogCtl->info_lck);
   10204             :         }
   10205             : 
   10206             :         /* Keep track of full_page_writes */
   10207           0 :         lastFullPageWrites = fpw;
   10208             :     }
   10209        3040 : }
   10210             : 
   10211             : #ifdef WAL_DEBUG
   10212             : 
   10213             : static void
   10214             : xlog_outrec(StringInfo buf, XLogReaderState *record)
   10215             : {
   10216             :     int         block_id;
   10217             : 
   10218             :     appendStringInfo(buf, "prev %X/%X; xid %u",
   10219             :                      (uint32) (XLogRecGetPrev(record) >> 32),
   10220             :                      (uint32) XLogRecGetPrev(record),
   10221             :                      XLogRecGetXid(record));
   10222             : 
   10223             :     appendStringInfo(buf, "; len %u",
   10224             :                      XLogRecGetDataLen(record));
   10225             : 
   10226             :     /*
                     :      * decode block references: for every block id up to max_block_id
                     :      * that has a reference, append its relation (spcNode/dbNode/relNode)
                     :      * and block number.  The fork number is printed only when it is not
                     :      * MAIN_FORKNUM, and an FPW marker is appended when the record has a
                     :      * block image for that id.  Ids without a reference are skipped.
                     :      */
   10227             :     for (block_id = 0; block_id <= record->max_block_id; block_id++)
   10228             :     {
   10229             :         RelFileNode rnode;
   10230             :         ForkNumber  forknum;
   10231             :         BlockNumber blk;
   10232             : 
   10233             :         if (!XLogRecHasBlockRef(record, block_id))
   10234             :             continue;
   10235             : 
   10236             :         XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);
   10237             :         if (forknum != MAIN_FORKNUM)
   10238             :             appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, fork %u, blk %u",
   10239             :                              block_id,
   10240             :                              rnode.spcNode, rnode.dbNode, rnode.relNode,
   10241             :                              forknum,
   10242             :                              blk);
   10243             :         else
   10244             :             appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, blk %u",
   10245             :                              block_id,
   10246             :                              rnode.spcNode, rnode.dbNode, rnode.relNode,
   10247             :                              blk);
   10248             :         if (XLogRecHasBlockImage(record, block_id))
   10249             :             appendStringInfoString(buf, " FPW");
   10250             :     }
   10251             : }
   10252             : #endif                          /* WAL_DEBUG */
   10253             : 
   10254             : /*
   10255             :  * Append to 'buf' a description of the XLogRecord held in 'record'.
   10256             :  *
                        * The text consists of the resource manager's name, a slash, the record
                        * identity returned by that resource manager's rm_identify callback, a
                        * colon and a space, and finally whatever the rm_desc callback appends.
                        * If rm_identify returns NULL, "UNKNOWN (%X)" is used as the identity,
                        * showing the info bits that remain after clearing XLR_INFO_MASK.
                        *
                        * Nothing is returned; the result is accumulated in 'buf'.
   10257             :  */
   10258             : static void
   10259           6 : xlog_outdesc(StringInfo buf, XLogReaderState *record)
   10260             : {
   10261           6 :     RmgrId      rmid = XLogRecGetRmid(record);
   10262           6 :     uint8       info = XLogRecGetInfo(record);
   10263             :     const char *id;
   10264             : 
                           /* "<rmgr name>/" */
   10265           6 :     appendStringInfoString(buf, RmgrTable[rmid].rm_name);
   10266           6 :     appendStringInfoChar(buf, '/');
   10267             : 
                           /* record identity, or a hex dump of the info bits if it is unknown */
   10268           6 :     id = RmgrTable[rmid].rm_identify(info);
   10269           6 :     if (id == NULL)
   10270           0 :         appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
   10271             :     else
   10272           6 :         appendStringInfo(buf, "%s: ", id);
   10273             : 
                           /* let the resource manager append the record-specific details */
   10274           6 :     RmgrTable[rmid].rm_desc(buf, record);
   10275           6 : }
   10276             : 
   10277             : 
   10278             : /*
   10279             :  * Return the (possible) sync flag used for opening a file, depending on the
   10280             :  * value of the GUC wal_sync_method.
                        *
                        * 'method' is one of the SYNC_METHOD_* values.
                        *
                        * The result is 0 when fsync is disabled, and also for the fsync,
                        * fsync_writethrough and fdatasync methods.  For the open-sync methods it
                        * is OPEN_SYNC_FLAG or OPEN_DATASYNC_FLAG respectively, OR'ed with
                        * PG_O_DIRECT when WAL is not needed for archiving/streaming and the
                        * caller is not the walreceiver process.  Any other value of 'method'
                        * is reported with elog(ERROR).
   10281             :  */
   10282             : static int
   10283        9292 : get_sync_bit(int method)
   10284             : {
   10285        9292 :     int         o_direct_flag = 0;
   10286             : 
   10287             :     /* If fsync is disabled, never open in sync mode */
   10288        9292 :     if (!enableFsync)
   10289        9260 :         return 0;
   10290             : 
   10291             :     /*
   10292             :      * Optimize writes by bypassing kernel cache with O_DIRECT when using
   10293             :      * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
   10294             :      * disabled, otherwise the archive command or walsender process will read
   10295             :      * the WAL soon after writing it, which is guaranteed to cause a physical
   10296             :      * read if we bypassed the kernel cache. We also skip the
   10297             :      * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
   10298             :      * reason.
   10299             :      *
   10300             :      * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
   10301             :      * written by walreceiver is normally read by the startup process soon
   10302             :      * after it's written. Also, walreceiver performs unaligned writes, which
   10303             :      * don't work with O_DIRECT, so it is required for correctness too.
   10304             :      */
   10305          32 :     if (!XLogIsNeeded() && !AmWalReceiverProcess())
   10306           0 :         o_direct_flag = PG_O_DIRECT;
   10307             : 
   10308          32 :     switch (method)
   10309             :     {
   10310             :             /*
   10311             :              * enum values for all sync options are defined even if they are
   10312             :              * not supported on the current platform.  But if not, they are
   10313             :              * not included in the enum option array, and therefore will never
   10314             :              * be seen here.
   10315             :              */
   10316          32 :         case SYNC_METHOD_FSYNC:
   10317             :         case SYNC_METHOD_FSYNC_WRITETHROUGH:
   10318             :         case SYNC_METHOD_FDATASYNC:
                                   /* no open flag; o_direct_flag is deliberately not applied here */
   10319          32 :             return 0;
   10320             : #ifdef OPEN_SYNC_FLAG
   10321           0 :         case SYNC_METHOD_OPEN:
   10322           0 :             return OPEN_SYNC_FLAG | o_direct_flag;
   10323             : #endif
   10324             : #ifdef OPEN_DATASYNC_FLAG
   10325           0 :         case SYNC_METHOD_OPEN_DSYNC:
   10326           0 :             return OPEN_DATASYNC_FLAG | o_direct_flag;
   10327             : #endif
   10328           0 :         default:
   10329             :             /* can't happen (unless we are out of sync with option array) */
   10330           0 :             elog(ERROR, "unrecognized wal_sync_method: %d", method);
   10331             :             return 0;           /* silence warning */
   10332             :     }
   10333             : }
   10334             : 
   10335             : /*
   10336             :  * GUC support
                        *
                        * Prepare the currently open WAL segment for a change of sync_method to
                        * 'new_sync_method'.  NOTE(review): presumably this is the assign hook of
                        * the wal_sync_method GUC -- confirm against the GUC tables.
                        *
                        * Nothing is done when the value is unchanged or when no log file is
                        * open.  Otherwise the open file is flushed with pg_fsync(), with a PANIC
                        * on failure, and it is closed if the open() flag computed by
                        * get_sync_bit() differs between the old and the new method.
                        *
                        * 'extra' is not used.  This function does not itself store the new
                        * value into sync_method.
   10337             :  */
   10338             : void
   10339        2184 : assign_xlog_sync_method(int new_sync_method, void *extra)
   10340             : {
   10341        2184 :     if (sync_method != new_sync_method)
   10342             :     {
   10343             :         /*
   10344             :          * To ensure that no blocks escape unsynced, force an fsync on the
   10345             :          * currently open log segment (if any).  Also, if the open flag is
   10346             :          * changing, close the log file so it will be reopened (with new flag
   10347             :          * bit) at next use.
   10348             :          */
   10349           0 :         if (openLogFile >= 0)
   10350             :         {
   10351           0 :             pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
   10352           0 :             if (pg_fsync(openLogFile) != 0)
   10353             :             {
   10354             :                 char        xlogfname[MAXFNAMELEN];
   10355             :                 int         save_errno;
   10356             : 
                                       /*
                                        * Keep pg_fsync()'s errno across the file name
                                        * formatting, so the value is restored before the
                                        * error message below is built.
                                        */
   10357           0 :                 save_errno = errno;
   10358           0 :                 XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo,
   10359             :                              wal_segment_size);
   10360           0 :                 errno = save_errno;
   10361           0 :                 ereport(PANIC,
   10362             :                         (errcode_for_file_access(),
   10363             :                          errmsg("could not fsync file \"%s\": %m", xlogfname)));
   10364             :             }
   10365             : 
   10366           0 :             pgstat_report_wait_end();
                                   /* reopen later only if the open() flag actually changes */
   10367           0 :             if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
   10368           0 :                 XLogFileClose();
   10369             :         }
   10370             :     }
   10371        2184 : }
   10372             : 
   10373             : 
   10374             : /*
   10375             :  * Issue appropriate kind of fsync (if any) for an XLOG output file.
   10376             :  *
   10377             :  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
   10378             :  * 'segno' is for error reporting purposes.
                        *
                        * The call made depends on the current sync_method:
                        * pg_fsync_no_writethrough(), pg_fsync_writethrough() or pg_fdatasync().
                        * Nothing is issued for the two open-sync methods.  The write-through and
                        * fdatasync cases are compiled only when HAVE_FSYNC_WRITETHROUGH and
                        * HAVE_FDATASYNC are defined; without them those methods would reach the
                        * default branch.
                        *
                        * A failed sync call, like an unrecognized sync_method, ends in PANIC.
                        * The whole operation is bracketed by pgstat_report_wait_start() and
                        * pgstat_report_wait_end() with WAIT_EVENT_WAL_SYNC.
   10379             :  */
   10380             : void
   10381      265338 : issue_xlog_fsync(int fd, XLogSegNo segno)
   10382             : {
                           /* set to an error message format only if the sync call fails */
   10383      265338 :     char       *msg = NULL;
   10384             : 
   10385      265338 :     pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
   10386      265338 :     switch (sync_method)
   10387             :     {
   10388           0 :         case SYNC_METHOD_FSYNC:
   10389           0 :             if (pg_fsync_no_writethrough(fd) != 0)
   10390           0 :                 msg = _("could not fsync file \"%s\": %m");
   10391           0 :             break;
   10392             : #ifdef HAVE_FSYNC_WRITETHROUGH
   10393             :         case SYNC_METHOD_FSYNC_WRITETHROUGH:
   10394             :             if (pg_fsync_writethrough(fd) != 0)
   10395             :                 msg = _("could not fsync write-through file \"%s\": %m");
   10396             :             break;
   10397             : #endif
   10398             : #ifdef HAVE_FDATASYNC
   10399      265338 :         case SYNC_METHOD_FDATASYNC:
   10400      265338 :             if (pg_fdatasync(fd) != 0)
   10401           0 :                 msg = _("could not fdatasync file \"%s\": %m");
   10402      265338 :             break;
   10403             : #endif
   10404           0 :         case SYNC_METHOD_OPEN:
   10405             :         case SYNC_METHOD_OPEN_DSYNC:
   10406             :             /* write synced it already */
   10407           0 :             break;
   10408           0 :         default:
   10409           0 :             elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
   10410             :             break;
   10411             :     }
   10412             : 
   10413             :     /* PANIC if failed to fsync */
   10414      265338 :     if (msg)
   10415             :     {
   10416             :         char        xlogfname[MAXFNAMELEN];
                               /* remember the sync call's errno before formatting the file name */
   10417           0 :         int         save_errno = errno;
   10418             : 
   10419           0 :         XLogFileName(xlogfname, ThisTimeLineID, segno,
   10420             :                      wal_segment_size);
   10421           0 :         errno = save_errno;
   10422           0 :         ereport(PANIC,
   10423             :                 (errcode_for_file_access(),
   10424             :                  errmsg(msg, xlogfname)));
   10425             :     }
   10426             : 
   10427      265338 :     pgstat_report_wait_end();
   10428      265338 : }
   10429             : 
   10430             : /*
   10431             :  * do_pg_start_backup
   10432             :  *
   10433             :  * Utility function called at the start of an online backup. It creates the
   10434             :  * necessary starting checkpoint and constructs the backup label file.
   10435             :  *
   10436             :  * There are two kinds of backups: exclusive and non-exclusive. An exclusive
   10437             :  * backup is started with pg_start_backup(), and there can be only one active
   10438             :  * at a time. The backup and tablespace map files of an exclusive backup are
   10439             :  * written to $PGDATA/backup_label and $PGDATA/tablespace_map, and they are
   10440             :  * removed by pg_stop_backup().
   10441             :  *
   10442             :  * A non-exclusive backup is used for the streaming base backups (see
   10443             :  * src/backend/replication/basebackup.c). The difference to exclusive backups
   10444             :  * is that the backup label and tablespace map files are not written to disk.
   10445             :  * Instead, their would-be contents are returned in *labelfile and *tblspcmapfile,
   10446             :  * and the caller is responsible for including them in the backup archive as
   10447             :  * 'backup_label' and 'tablespace_map'. There can be many non-exclusive backups
   10448             :  * active at the same time, and they don't conflict with an exclusive backup
   10449             :  * either.
   10450             :  *
   10451             :  * tablespaces is required only when this function is called while
   10452             :  * the streaming base backup requested by pg_basebackup is running.
   10453             :  * NULL should be specified otherwise.
   10454             :  *
   10455             :  * tblspcmapfile is required mainly for tar format in windows as native windows
   10456             :  * utilities are not able to create symlinks while extracting files from tar.
   10457             :  * However for consistency, the same is used for all platforms.
   10458             :  *
   10459             :  * needtblspcmapfile is true for the cases (exclusive backup and for
   10460             :  * non-exclusive backup only when tar format is used for taking backup)
   10461             :  * when backup needs to generate tablespace_map file, it is used to
   10462             :  * embed escape character before newline character in tablespace path.
   10463             :  *
   10464             :  * Returns the minimum WAL location that must be present to restore from this
   10465             :  * backup, and the corresponding timeline ID in *starttli_p.
   10466             :  *
   10467             :  * Every successfully started non-exclusive backup must be stopped by calling
   10468             :  * do_pg_stop_backup() or do_pg_abort_backup().
   10469             :  *
   10470             :  * It is the responsibility of the caller of this function to verify the
   10471             :  * permissions of the calling user!
   10472             :  */
   10473             : XLogRecPtr
   10474         146 : do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
   10475             :                    StringInfo labelfile, List **tablespaces,
   10476             :                    StringInfo tblspcmapfile, bool infotbssize,
   10477             :                    bool needtblspcmapfile)
   10478             : {
   10479         146 :     bool        exclusive = (labelfile == NULL);
   10480         146 :     bool        backup_started_in_recovery = false;
   10481             :     XLogRecPtr  checkpointloc;
   10482             :     XLogRecPtr  startpoint;
   10483             :     TimeLineID  starttli;
   10484             :     pg_time_t   stamp_time;
   10485             :     char        strfbuf[128];
   10486             :     char        xlogfilename[MAXFNAMELEN];
   10487             :     XLogSegNo   _logSegNo;
   10488             :     struct stat stat_buf;
   10489             :     FILE       *fp;
   10490             : 
   10491         146 :     backup_started_in_recovery = RecoveryInProgress();
   10492             : 
   10493             :     /*
   10494             :      * Currently only non-exclusive backup can be taken during recovery.
   10495             :      */
   10496         146 :     if (backup_started_in_recovery && exclusive)
   10497           0 :         ereport(ERROR,
   10498             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
   10499             :                  errmsg("recovery is in progress"),
   10500             :                  errhint("WAL control functions cannot be executed during recovery.")));
   10501             : 
   10502             :     /*
   10503             :      * During recovery, we don't need to check WAL level. Because, if WAL
   10504             :      * level is not sufficient, it's impossible to get here during recovery.
   10505             :      */
   10506         146 :     if (!backup_started_in_recovery && !XLogIsNeeded())
   10507           0 :         ereport(ERROR,
   10508             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
   10509             :                  errmsg("WAL level not sufficient for making an online backup"),
   10510             :                  errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
   10511             : 
   10512         146 :     if (strlen(backupidstr) > MAXPGPATH)
   10513           0 :         ereport(ERROR,
   10514             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
   10515             :                  errmsg("backup label too long (max %d bytes)",
   10516             :                         MAXPGPATH)));
   10517             : 
   10518             :     /*
   10519             :      * Mark backup active in shared memory.  We must do full-page WAL writes
   10520             :      * during an on-line backup even if not doing so at other times, because
   10521             :      * it's quite possible for the backup dump to obtain a "torn" (partially
   10522             :      * written) copy of a database page if it reads the page concurrently with
   10523             :      * our write to the same page.  This can be fixed as long as the first
   10524             :      * write to the page in the WAL sequence is a full-page write. Hence, we
   10525             :      * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
   10526             :      * are no dirty pages in shared memory that might get dumped while the
   10527             :      * backup is in progress without having a corresponding WAL record.  (Once
   10528             :      * the backup is complete, we need not force full-page writes anymore,
   10529             :      * since we expect that any pages not modified during the backup interval
   10530             :      * must have been correctly captured by the backup.)
   10531             :      *
   10532             :      * Note that forcePageWrites has no effect during an online backup from
   10533             :      * the standby.
   10534             :      *
   10535             :      * We must hold all the insertion locks to change the value of
   10536             :      * forcePageWrites, to ensure adequate interlocking against
   10537             :      * XLogInsertRecord().
   10538             :      */
   10539         146 :     WALInsertLockAcquireExclusive();
   10540         146 :     if (exclusive)
   10541             :     {
   10542             :         /*
   10543             :          * At first, mark that we're now starting an exclusive backup, to
   10544             :          * ensure that there are no other sessions currently running
   10545             :          * pg_start_backup() or pg_stop_backup().
   10546             :          */
   10547           2 :         if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE)
   10548             :         {
   10549           0 :             WALInsertLockRelease();
   10550           0 :             ereport(ERROR,
   10551             :                     (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
   10552             :                      errmsg("a backup is already in progress"),
   10553             :                      errhint("Run pg_stop_backup() and try again.")));
   10554             :         }
   10555           2 :         XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STARTING;
   10556             :     }
   10557             :     else
   10558         144 :         XLogCtl->Insert.nonExclusiveBackups++;
   10559         146 :     XLogCtl->Insert.forcePageWrites = true;
   10560         146 :     WALInsertLockRelease();
   10561             : 
   10562             :     /* Ensure we release forcePageWrites if fail below */
   10563         146 :     PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
   10564             :     {
   10565         146 :         bool        gotUniqueStartpoint = false;
   10566             :         DIR        *tblspcdir;
   10567             :         struct dirent *de;
   10568             :         tablespaceinfo *ti;
   10569             :         int         datadirpathlen;
   10570             : 
   10571             :         /*
   10572             :          * Force an XLOG file switch before the checkpoint, to ensure that the
   10573             :          * WAL segment the checkpoint is written to doesn't contain pages with
   10574             :          * old timeline IDs.  That would otherwise happen if you called
   10575             :          * pg_start_backup() right after restoring from a PITR archive: the
   10576             :          * first WAL segment containing the startup checkpoint has pages in
   10577             :          * the beginning with the old timeline ID.  That can cause trouble at
   10578             :          * recovery: we won't have a history file covering the old timeline if
   10579             :          * pg_wal directory was not included in the base backup and the WAL
   10580             :          * archive was cleared too before starting the backup.
   10581             :          *
   10582             :          * This also ensures that we have emitted a WAL page header that has
   10583             :          * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
   10584             :          * Therefore, if a WAL archiver (such as pglesslog) is trying to
   10585             :          * compress out removable backup blocks, it won't remove any that
   10586             :          * occur after this point.
   10587             :          *
   10588             :          * During recovery, we skip forcing XLOG file switch, which means that
   10589             :          * the backup taken during recovery is not available for the special
   10590             :          * recovery case described above.
   10591             :          */
   10592         146 :         if (!backup_started_in_recovery)
   10593         142 :             RequestXLogSwitch(false);
   10594             : 
   10595             :         do
   10596             :         {
   10597             :             bool        checkpointfpw;
   10598             : 
   10599             :             /*
   10600             :              * Force a CHECKPOINT.  Aside from being necessary to prevent torn
   10601             :              * page problems, this guarantees that two successive backup runs
   10602             :              * will have different checkpoint positions and hence different
   10603             :              * history file names, even if nothing happened in between.
   10604             :              *
   10605             :              * During recovery, establish a restartpoint if possible. We use
   10606             :              * the last restartpoint as the backup starting checkpoint. This
   10607             :              * means that two successive backup runs can have same checkpoint
   10608             :              * positions.
   10609             :              *
   10610             :              * Since the fact that we are executing do_pg_start_backup()
   10611             :              * during recovery means that checkpointer is running, we can use
   10612             :              * RequestCheckpoint() to establish a restartpoint.
   10613             :              *
   10614             :              * We use CHECKPOINT_IMMEDIATE only if requested by user (via
   10615             :              * passing fast = true).  Otherwise this can take awhile.
   10616             :              */
   10617         146 :             RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
   10618             :                               (fast ? CHECKPOINT_IMMEDIATE : 0));
   10619             : 
   10620             :             /*
   10621             :              * Now we need to fetch the checkpoint record location, and also
   10622             :              * its REDO pointer.  The oldest point in WAL that would be needed
   10623             :              * to restore starting from the checkpoint is precisely the REDO
   10624             :              * pointer.
   10625             :              */
   10626         146 :             LWLockAcquire(ControlFileLock, LW_SHARED);
   10627         146 :             checkpointloc = ControlFile->checkPoint;
   10628         146 :             startpoint = ControlFile->checkPointCopy.redo;
   10629         146 :             starttli = ControlFile->checkPointCopy.ThisTimeLineID;
   10630         146 :             checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
   10631         146 :             LWLockRelease(ControlFileLock);
   10632             : 
   10633         146 :             if (backup_started_in_recovery)
   10634             :             {
   10635             :                 XLogRecPtr  recptr;
   10636             : 
   10637             :                 /*
   10638             :                  * Check to see if all WAL replayed during online backup
   10639             :                  * (i.e., since last restartpoint used as backup starting
   10640             :                  * checkpoint) contain full-page writes.
   10641             :                  */
   10642           4 :                 SpinLockAcquire(&XLogCtl->info_lck);
   10643           4 :                 recptr = XLogCtl->lastFpwDisableRecPtr;
   10644           4 :                 SpinLockRelease(&XLogCtl->info_lck);
   10645             : 
   10646           4 :                 if (!checkpointfpw || startpoint <= recptr)
   10647           0 :                     ereport(ERROR,
   10648             :                             (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
   10649             :                              errmsg("WAL generated with full_page_writes=off was replayed "
   10650             :                                     "since last restartpoint"),
   10651             :                              errhint("This means that the backup being taken on the standby "
   10652             :                                      "is corrupt and should not be used. "
   10653             :                                      "Enable full_page_writes and run CHECKPOINT on the master, "
   10654             :                                      "and then try an online backup again.")));
   10655             : 
   10656             :                 /*
   10657             :                  * During recovery, since we don't use the end-of-backup WAL
   10658             :                  * record and don't write the backup history file, the
   10659             :                  * starting WAL location doesn't need to be unique. This means
   10660             :                  * that two base backups started at the same time might use
   10661             :                  * the same checkpoint as starting locations.
   10662             :                  */
   10663           4 :                 gotUniqueStartpoint = true;
   10664             :             }
   10665             : 
   10666             :             /*
   10667             :              * If two base backups are started at the same time (in WAL sender
   10668             :              * processes), we need to make sure that they use different
   10669             :              * checkpoints as starting locations, because we use the starting
   10670             :              * WAL location as a unique identifier for the base backup in the
   10671             :              * end-of-backup WAL record and when we write the backup history
   10672             :              * file. Perhaps it would be better generate a separate unique ID
   10673             :              * for each backup instead of forcing another checkpoint, but
   10674             :              * taking a checkpoint right after another is not that expensive
   10675             :              * either because only few buffers have been dirtied yet.
   10676             :              */
   10677         146 :             WALInsertLockAcquireExclusive();
   10678         146 :             if (XLogCtl->Insert.lastBackupStart < startpoint)
   10679             :             {
   10680         146 :                 XLogCtl->Insert.lastBackupStart = startpoint;
   10681         146 :                 gotUniqueStartpoint = true;
   10682             :             }
   10683         146 :             WALInsertLockRelease();
   10684         146 :         } while (!gotUniqueStartpoint);
   10685             : 
   10686         146 :         XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
   10687         146 :         XLogFileName(xlogfilename, starttli, _logSegNo, wal_segment_size);
   10688             : 
   10689             :         /*
   10690             :          * Construct tablespace_map file
   10691             :          */
   10692         146 :         if (exclusive)
   10693           2 :             tblspcmapfile = makeStringInfo();
   10694             : 
   10695         146 :         datadirpathlen = strlen(DataDir);
   10696             : 
   10697             :         /*
   10698             :          * Report that we are now estimating the total backup size if we're
   10699             :          * streaming base backup as requested by pg_basebackup
   10700             :          */
   10701         146 :         if (tablespaces)
   10702         144 :             pgstat_progress_update_par