LCOV - PostgreSQL 19devel - src/backend/access/transam/xlog.c

LCOV - code coverage report

Current view:	top level - src/backend/access/transam - xlog.c (source / functions)		Hit	Total	Coverage
Test:	PostgreSQL 19devel	Lines:	2200	2480	88.7 %
Date:	2025-07-05 23:18:39	Functions:	119	122	97.5 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * xlog.c
       4             :  *      PostgreSQL write-ahead log manager
       5             :  *
       6             :  * The Write-Ahead Log (WAL) functionality is split into several source
       7             :  * files, in addition to this one:
       8             :  *
       9             :  * xloginsert.c - Functions for constructing WAL records
      10             :  * xlogrecovery.c - WAL recovery and standby code
      11             :  * xlogreader.c - Facility for reading WAL files and parsing WAL records
      12             :  * xlogutils.c - Helper functions for WAL redo routines
      13             :  *
      14             :  * This file contains functions for coordinating database startup and
      15             :  * checkpointing, and managing the write-ahead log buffers when the
      16             :  * system is running.
      17             :  *
      18             :  * StartupXLOG() is the main entry point of the startup process.  It
      19             :  * coordinates database startup, performing WAL recovery, and the
      20             :  * transition from WAL recovery into normal operations.
      21             :  *
      22             :  * XLogInsertRecord() inserts a WAL record into the WAL buffers.  Most
      23             :  * callers should not call this directly, but use the functions in
      24             :  * xloginsert.c to construct the WAL record.  XLogFlush() can be used
      25             :  * to force the WAL to disk.
      26             :  *
      27             :  * In addition to those, there are many other functions for interrogating
      28             :  * the current system state, and for starting/stopping backups.
      29             :  *
      30             :  *
      31             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
      32             :  * Portions Copyright (c) 1994, Regents of the University of California
      33             :  *
      34             :  * src/backend/access/transam/xlog.c
      35             :  *
      36             :  *-------------------------------------------------------------------------
      37             :  */
      38             : 
      39             : #include "postgres.h"
      40             : 
      41             : #include <ctype.h>
      42             : #include <math.h>
      43             : #include <time.h>
      44             : #include <fcntl.h>
      45             : #include <sys/stat.h>
      46             : #include <sys/time.h>
      47             : #include <unistd.h>
      48             : 
      49             : #include "access/clog.h"
      50             : #include "access/commit_ts.h"
      51             : #include "access/heaptoast.h"
      52             : #include "access/multixact.h"
      53             : #include "access/rewriteheap.h"
      54             : #include "access/subtrans.h"
      55             : #include "access/timeline.h"
      56             : #include "access/transam.h"
      57             : #include "access/twophase.h"
      58             : #include "access/xact.h"
      59             : #include "access/xlog_internal.h"
      60             : #include "access/xlogarchive.h"
      61             : #include "access/xloginsert.h"
      62             : #include "access/xlogreader.h"
      63             : #include "access/xlogrecovery.h"
      64             : #include "access/xlogutils.h"
      65             : #include "backup/basebackup.h"
      66             : #include "catalog/catversion.h"
      67             : #include "catalog/pg_control.h"
      68             : #include "catalog/pg_database.h"
      69             : #include "common/controldata_utils.h"
      70             : #include "common/file_utils.h"
      71             : #include "executor/instrument.h"
      72             : #include "miscadmin.h"
      73             : #include "pg_trace.h"
      74             : #include "pgstat.h"
      75             : #include "port/atomics.h"
      76             : #include "postmaster/bgwriter.h"
      77             : #include "postmaster/startup.h"
      78             : #include "postmaster/walsummarizer.h"
      79             : #include "postmaster/walwriter.h"
      80             : #include "replication/origin.h"
      81             : #include "replication/slot.h"
      82             : #include "replication/snapbuild.h"
      83             : #include "replication/walreceiver.h"
      84             : #include "replication/walsender.h"
      85             : #include "storage/bufmgr.h"
      86             : #include "storage/fd.h"
      87             : #include "storage/ipc.h"
      88             : #include "storage/large_object.h"
      89             : #include "storage/latch.h"
      90             : #include "storage/predicate.h"
      91             : #include "storage/proc.h"
      92             : #include "storage/procarray.h"
      93             : #include "storage/reinit.h"
      94             : #include "storage/spin.h"
      95             : #include "storage/sync.h"
      96             : #include "utils/guc_hooks.h"
      97             : #include "utils/guc_tables.h"
      98             : #include "utils/injection_point.h"
      99             : #include "utils/ps_status.h"
     100             : #include "utils/relmapper.h"
     101             : #include "utils/snapmgr.h"
     102             : #include "utils/timeout.h"
     103             : #include "utils/timestamp.h"
     104             : #include "utils/varlena.h"
     105             : 
     106             : #ifdef WAL_DEBUG
     107             : #include "utils/memutils.h"
     108             : #endif
     109             : 
     110             : /* timeline ID to be used when bootstrapping */
     111             : #define BootstrapTimeLineID     1
     112             : 
     113             : /* User-settable parameters */
     114             : int         max_wal_size_mb = 1024; /* 1 GB */
     115             : int         min_wal_size_mb = 80;   /* 80 MB */
     116             : int         wal_keep_size_mb = 0;
     117             : int         XLOGbuffers = -1;
     118             : int         XLogArchiveTimeout = 0;
     119             : int         XLogArchiveMode = ARCHIVE_MODE_OFF;
     120             : char       *XLogArchiveCommand = NULL;
     121             : bool        EnableHotStandby = false;
     122             : bool        fullPageWrites = true;
     123             : bool        wal_log_hints = false;
     124             : int         wal_compression = WAL_COMPRESSION_NONE;
     125             : char       *wal_consistency_checking_string = NULL;
     126             : bool       *wal_consistency_checking = NULL;
     127             : bool        wal_init_zero = true;
     128             : bool        wal_recycle = true;
     129             : bool        log_checkpoints = true;
     130             : int         wal_sync_method = DEFAULT_WAL_SYNC_METHOD;
     131             : int         wal_level = WAL_LEVEL_REPLICA;
     132             : int         CommitDelay = 0;    /* precommit delay in microseconds */
     133             : int         CommitSiblings = 5; /* # concurrent xacts needed to sleep */
     134             : int         wal_retrieve_retry_interval = 5000;
     135             : int         max_slot_wal_keep_size_mb = -1;
     136             : int         wal_decode_buffer_size = 512 * 1024;
     137             : bool        track_wal_io_timing = false;
     138             : 
     139             : #ifdef WAL_DEBUG
     140             : bool        XLOG_DEBUG = false;
     141             : #endif
     142             : 
     143             : int         wal_segment_size = DEFAULT_XLOG_SEG_SIZE;
     144             : 
     145             : /*
     146             :  * Number of WAL insertion locks to use. A higher value allows more insertions
     147             :  * to happen concurrently, but adds some CPU overhead to flushing the WAL,
     148             :  * which needs to iterate all the locks.
     149             :  */
     150             : #define NUM_XLOGINSERT_LOCKS  8
     151             : 
     152             : /*
     153             :  * Max distance from last checkpoint, before triggering a new xlog-based
     154             :  * checkpoint.
     155             :  */
     156             : int         CheckPointSegments;
     157             : 
     158             : /* Estimated distance between checkpoints, in bytes */
     159             : static double CheckPointDistanceEstimate = 0;
     160             : static double PrevCheckPointDistance = 0;
     161             : 
     162             : /*
     163             :  * Track whether there were any deferred checks for custom resource managers
     164             :  * specified in wal_consistency_checking.
     165             :  */
     166             : static bool check_wal_consistency_checking_deferred = false;
     167             : 
     168             : /*
     169             :  * GUC support
     170             :  */
     171             : const struct config_enum_entry wal_sync_method_options[] = {
     172             :     {"fsync", WAL_SYNC_METHOD_FSYNC, false},
     173             : #ifdef HAVE_FSYNC_WRITETHROUGH
     174             :     {"fsync_writethrough", WAL_SYNC_METHOD_FSYNC_WRITETHROUGH, false},
     175             : #endif
     176             :     {"fdatasync", WAL_SYNC_METHOD_FDATASYNC, false},
     177             : #ifdef O_SYNC
     178             :     {"open_sync", WAL_SYNC_METHOD_OPEN, false},
     179             : #endif
     180             : #ifdef O_DSYNC
     181             :     {"open_datasync", WAL_SYNC_METHOD_OPEN_DSYNC, false},
     182             : #endif
     183             :     {NULL, 0, false}
     184             : };
     185             : 
     186             : 
     187             : /*
     188             :  * Although only "on", "off", and "always" are documented,
     189             :  * we accept all the likely variants of "on" and "off".
     190             :  */
     191             : const struct config_enum_entry archive_mode_options[] = {
     192             :     {"always", ARCHIVE_MODE_ALWAYS, false},
     193             :     {"on", ARCHIVE_MODE_ON, false},
     194             :     {"off", ARCHIVE_MODE_OFF, false},
     195             :     {"true", ARCHIVE_MODE_ON, true},
     196             :     {"false", ARCHIVE_MODE_OFF, true},
     197             :     {"yes", ARCHIVE_MODE_ON, true},
     198             :     {"no", ARCHIVE_MODE_OFF, true},
     199             :     {"1", ARCHIVE_MODE_ON, true},
     200             :     {"0", ARCHIVE_MODE_OFF, true},
     201             :     {NULL, 0, false}
     202             : };
     203             : 
     204             : /*
     205             :  * Statistics for current checkpoint are collected in this global struct.
     206             :  * Because only the checkpointer or a stand-alone backend can perform
     207             :  * checkpoints, this will be unused in normal backends.
     208             :  */
     209             : CheckpointStatsData CheckpointStats;
     210             : 
     211             : /*
     212             :  * During recovery, lastFullPageWrites keeps track of full_page_writes that
     213             :  * the replayed WAL records indicate. It's initialized with full_page_writes
     214             :  * that the recovery starting checkpoint record indicates, and then updated
     215             :  * each time XLOG_FPW_CHANGE record is replayed.
     216             :  */
     217             : static bool lastFullPageWrites;
     218             : 
     219             : /*
     220             :  * Local copy of the state tracked by SharedRecoveryState in shared memory.
     221             :  * It is false if SharedRecoveryState is RECOVERY_STATE_DONE.  True actually
     222             :  * means "not known, need to check the shared state".
     223             :  */
     224             : static bool LocalRecoveryInProgress = true;
     225             : 
     226             : /*
     227             :  * Local state for XLogInsertAllowed():
     228             :  *      1: unconditionally allowed to insert XLOG
     229             :  *      0: unconditionally not allowed to insert XLOG
     230             :  *      -1: must check RecoveryInProgress(); disallow until it is false
     231             :  * Most processes start with -1 and transition to 1 after seeing that recovery
     232             :  * is not in progress.  But we can also force the value for special cases.
     233             :  * The coding in XLogInsertAllowed() depends on the first two of these states
     234             :  * being numerically the same as bool true and false.
     235             :  */
     236             : static int  LocalXLogInsertAllowed = -1;
     237             : 
     238             : /*
     239             :  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
     240             :  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
     241             :  * end+1 of the last record, and is reset when we end a top-level transaction,
     242             :  * or start a new one; so it can be used to tell if the current transaction has
     243             :  * created any XLOG records.
     244             :  *
     245             :  * While in parallel mode, this may not be fully up to date.  When committing,
     246             :  * a transaction can assume this covers all xlog records written either by the
     247             :  * user backend or by any parallel worker which was present at any point during
     248             :  * the transaction.  But when aborting, or when still in parallel mode, other
     249             :  * parallel backends may have written WAL records at later LSNs than the value
     250             :  * stored here.  The parallel leader advances its own copy, when necessary,
     251             :  * in WaitForParallelWorkersToFinish.
     252             :  */
     253             : XLogRecPtr  ProcLastRecPtr = InvalidXLogRecPtr;
     254             : XLogRecPtr  XactLastRecEnd = InvalidXLogRecPtr;
     255             : XLogRecPtr  XactLastCommitEnd = InvalidXLogRecPtr;
     256             : 
     257             : /*
     258             :  * RedoRecPtr is this backend's local copy of the REDO record pointer
     259             :  * (which is almost but not quite the same as a pointer to the most recent
     260             :  * CHECKPOINT record).  We update this from the shared-memory copy,
     261             :  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
     262             :  * hold an insertion lock).  See XLogInsertRecord for details.  We are also
     263             :  * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
     264             :  * see GetRedoRecPtr.
     265             :  *
     266             :  * NB: Code that uses this variable must be prepared not only for the
     267             :  * possibility that it may be arbitrarily out of date, but also for the
     268             :  * possibility that it might be set to InvalidXLogRecPtr. We used to
     269             :  * initialize it as a side effect of the first call to RecoveryInProgress(),
     270             :  * which meant that most code that might use it could assume that it had a
     271             :  * real if perhaps stale value. That's no longer the case.
     272             :  */
     273             : static XLogRecPtr RedoRecPtr;
     274             : 
     275             : /*
     276             :  * doPageWrites is this backend's local copy of (fullPageWrites ||
     277             :  * runningBackups > 0).  It is used together with RedoRecPtr to decide whether
     278             :  * a full-page image of a page needs to be taken.
     279             :  *
     280             :  * NB: Initially this is false, and there's no guarantee that it will be
     281             :  * initialized to any other value before it is first used. Any code that
     282             :  * makes use of it must recheck the value after obtaining a WALInsertLock,
     283             :  * and respond appropriately if it turns out that the previous value wasn't
     284             :  * accurate.
     285             :  */
     286             : static bool doPageWrites;
     287             : 
     288             : /*----------
     289             :  * Shared-memory data structures for XLOG control
     290             :  *
     291             :  * LogwrtRqst indicates a byte position that we need to write and/or fsync
     292             :  * the log up to (all records before that point must be written or fsynced).
     293             :  * The positions already written/fsynced are maintained in logWriteResult
     294             :  * and logFlushResult using atomic access.
     295             :  * In addition to the shared variable, each backend has a private copy of
     296             :  * both in LogwrtResult, which is updated when convenient.
     297             :  *
     298             :  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
     299             :  * (protected by info_lck), but we don't need to cache any copies of it.
     300             :  *
     301             :  * info_lck is only held long enough to read/update the protected variables,
     302             :  * so it's a plain spinlock.  The other locks are held longer (potentially
     303             :  * over I/O operations), so we use LWLocks for them.  These locks are:
     304             :  *
     305             :  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
     306             :  * XLogFlush).
     307             :  *
     308             :  * ControlFileLock: must be held to read/update control file or create
     309             :  * new log file.
     310             :  *
     311             :  *----------
     312             :  */
     313             : 
     314             : typedef struct XLogwrtRqst
     315             : {
     316             :     XLogRecPtr  Write;          /* last byte + 1 to write out */
     317             :     XLogRecPtr  Flush;          /* last byte + 1 to flush */
     318             : } XLogwrtRqst;
     319             : 
     320             : typedef struct XLogwrtResult
     321             : {
     322             :     XLogRecPtr  Write;          /* last byte + 1 written out */
     323             :     XLogRecPtr  Flush;          /* last byte + 1 flushed */
     324             : } XLogwrtResult;
     325             : 
     326             : /*
     327             :  * Inserting to WAL is protected by a small fixed number of WAL insertion
     328             :  * locks. To insert to the WAL, you must hold one of the locks - it doesn't
     329             :  * matter which one. To lock out other concurrent insertions, you must hold
     330             :  * all of them. Each WAL insertion lock consists of a lightweight lock, plus an
     331             :  * indicator of how far the insertion has progressed (insertingAt).
     332             :  *
     333             :  * The insertingAt values are read when a process wants to flush WAL from
     334             :  * the in-memory buffers to disk, to check that all the insertions to the
     335             :  * region the process is about to write out have finished. You could simply
     336             :  * wait for all currently in-progress insertions to finish, but the
     337             :  * insertingAt indicator allows you to ignore insertions to later in the WAL,
     338             :  * so that you only wait for the insertions that are modifying the buffers
     339             :  * you're about to write out.
     340             :  *
     341             :  * This isn't just an optimization. If all the WAL buffers are dirty, an
     342             :  * inserter that's holding a WAL insert lock might need to evict an old WAL
     343             :  * buffer, which requires flushing the WAL. If it's possible for an inserter
     344             :  * to block on another inserter unnecessarily, deadlock can arise when two
     345             :  * inserters holding a WAL insert lock wait for each other to finish their
     346             :  * insertion.
     347             :  *
     348             :  * Small WAL records that don't cross a page boundary never update the value,
     349             :  * the WAL record is just copied to the page and the lock is released. But
     350             :  * to avoid the deadlock-scenario explained above, the indicator is always
     351             :  * updated before sleeping while holding an insertion lock.
     352             :  *
     353             :  * lastImportantAt contains the LSN of the last important WAL record inserted
     354             :  * using a given lock. This value is used to detect if there has been
     355             :  * important WAL activity since the last time some action, like a checkpoint,
     356             :  * was performed - allowing the action to be skipped if not. The LSN is
     357             :  * updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
     358             :  * set. lastImportantAt is never cleared, only overwritten by the LSN of newer
     359             :  * records.  Tracking the WAL activity directly in WALInsertLock has the
     360             :  * advantage of not needing any additional locks to update the value.
     361             :  */
     362             : typedef struct
     363             : {
     364             :     LWLock      lock;       /* the lightweight lock itself */
     365             :     pg_atomic_uint64 insertingAt;   /* how far the insertion has progressed */
     366             :     XLogRecPtr  lastImportantAt;    /* LSN of last important record inserted */
     367             : } WALInsertLock;
     368             : 
     369             : /*
     370             :  * All the WAL insertion locks are allocated as an array in shared memory. We
     371             :  * force the array stride to be a power of 2, which saves a few cycles in
     372             :  * indexing, but more importantly also ensures that individual slots don't
     373             :  * cross cache line boundaries. (Of course, we have to also ensure that the
     374             :  * array start address is suitably aligned.)
     375             :  */
     376             : typedef union WALInsertLockPadded
     377             : {
     378             :     WALInsertLock l;        /* the actual insertion lock */
     379             :     char        pad[PG_CACHE_LINE_SIZE];    /* makes the union at least one cache line in size */
     380             : } WALInsertLockPadded;
     381             : 
     382             : /*
     383             :  * Session status of running backup, used for sanity checks in SQL-callable
     384             :  * functions to start and stop backups.
     385             :  */
     386             : static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
     387             : 
     388             : /*
     389             :  * Shared state data for WAL insertion.
     390             :  */
     391             : typedef struct XLogCtlInsert
     392             : {
     393             :     slock_t     insertpos_lck;  /* protects CurrBytePos and PrevBytePos */
     394             : 
     395             :     /*
     396             :      * CurrBytePos is the end of reserved WAL. The next record will be
     397             :      * inserted at that position. PrevBytePos is the start position of the
     398             :      * previously inserted (or rather, reserved) record - it is copied to the
     399             :      * prev-link of the next record. These are stored as "usable byte
     400             :      * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
     401             :      */
     402             :     uint64      CurrBytePos;
     403             :     uint64      PrevBytePos;
     404             : 
     405             :     /*
     406             :      * Make sure the above heavily-contended spinlock and byte positions are
     407             :      * on their own cache line. In particular, the RedoRecPtr and full page
     408             :      * write variables below should be on a different cache line. They are
     409             :      * read on every WAL insertion, but updated rarely, and we don't want
     410             :      * those reads to steal the cache line containing Curr/PrevBytePos.
     411             :      */
     412             :     char        pad[PG_CACHE_LINE_SIZE];
     413             : 
     414             :     /*
     415             :      * fullPageWrites is the authoritative value used by all backends to
     416             :      * determine whether to write full-page images to WAL. This shared value,
     417             :      * instead of the process-local fullPageWrites, is required because, when
     418             :      * full_page_writes is changed by SIGHUP, we must WAL-log it before it
     419             :      * actually affects WAL-logging by backends.  Checkpointer sets it at startup
     420             :      * or after SIGHUP.
     421             :      *
     422             :      * To read these fields, you must hold an insertion lock. To modify them,
     423             :      * you must hold ALL the locks.
     424             :      */
     425             :     XLogRecPtr  RedoRecPtr;     /* current redo point for insertions */
     426             :     bool        fullPageWrites;
     427             : 
     428             :     /*
     429             :      * runningBackups is a counter indicating the number of backups currently
     430             :      * in progress. lastBackupStart is the latest checkpoint redo location
     431             :      * used as a starting point for an online backup.
     432             :      */
     433             :     int         runningBackups;
     434             :     XLogRecPtr  lastBackupStart;
     435             : 
     436             :     /*
     437             :      * WAL insertion locks.
     438             :      */
     439             :     WALInsertLockPadded *WALInsertLocks;
     440             : } XLogCtlInsert;
     441             : 
     442             : /*
     443             :  * Total shared-memory state for XLOG.
     444             :  */
     445             : typedef struct XLogCtlData
     446             : {
     447             :     XLogCtlInsert Insert;
     448             : 
     449             :     /* Protected by info_lck: */
     450             :     XLogwrtRqst LogwrtRqst;
     451             :     XLogRecPtr  RedoRecPtr;     /* a recent copy of Insert->RedoRecPtr */
     452             :     FullTransactionId ckptFullXid;  /* nextXid of latest checkpoint */
     453             :     XLogRecPtr  asyncXactLSN;   /* LSN of newest async commit/abort */
     454             :     XLogRecPtr  replicationSlotMinLSN;  /* oldest LSN needed by any slot */
     455             : 
     456             :     XLogSegNo   lastRemovedSegNo;   /* latest removed/recycled XLOG segment */
     457             : 
     458             :     /* Fake LSN counter, for unlogged relations. */
     459             :     pg_atomic_uint64 unloggedLSN;
     460             : 
     461             :     /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
     462             :     pg_time_t   lastSegSwitchTime;
     463             :     XLogRecPtr  lastSegSwitchLSN;
     464             : 
     465             :     /* These are accessed using atomics -- info_lck not needed */
     466             :     pg_atomic_uint64 logInsertResult;   /* last byte + 1 inserted to buffers */
     467             :     pg_atomic_uint64 logWriteResult;    /* last byte + 1 written out */
     468             :     pg_atomic_uint64 logFlushResult;    /* last byte + 1 flushed */
     469             : 
     470             :     /*
     471             :      * First initialized page in the cache (first byte position).
     472             :      */
     473             :     XLogRecPtr  InitializedFrom;
     474             : 
     475             :     /*
     476             :      * Latest page in the cache reserved for initialization (last byte
     477             :      * position + 1).
     478             :      *
     479             :      * To change the identity of a buffer, you need to advance
     480             :      * InitializeReserved first.  To change the identity of a buffer that's
     481             :      * still dirty, the old page needs to be written out first, and for that
     482             :      * you need WALWriteLock, and you need to ensure that there are no
     483             :      * in-progress insertions to the page by calling
     484             :      * WaitXLogInsertionsToFinish().
     485             :      */
     486             :     pg_atomic_uint64 InitializeReserved;
     487             : 
     488             :     /*
     489             :      * Latest initialized page in the cache (last byte position + 1).
     490             :      *
     491             :      * InitializedUpTo is updated after the buffer initialization.  After the
     492             :      * update, waiters are notified using InitializedUpToCondVar.
     493             :      */
     494             :     pg_atomic_uint64 InitializedUpTo;
     495             :     ConditionVariable InitializedUpToCondVar;
     496             : 
     497             :     /*
     498             :      * These values do not change after startup, although the pointed-to pages
     499             :      * and xlblocks values certainly do.  xlblocks values are changed
     500             :      * lock-free according to the check for the xlog write position and are
     501             :      * accompanied by changes of InitializeReserved and InitializedUpTo.
     502             :      */
     503             :     char       *pages;          /* buffers for unwritten XLOG pages */
     504             :     pg_atomic_uint64 *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ */
     505             :     int         XLogCacheBlck;  /* highest allocated xlog buffer index */
     506             : 
     507             :     /*
     508             :      * InsertTimeLineID is the timeline into which new WAL is being inserted
     509             :      * and flushed. It is zero during recovery, and does not change once set.
     510             :      *
     511             :      * If we create a new timeline when the system was started up,
     512             :      * PrevTimeLineID is the old timeline's ID that we forked off from.
     513             :      * Otherwise it's equal to InsertTimeLineID.
     514             :      *
     515             :      * We set these fields while holding info_lck. Most code that reads these
     516             :      * values knows that recovery is no longer in progress and so can safely
     517             :      * read the value without a lock, but code that could be run either during
     518             :      * or after recovery can take info_lck while reading these values.
     519             :      */
     520             :     TimeLineID  InsertTimeLineID;
     521             :     TimeLineID  PrevTimeLineID;
     522             : 
     523             :     /*
     524             :      * SharedRecoveryState indicates if we're still in crash or archive
     525             :      * recovery.  Protected by info_lck.
     526             :      */
     527             :     RecoveryState SharedRecoveryState;
     528             : 
     529             :     /*
     530             :      * InstallXLogFileSegmentActive indicates whether the checkpointer should
     531             :      * arrange for future segments by recycling and/or PreallocXlogFiles().
     532             :      * Protected by ControlFileLock.  Only the startup process changes it.  If
     533             :      * true, anyone can use InstallXLogFileSegment().  If false, the startup
     534             :      * process owns the exclusive right to install segments, by reading from
     535             :      * the archive and possibly replacing existing files.
     536             :      */
     537             :     bool        InstallXLogFileSegmentActive;
     538             : 
     539             :     /*
     540             :      * WalWriterSleeping indicates whether the WAL writer is currently in
     541             :      * low-power mode (and hence should be nudged if an async commit occurs).
     542             :      * Protected by info_lck.
     543             :      */
     544             :     bool        WalWriterSleeping;
     545             : 
     546             :     /*
     547             :      * During recovery, we keep a copy of the latest checkpoint record here.
     548             :      * lastCheckPointRecPtr points to start of checkpoint record and
     549             :      * lastCheckPointEndPtr points to end+1 of checkpoint record.  Used by the
     550             :      * checkpointer when it wants to create a restartpoint.
     551             :      *
     552             :      * Protected by info_lck.
     553             :      */
     554             :     XLogRecPtr  lastCheckPointRecPtr;
     555             :     XLogRecPtr  lastCheckPointEndPtr;
     556             :     CheckPoint  lastCheckPoint;
     557             : 
     558             :     /*
     559             :      * lastFpwDisableRecPtr points to the start of the last replayed
     560             :      * XLOG_FPW_CHANGE record indicating that full_page_writes is disabled.
     561             :      */
     562             :     XLogRecPtr  lastFpwDisableRecPtr;
     563             : 
     564             :     slock_t     info_lck;       /* locks shared variables shown above */
     565             : } XLogCtlData;
     566             : 
     567             : /*
     568             :  * Classification of XLogInsertRecord operations (selects its locking path).
     569             :  */
     570             : typedef enum
     571             : {
     572             :     WALINSERT_NORMAL,           /* any other record; one insertion lock */
     573             :     WALINSERT_SPECIAL_SWITCH,   /* XLOG_SWITCH; all insertion locks */
     574             :     WALINSERT_SPECIAL_CHECKPOINT    /* XLOG_CHECKPOINT_REDO; all insertion locks */
     575             : } WalInsertClass;
     576             : 
     577             : static XLogCtlData *XLogCtl = NULL; /* NULL until assigned elsewhere in this file */
     578             : 
     579             : /* a private copy of XLogCtl->Insert.WALInsertLocks, to avoid the indirection */
     580             : static WALInsertLockPadded *WALInsertLocks = NULL;
     581             : 
     582             : /*
     583             :  * We maintain an image of pg_control in shared memory; ControlFile points to it.
     584             :  */
     585             : static ControlFileData *ControlFile = NULL;
     586             : 
     587             : /*
     588             :  * Calculate the amount of space left on the page after 'endptr'; the result
     589             :  * is 0 at an XLOG_BLCKSZ boundary.  'endptr' may be evaluated more than once!
     590             :  */
     591             : #define INSERT_FREESPACE(endptr)    \
     592             :     (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
     593             : 
     594             : /* Advance to the next buffer index, wrapping to 0 after XLogCacheBlck. */
     595             : #define NextBufIdx(idx)     \
     596             :         (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
     597             : 
     598             : /*
     599             :  * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or would
     600             :  * hold if it were in cache, the page containing 'recptr': page number modulo (XLogCacheBlck + 1).
     601             :  */
     602             : #define XLogRecPtrToBufIdx(recptr)  \
     603             :     (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
     604             : 
     605             : /*
     606             :  * The number of bytes in a WAL page usable for WAL data: page size minus the short page header.
     607             :  */
     608             : #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
     609             : 
     610             : /*
     611             :  * Convert values of GUCs measured in megabytes to the equivalent segment count
     612             :  * (a thin wrapper around XLogMBVarToSegs).  Rounds down.
     613             :  */
     614             : #define ConvertToXSegs(x, segsize)  XLogMBVarToSegs((x), (segsize))
     615             : 
     616             : /* The number of bytes in a WAL segment usable for WAL data (a variable, not a constant). */
     617             : static int  UsableBytesInSegment;
     618             : 
     619             : /*
     620             :  * Private, possibly out-of-date copy of shared LogwrtResult; it is refreshed
     621             :  * with RefreshXLogWriteResult().  See discussion above.
     622             :  */
     623             : static XLogwrtResult LogwrtResult = {0, 0};
     624             : 
     625             : /*
     626             :  * Update a local copy ('_target') of shared XLogCtl->log{Write,Flush}Result
     627             :  *
     628             :  * It's critical that Flush always trails Write, so Flush is read first, then
     629             :  * a read barrier is issued, then Write is read.  See also XLogWrite.
     630             :  */
     631             : #define RefreshXLogWriteResult(_target) \
     632             :     do { \
     633             :         _target.Flush = pg_atomic_read_u64(&XLogCtl->logFlushResult); \
     634             :         pg_read_barrier(); \
     635             :         _target.Write = pg_atomic_read_u64(&XLogCtl->logWriteResult); \
     636             :     } while (0)
     637             : 
     638             : /*
     639             :  * openLogFile is -1 or a kernel FD for an open log file segment.
     640             :  * openLogSegNo identifies the segment, and openLogTLI the corresponding TLI.
     641             :  * These variables are only used to write the XLOG, and so will normally refer
     642             :  * to the active segment.
     643             :  *
     644             :  * Note: call Reserve/ReleaseExternalFD to track consumption of this FD.
     645             :  */
     646             : static int  openLogFile = -1;   /* -1 when no FD is held */
     647             : static XLogSegNo openLogSegNo = 0;  /* segment that openLogFile refers to */
     648             : static TimeLineID openLogTLI = 0;   /* timeline of that segment */
     649             : 
     650             : /*
     651             :  * Local copies of equivalent fields in the control file.  When running
     652             :  * crash recovery, LocalMinRecoveryPoint is set to InvalidXLogRecPtr as we
     653             :  * expect to replay all the WAL available, and updateMinRecoveryPoint is
     654             :  * switched to false to prevent any updates while replaying records.
     655             :  * Those values are kept consistent for as long as crash recovery runs.
     656             :  */
     657             : static XLogRecPtr LocalMinRecoveryPoint;
     658             : static TimeLineID LocalMinRecoveryPointTLI;
     659             : static bool updateMinRecoveryPoint = true;  /* false during crash recovery, see above */
     660             : 
     661             : /* State for the WALInsertLockAcquire/Release functions */
     662             : static int  MyLockNo = 0;   /* index into WALInsertLocks[] used by this process */
     663             : static bool holdingAllLocks = false;    /* true while all insertion locks are held */
     664             : 
     665             : #ifdef WAL_DEBUG
     666             : static MemoryContext walDebugCxt = NULL;    /* memory context for WAL_DEBUG output */
     667             : #endif
     668             : 
     669             : static void CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI,
     670             :                                         XLogRecPtr EndOfLog,
     671             :                                         TimeLineID newTLI);
     672             : static void CheckRequiredParameterValues(void);
     673             : static void XLogReportParameters(void);
     674             : static int  LocalSetXLogInsertAllowed(void);
     675             : static void CreateEndOfRecoveryRecord(void);
     676             : static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn,
     677             :                                                   XLogRecPtr pagePtr,
     678             :                                                   TimeLineID newTLI);
     679             : static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
     680             : static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
     681             : static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
     682             : 
     683             : static void AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli,
     684             :                                   bool opportunistic);
     685             : static void XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible);
     686             : static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
     687             :                                    bool find_free, XLogSegNo max_segno,
     688             :                                    TimeLineID tli);
     689             : static void XLogFileClose(void);
     690             : static void PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli);
     691             : static void RemoveTempXlogFiles(void);
     692             : static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr,
     693             :                                XLogRecPtr endptr, TimeLineID insertTLI);
     694             : static void RemoveXlogFile(const struct dirent *segment_de,
     695             :                            XLogSegNo recycleSegNo, XLogSegNo *endlogSegNo,
     696             :                            TimeLineID insertTLI);
     697             : static void UpdateLastRemovedPtr(char *filename);
     698             : static void ValidateXLOGDirectoryStructure(void);
     699             : static void CleanupBackupHistory(void);
     700             : static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
     701             : static bool PerformRecoveryXLogAction(void);
     702             : static void InitControlFile(uint64 sysidentifier, uint32 data_checksum_version);
     703             : static void WriteControlFile(void);
     704             : static void ReadControlFile(void);
     705             : static void UpdateControlFile(void);
     706             : static char *str_time(pg_time_t tnow);
     707             : 
     708             : static int  get_sync_bit(int method);
     709             : /* WAL insertion helpers: space reservation, buffer access, position conversion */
     710             : static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
     711             :                                 XLogRecData *rdata,
     712             :                                 XLogRecPtr StartPos, XLogRecPtr EndPos,
     713             :                                 TimeLineID tli);
     714             : static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
     715             :                                       XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
     716             : static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
     717             :                               XLogRecPtr *PrevPtr);
     718             : static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
     719             : static char *GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli);
     720             : static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
     721             : static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
     722             : static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
     723             : /* WAL insertion lock primitives */
     724             : static void WALInsertLockAcquire(void);
     725             : static void WALInsertLockAcquireExclusive(void);
     726             : static void WALInsertLockRelease(void);
     727             : static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
     728             : 
     729             : /*
     730             :  * Insert an XLOG record represented by an already-constructed chain of data
     731             :  * chunks.  This is a low-level routine; to construct the WAL record header
     732             :  * and data, use the higher-level routines in xloginsert.c.
     733             :  *
     734             :  * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
     735             :  * WAL record applies to, that were not included in the record as full page
     736             :  * images.  If fpw_lsn <= RedoRecPtr, the function does not perform the
     737             :  * insertion and returns InvalidXLogRecPtr.  The caller can then recalculate
     738             :  * which pages need a full-page image, and retry.  If fpw_lsn is invalid, the
     739             :  * record is always inserted.
     740             :  *
     741             :  * 'flags' gives more in-depth control on the record being inserted. See
     742             :  * XLogSetRecordFlags() for details.
     743             :  *
     744             :  * 'topxid_included' tells whether the top-transaction id is logged along with
     745             :  * current subtransaction. See XLogRecordAssemble().
     746             :  *
     747             :  * The first XLogRecData in the chain must be for the record header, and its
     748             :  * data must be MAXALIGNed.  XLogInsertRecord fills in the xl_prev and
     749             :  * xl_crc fields in the header, the rest of the header must already be filled
     750             :  * by the caller.
     751             :  *
     752             :  * Returns XLOG pointer to end of record (beginning of next record).
     753             :  * This can be used as LSN for data pages affected by the logged action.
     754             :  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
     755             :  * before the data page can be written out.  This implements the basic
     756             :  * WAL rule "write the log before the data".)
     757             :  */
     758             : XLogRecPtr
     759    29249358 : XLogInsertRecord(XLogRecData *rdata,
     760             :                  XLogRecPtr fpw_lsn,
     761             :                  uint8 flags,
     762             :                  int num_fpi,
     763             :                  bool topxid_included)
     764             : {
     765    29249358 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
     766             :     pg_crc32c   rdata_crc;
     767             :     bool        inserted;
     768    29249358 :     XLogRecord *rechdr = (XLogRecord *) rdata->data;
     769    29249358 :     uint8       info = rechdr->xl_info & ~XLR_INFO_MASK;
     770    29249358 :     WalInsertClass class = WALINSERT_NORMAL;
     771             :     XLogRecPtr  StartPos;
     772             :     XLogRecPtr  EndPos;
     773    29249358 :     bool        prevDoPageWrites = doPageWrites;
     774             :     TimeLineID  insertTLI;
     775             : 
     776             :     /* Does this record type require special handling? */
     777    29249358 :     if (unlikely(rechdr->xl_rmid == RM_XLOG_ID))
     778             :     {
     779      440934 :         if (info == XLOG_SWITCH)
     780        1470 :             class = WALINSERT_SPECIAL_SWITCH;
     781      439464 :         else if (info == XLOG_CHECKPOINT_REDO)
     782        1800 :             class = WALINSERT_SPECIAL_CHECKPOINT;
     783             :     }
     784             : 
     785             :     /* we assume that all of the record header is in the first chunk */
     786             :     Assert(rdata->len >= SizeOfXLogRecord);
     787             : 
     788             :     /* cross-check on whether we should be here or not */
     789    29249358 :     if (!XLogInsertAllowed())
     790           0 :         elog(ERROR, "cannot make new WAL entries during recovery");
     791             : 
     792             :     /*
     793             :      * Given that we're not in recovery, InsertTimeLineID is set and can't
     794             :      * change, so we can read it without a lock.
     795             :      */
     796    29249358 :     insertTLI = XLogCtl->InsertTimeLineID;
     797             : 
     798             :     /*----------
     799             :      *
     800             :      * We have now done all the preparatory work we can without holding a
     801             :      * lock or modifying shared state. From here on, inserting the new WAL
     802             :      * record to the shared WAL buffer cache is a two-step process:
     803             :      *
     804             :      * 1. Reserve the right amount of space from the WAL. The current head of
     805             :      *    reserved space is kept in Insert->CurrBytePos, and is protected by
     806             :      *    insertpos_lck.
     807             :      *
     808             :      * 2. Copy the record to the reserved WAL space. This involves finding the
     809             :      *    correct WAL buffer containing the reserved space, and copying the
     810             :      *    record in place. This can be done concurrently in multiple processes.
     811             :      *
     812             :      * To keep track of which insertions are still in-progress, each concurrent
     813             :      * inserter acquires an insertion lock. In addition to just indicating that
     814             :      * an insertion is in progress, the lock tells others how far the inserter
     815             :      * has progressed. There is a small fixed number of insertion locks,
     816             :      * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
     817             :      * boundary, it updates the value stored in the lock to how far it has
     818             :      * inserted, to allow the previous buffer to be flushed.
     819             :      *
     820             :      * Holding onto an insertion lock also protects RedoRecPtr and
     821             :      * fullPageWrites from changing until the insertion is finished.
     822             :      *
     823             :      * Step 2 can usually be done completely in parallel. If the required WAL
     824             :      * page is not initialized yet, you have to go through AdvanceXLInsertBuffer,
     825             :      * which will ensure it is initialized. But the WAL writer tries to do that
     826             :      * ahead of insertions to keep that from happening in the critical path.
     827             :      *
     828             :      *----------
     829             :      */
     830    29249358 :     START_CRIT_SECTION();
     831             : 
     832    29249358 :     if (likely(class == WALINSERT_NORMAL))
     833             :     {
     834    29246088 :         WALInsertLockAcquire();
     835             : 
     836             :         /*
     837             :          * Check to see if my copy of RedoRecPtr is out of date. If so, may
     838             :          * have to go back and have the caller recompute everything. This can
     839             :          * only happen just after a checkpoint, so it's better to be slow in
     840             :          * this case and fast otherwise.
     841             :          *
     842             :          * Also check to see if fullPageWrites was just turned on or there's a
     843             :          * running backup (which forces full-page writes); if we weren't
     844             :          * already doing full-page writes then go back and recompute.
     845             :          *
     846             :          * If we aren't doing full-page writes then RedoRecPtr doesn't
     847             :          * actually affect the contents of the XLOG record, so we'll update
     848             :          * our local copy but not force a recomputation.  (If doPageWrites was
     849             :          * just turned off, we could recompute the record without full pages,
     850             :          * but we choose not to bother.)
     851             :          */
     852    29246088 :         if (RedoRecPtr != Insert->RedoRecPtr)
     853             :         {
     854             :             Assert(RedoRecPtr < Insert->RedoRecPtr);
     855       13542 :             RedoRecPtr = Insert->RedoRecPtr;
     856             :         }
     857    29246088 :         doPageWrites = (Insert->fullPageWrites || Insert->runningBackups > 0);
     858             : 
     859    29246088 :         if (doPageWrites &&
     860    28762062 :             (!prevDoPageWrites ||
     861    26398284 :              (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr)))
     862             :         {
     863             :             /*
     864             :              * Oops, some buffer now needs to be backed up that the caller
     865             :              * didn't back up.  Start over.
     866             :              */
     867       14836 :             WALInsertLockRelease();
     868       14836 :             END_CRIT_SECTION();
     869       14836 :             return InvalidXLogRecPtr;
     870             :         }
     871             : 
     872             :         /*
     873             :          * Reserve space for the record in the WAL. This also sets the xl_prev
     874             :          * pointer.
     875             :          */
     876    29231252 :         ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
     877             :                                   &rechdr->xl_prev);
     878             : 
     879             :         /* Normal records are always inserted. */
     880    29231252 :         inserted = true;
     881             :     }
     882        3270 :     else if (class == WALINSERT_SPECIAL_SWITCH)
     883             :     {
     884             :         /*
     885             :          * In order to insert an XLOG_SWITCH record, we need to hold all of
     886             :          * the WAL insertion locks, not just one, so that no one else can
     887             :          * begin inserting a record until we've figured out how much space
     888             :          * remains in the current WAL segment and claimed all of it.
     889             :          *
     890             :          * Nonetheless, this case is simpler than the normal case handled
     891             :          * above, which must check for changes in doPageWrites and RedoRecPtr.
     892             :          * Those checks are only needed for records that can contain buffer
     893             :          * references, and an XLOG_SWITCH record never does.
     894             :          */
     895             :         Assert(fpw_lsn == InvalidXLogRecPtr);
     896        1470 :         WALInsertLockAcquireExclusive();
     897        1470 :         inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
     898             :     }
     899             :     else
     900             :     {
     901             :         Assert(class == WALINSERT_SPECIAL_CHECKPOINT);
     902             : 
     903             :         /*
     904             :          * We need to update both the local and shared copies of RedoRecPtr,
     905             :          * which means that we need to hold all the WAL insertion locks.
     906             :          * However, there can't be any buffer references, so as above, we need
     907             :          * not check RedoRecPtr before inserting the record; we just need to
     908             :          * update it afterwards.
     909             :          */
     910             :         Assert(fpw_lsn == InvalidXLogRecPtr);
     911        1800 :         WALInsertLockAcquireExclusive();
     912        1800 :         ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
     913             :                                   &rechdr->xl_prev);
     914        1800 :         RedoRecPtr = Insert->RedoRecPtr = StartPos;
     915        1800 :         inserted = true;
     916             :     }
     917             : 
     918    29234522 :     if (inserted)
     919             :     {
     920             :         /*
     921             :          * Now that xl_prev has been filled in, calculate CRC of the record
     922             :          * header.
     923             :          */
     924    29234406 :         rdata_crc = rechdr->xl_crc;
     925    29234406 :         COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
     926    29234406 :         FIN_CRC32C(rdata_crc);
     927    29234406 :         rechdr->xl_crc = rdata_crc;
     928             : 
     929             :         /*
     930             :          * All the record data, including the header, is now ready to be
     931             :          * inserted. Copy the record in the space reserved.
     932             :          */
     933    29234406 :         CopyXLogRecordToWAL(rechdr->xl_tot_len,
     934             :                             class == WALINSERT_SPECIAL_SWITCH, rdata,
     935             :                             StartPos, EndPos, insertTLI);
     936             : 
     937             :         /*
     938             :          * Unless record is flagged as not important, update LSN of last
     939             :          * important record in the current slot. When holding all locks, just
     940             :          * update the first one.
     941             :          */
     942    29234406 :         if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
     943             :         {
     944    29054990 :             int         lockno = holdingAllLocks ? 0 : MyLockNo;
     945             : 
     946    29054990 :             WALInsertLocks[lockno].l.lastImportantAt = StartPos;
     947             :         }
     948             :     }
     949             :     else
     950             :     {
     951             :         /*
     952             :          * This was an xlog-switch record, but the current insert location was
     953             :          * already exactly at the beginning of a segment, so there was no need
     954             :          * to do anything.
     955             :          */
     956             :     }
     957             : 
     958             :     /*
     959             :      * Done! Let others know that we're finished.
     960             :      */
     961    29234522 :     WALInsertLockRelease();
     962             : 
     963    29234522 :     END_CRIT_SECTION();
     964             : 
     965    29234522 :     MarkCurrentTransactionIdLoggedIfAny();
     966             : 
     967             :     /*
     968             :      * Mark the top transaction id as logged (if needed) so that we do not try
     969             :      * to log it again with the next WAL record in the current subtransaction.
     970             :      */
     971    29234522 :     if (topxid_included)
     972         438 :         MarkSubxactTopXidLogged();
     973             : 
     974             :     /*
     975             :      * Update shared LogwrtRqst.Write, if we crossed page boundary.
     976             :      */
     977    29234522 :     if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
     978             :     {
     979     3311116 :         SpinLockAcquire(&XLogCtl->info_lck);
     980             :         /* advance global request to include new block(s) */
     981     3311116 :         if (XLogCtl->LogwrtRqst.Write < EndPos)
     982     3200340 :             XLogCtl->LogwrtRqst.Write = EndPos;
     983     3311116 :         SpinLockRelease(&XLogCtl->info_lck);
     984     3311116 :         RefreshXLogWriteResult(LogwrtResult);
     985             :     }
     986             : 
     987             :     /*
     988             :      * If this was an XLOG_SWITCH record, flush the record and the empty
     989             :      * padding space that fills the rest of the segment, and perform
     990             :      * end-of-segment actions (eg, notifying archiver).
     991             :      */
     992    29234522 :     if (class == WALINSERT_SPECIAL_SWITCH)
     993             :     {
     994             :         TRACE_POSTGRESQL_WAL_SWITCH();
     995        1470 :         XLogFlush(EndPos);
     996             : 
     997             :         /*
     998             :          * Even though we reserved the rest of the segment for us, which is
     999             :          * reflected in EndPos, we return a pointer to just the end of the
    1000             :          * xlog-switch record.
    1001             :          */
    1002        1470 :         if (inserted)
    1003             :         {
    1004        1354 :             EndPos = StartPos + SizeOfXLogRecord;
    1005        1354 :             if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
    1006             :             {
    1007           2 :                 uint64      offset = XLogSegmentOffset(EndPos, wal_segment_size);
    1008             : 
    1009           2 :                 if (offset == EndPos % XLOG_BLCKSZ)
    1010           0 :                     EndPos += SizeOfXLogLongPHD;
    1011             :                 else
    1012           2 :                     EndPos += SizeOfXLogShortPHD;
    1013             :             }
    1014             :         }
    1015             :     }
    1016             : 
    1017             : #ifdef WAL_DEBUG
    1018             :     if (XLOG_DEBUG)
    1019             :     {
    1020             :         static XLogReaderState *debug_reader = NULL;
    1021             :         XLogRecord *record;
    1022             :         DecodedXLogRecord *decoded;
    1023             :         StringInfoData buf;
    1024             :         StringInfoData recordBuf;
    1025             :         char       *errormsg = NULL;
    1026             :         MemoryContext oldCxt;
    1027             : 
    1028             :         oldCxt = MemoryContextSwitchTo(walDebugCxt);
    1029             : 
    1030             :         initStringInfo(&buf);
    1031             :         appendStringInfo(&buf, "INSERT @ %X/%X: ", LSN_FORMAT_ARGS(EndPos));
    1032             : 
    1033             :         /*
    1034             :          * We have to piece together the WAL record data from the XLogRecData
    1035             :          * entries, so that we can pass it to the rm_desc function as one
    1036             :          * contiguous chunk.
    1037             :          */
    1038             :         initStringInfo(&recordBuf);
    1039             :         for (; rdata != NULL; rdata = rdata->next)
    1040             :             appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);
    1041             : 
    1042             :         /* We also need temporary space to decode the record. */
    1043             :         record = (XLogRecord *) recordBuf.data;
    1044             :         decoded = (DecodedXLogRecord *)
    1045             :             palloc(DecodeXLogRecordRequiredSpace(record->xl_tot_len));
    1046             : 
    1047             :         if (!debug_reader)
    1048             :             debug_reader = XLogReaderAllocate(wal_segment_size, NULL,
    1049             :                                               XL_ROUTINE(.page_read = NULL,
    1050             :                                                          .segment_open = NULL,
    1051             :                                                          .segment_close = NULL),
    1052             :                                               NULL);
    1053             :         if (!debug_reader)
    1054             :         {
    1055             :             appendStringInfoString(&buf, "error decoding record: out of memory while allocating a WAL reading processor");
    1056             :         }
    1057             :         else if (!DecodeXLogRecord(debug_reader,
    1058             :                                    decoded,
    1059             :                                    record,
    1060             :                                    EndPos,
    1061             :                                    &errormsg))
    1062             :         {
    1063             :             appendStringInfo(&buf, "error decoding record: %s",
    1064             :                              errormsg ? errormsg : "no error message");
    1065             :         }
    1066             :         else
    1067             :         {
    1068             :             appendStringInfoString(&buf, " - ");
    1069             : 
    1070             :             debug_reader->record = decoded;
    1071             :             xlog_outdesc(&buf, debug_reader);
    1072             :             debug_reader->record = NULL;
    1073             :         }
    1074             :         elog(LOG, "%s", buf.data);
    1075             : 
    1076             :         pfree(decoded);
    1077             :         pfree(buf.data);
    1078             :         pfree(recordBuf.data);
    1079             :         MemoryContextSwitchTo(oldCxt);
    1080             :     }
    1081             : #endif
    1082             : 
    1083             :     /*
    1084             :      * Update our global variables
    1085             :      */
    1086    29234522 :     ProcLastRecPtr = StartPos;
    1087    29234522 :     XactLastRecEnd = EndPos;
    1088             : 
    1089             :     /* Report WAL traffic to the instrumentation. */
    1090    29234522 :     if (inserted)
    1091             :     {
    1092    29234406 :         pgWalUsage.wal_bytes += rechdr->xl_tot_len;
    1093    29234406 :         pgWalUsage.wal_records++;
    1094    29234406 :         pgWalUsage.wal_fpi += num_fpi;
    1095             :     }
    1096             : 
    1097    29234522 :     return EndPos;
    1098             : }
    1099             : 
    1100             : /*
    1101             :  * Reserves the right amount of space for a record of given size from the WAL.
    1102             :  * *StartPos is set to the beginning of the reserved section, *EndPos to
    1103             :  * its end+1. *PrevPtr is set to the beginning of the previous record; it is
    1104             :  * used to set the xl_prev of this record.
                     :  *
                     :  * 'size' is rounded up with MAXALIGN before the reservation is made.  Only
                     :  * the read and update of CurrBytePos/PrevBytePos happen while
                     :  * insertpos_lck is held; the conversions of the byte positions into
                     :  * XLogRecPtrs are done after the spinlock has been released.  All three
                     :  * output pointers are always written.
    1105             :  *
    1106             :  * This is the performance critical part of XLogInsert that must be serialized
    1107             :  * across backends. The rest can happen mostly in parallel. Try to keep this
    1108             :  * section as short as possible, insertpos_lck can be heavily contended on a
    1109             :  * busy system.
    1110             :  *
    1111             :  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
    1112             :  * where we actually copy the record to the reserved space.
    1113             :  *
    1114             :  * NB: Testing shows that XLogInsertRecord runs faster if this code is inlined;
    1115             :  * however, because there are two call sites, the compiler is reluctant to
    1116             :  * inline. We use pg_attribute_always_inline here to try to convince it.
    1117             :  */
    1118             : static pg_attribute_always_inline void
    1119    29233052 : ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
    1120             :                           XLogRecPtr *PrevPtr)
    1121             : {
    1122    29233052 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    1123             :     uint64      startbytepos;
    1124             :     uint64      endbytepos;
    1125             :     uint64      prevbytepos;
    1126             : 
    1127    29233052 :     size = MAXALIGN(size);
    1128             : 
    1129             :     /* All (non xlog-switch) records should contain data. */
    1130             :     Assert(size > SizeOfXLogRecord);
    1131             : 
    1132             :     /*
    1133             :      * The duration the spinlock needs to be held is minimized by minimizing
    1134             :      * the calculations that have to be done while holding the lock. The
    1135             :      * current tip of reserved WAL is kept in CurrBytePos, as a byte position
    1136             :      * that only counts "usable" bytes in WAL, that is, it excludes all WAL
    1137             :      * page headers. The mapping between "usable" byte positions and physical
    1138             :      * positions (XLogRecPtrs) can be done outside the locked region, and
    1139             :      * because the usable byte position doesn't include any headers, reserving
    1140             :      * X bytes from WAL is almost as simple as "CurrBytePos += X".
                     :      *
                     :      * Besides advancing CurrBytePos, the start of this reservation is
                     :      * stored in PrevBytePos, and the old PrevBytePos is what we hand back
                     :      * through *PrevPtr.
    1141             :      */
    1142    29233052 :     SpinLockAcquire(&Insert->insertpos_lck);
    1143             : 
    1144    29233052 :     startbytepos = Insert->CurrBytePos;
    1145    29233052 :     endbytepos = startbytepos + size;
    1146    29233052 :     prevbytepos = Insert->PrevBytePos;
    1147    29233052 :     Insert->CurrBytePos = endbytepos;
    1148    29233052 :     Insert->PrevBytePos = startbytepos;
    1149             : 
    1150    29233052 :     SpinLockRelease(&Insert->insertpos_lck);
    1151             : 
    1152    29233052 :     *StartPos = XLogBytePosToRecPtr(startbytepos);
    1153    29233052 :     *EndPos = XLogBytePosToEndRecPtr(endbytepos);
    1154    29233052 :     *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
    1155             : 
    1156             :     /*
    1157             :      * Check that the conversions between "usable byte positions" and
    1158             :      * XLogRecPtrs work consistently in both directions.
                     :      * (*EndPos was produced by XLogBytePosToEndRecPtr, the other two by
                     :      * XLogBytePosToRecPtr; all three must map back to the byte position
                     :      * they were derived from.)
    1159             :      */
    1160             :     Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
    1161             :     Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
    1162             :     Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
    1163    29233052 : }
    1164             : 
    1165             : /*
    1166             :  * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
    1167             :  *
    1168             :  * A log-switch record is handled slightly differently. The rest of the
    1169             :  * segment will be reserved for this insertion, as indicated by the returned
    1170             :  * *EndPos value. However, if we are already at the beginning of the current
    1171             :  * segment, *StartPos and *EndPos are set to the current location without
    1172             :  * reserving any space, and the function returns false.
                     :  *
                     :  * Returns true when space was reserved.  In that case CurrBytePos has been
                     :  * advanced to the segment boundary given by *EndPos, PrevBytePos has been
                     :  * set to the start of the switch record, and *PrevPtr holds the start of
                     :  * the previous record.
                     :  *
                     :  * NB: on the false return path only *StartPos and *EndPos are assigned;
                     :  * *PrevPtr is left untouched, so callers should not rely on it then.
    1173             :  */
    1174             : static bool
    1175        1470 : ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
    1176             : {
    1177        1470 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    1178             :     uint64      startbytepos;
    1179             :     uint64      endbytepos;
    1180             :     uint64      prevbytepos;
    1181        1470 :     uint32      size = MAXALIGN(SizeOfXLogRecord);
    1182             :     XLogRecPtr  ptr;
    1183             :     uint32      segleft;
    1184             : 
    1185             :     /*
    1186             :      * These calculations are a bit heavy-weight to be done while holding a
    1187             :      * spinlock, but since we're holding all the WAL insertion locks, there
    1188             :      * are no other inserters competing for it. GetXLogInsertRecPtr() does
    1189             :      * compete for it, but that's not called very frequently.
    1190             :      */
    1191        1470 :     SpinLockAcquire(&Insert->insertpos_lck);
    1192             : 
    1193        1470 :     startbytepos = Insert->CurrBytePos;
    1194             : 
    1195        1470 :     ptr = XLogBytePosToEndRecPtr(startbytepos);
    1196        1470 :     if (XLogSegmentOffset(ptr, wal_segment_size) == 0)
    1197             :     {
    1198         116 :         SpinLockRelease(&Insert->insertpos_lck);
    1199         116 :         *EndPos = *StartPos = ptr;
    1200         116 :         return false;
    1201             :     }
    1202             : 
    1203        1354 :     endbytepos = startbytepos + size;
    1204        1354 :     prevbytepos = Insert->PrevBytePos;
    1205             : 
    1206        1354 :     *StartPos = XLogBytePosToRecPtr(startbytepos);
    1207        1354 :     *EndPos = XLogBytePosToEndRecPtr(endbytepos);
    1208             : 
    1209        1354 :     segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size);
    1210        1354 :     if (segleft != wal_segment_size)
    1211             :     {
    1212             :         /* consume the rest of the segment (segleft == wal_segment_size would mean *EndPos already sits on a segment boundary) */
    1213        1354 :         *EndPos += segleft;
    1214        1354 :         endbytepos = XLogRecPtrToBytePos(*EndPos);
    1215             :     }
    1216        1354 :     Insert->CurrBytePos = endbytepos;
    1217        1354 :     Insert->PrevBytePos = startbytepos;
    1218             : 
    1219        1354 :     SpinLockRelease(&Insert->insertpos_lck);
    1220             : 
    1221        1354 :     *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
    1222             : 
    1223             :     Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0);
    1224             :     Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
    1225             :     Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
    1226             :     Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
    1227             : 
    1228        1354 :     return true;
    1229             : }
    1230             : 
    1231             : /*
    1232             :  * Subroutine of XLogInsertRecord.  Copies a WAL record to an already-reserved
    1233             :  * area in the WAL.
                     :  *
                     :  * write_len is the number of bytes expected from the rdata chain; this is
                     :  * asserted once the copy loop is done.  The chain is walked through
                     :  * rdata->next, copying rdata->len bytes from rdata->data for every entry.
                     :  * StartPos is where copying begins.  After the copy (followed by alignment
                     :  * of the end position, or by consuming the remainder of the segment when
                     :  * isLogSwitch is true) the position reached must be equal to EndPos;
                     :  * otherwise we PANIC.  tli is handed through to GetXLogBuffer.
    1234             :  */
    1235             : static void
    1236    29234406 : CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
    1237             :                     XLogRecPtr StartPos, XLogRecPtr EndPos, TimeLineID tli)
    1238             : {
    1239             :     char       *currpos;
    1240             :     int         freespace;
    1241             :     int         written;
    1242             :     XLogRecPtr  CurrPos;
    1243             :     XLogPageHeader pagehdr;
    1244             : 
    1245             :     /*
    1246             :      * Get a pointer to the right place in the right WAL buffer to start
    1247             :      * inserting to.
                     :      *
                     :      * Note the two similarly named cursors: CurrPos is the WAL position
                     :      * (XLogRecPtr) and currpos is the matching address in the WAL buffer.
                     :      * They are advanced together below.
    1248             :      */
    1249    29234406 :     CurrPos = StartPos;
    1250    29234406 :     currpos = GetXLogBuffer(CurrPos, tli);
    1251    29234406 :     freespace = INSERT_FREESPACE(CurrPos);
    1252             : 
    1253             :     /*
    1254             :      * there should be enough space for at least the first field (xl_tot_len)
    1255             :      * on this page.
    1256             :      */
    1257             :     Assert(freespace >= sizeof(uint32));
    1258             : 
    1259             :     /* Copy record data */
    1260    29234406 :     written = 0;
    1261   138033430 :     while (rdata != NULL)
    1262             :     {
    1263   108799024 :         const char *rdata_data = rdata->data;
    1264   108799024 :         int         rdata_len = rdata->len;
    1265             : 
    1266   112348976 :         while (rdata_len > freespace)
    1267             :         {
    1268             :             /*
    1269             :              * Write what fits on this page, and continue on the next page.
    1270             :              */
    1271             :             Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
    1272     3549952 :             memcpy(currpos, rdata_data, freespace);
    1273     3549952 :             rdata_data += freespace;
    1274     3549952 :             rdata_len -= freespace;
    1275     3549952 :             written += freespace;
    1276     3549952 :             CurrPos += freespace;
    1277             : 
    1278             :             /*
    1279             :              * Get pointer to beginning of next page, and set the xlp_rem_len
    1280             :              * in the page header. Set XLP_FIRST_IS_CONTRECORD.
    1281             :              *
    1282             :              * It's safe to set the contrecord flag and xlp_rem_len without a
    1283             :              * lock on the page. All the other flags were already set when the
    1284             :              * page was initialized, in AdvanceXLInsertBuffer, and we're the
    1285             :              * only backend that needs to set the contrecord flag.
                     :              *
                     :              * xlp_rem_len is set to the number of bytes of this record that
                     :              * have not been copied yet (write_len - written).
    1286             :              */
    1287     3549952 :             currpos = GetXLogBuffer(CurrPos, tli);
    1288     3549952 :             pagehdr = (XLogPageHeader) currpos;
    1289     3549952 :             pagehdr->xlp_rem_len = write_len - written;
    1290     3549952 :             pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
    1291             : 
    1292             :             /* skip over the page header (long variant on the first page of a segment, short otherwise) */
    1293     3549952 :             if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
    1294             :             {
    1295        2532 :                 CurrPos += SizeOfXLogLongPHD;
    1296        2532 :                 currpos += SizeOfXLogLongPHD;
    1297             :             }
    1298             :             else
    1299             :             {
    1300     3547420 :                 CurrPos += SizeOfXLogShortPHD;
    1301     3547420 :                 currpos += SizeOfXLogShortPHD;
    1302             :             }
    1303     3549952 :             freespace = INSERT_FREESPACE(CurrPos);
    1304             :         }
    1305             : 
    1306             :         Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
    1307   108799024 :         memcpy(currpos, rdata_data, rdata_len);
    1308   108799024 :         currpos += rdata_len;
    1309   108799024 :         CurrPos += rdata_len;
    1310   108799024 :         freespace -= rdata_len;
    1311   108799024 :         written += rdata_len;
    1312             : 
    1313   108799024 :         rdata = rdata->next;
    1314             :     }
    1315             :     Assert(written == write_len);
    1316             : 
    1317             :     /*
    1318             :      * If this was an xlog-switch, it's not enough to write the switch record,
    1319             :      * we also have to consume all the remaining space in the WAL segment.  We
    1320             :      * have already reserved that space, but we need to actually fill it.
    1321             :      */
    1322    29234406 :     if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0)
    1323             :     {
    1324             :         /* An xlog-switch record doesn't contain any data besides the header */
    1325             :         Assert(write_len == SizeOfXLogRecord);
    1326             : 
    1327             :         /* Assert that we did reserve the right amount of space */
    1328             :         Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0);
    1329             : 
    1330             :         /* Use up all the remaining space on the current page */
    1331        1354 :         CurrPos += freespace;
    1332             : 
    1333             :         /*
    1334             :          * Cause all remaining pages in the segment to be flushed, leaving the
    1335             :          * XLog position where it should be, at the start of the next segment.
    1336             :          * We do this one page at a time, to make sure we don't deadlock
    1337             :          * against ourselves if wal_buffers < wal_segment_size.
    1338             :          */
    1339     1207534 :         while (CurrPos < EndPos)
    1340             :         {
    1341             :             /*
    1342             :              * The minimal action to flush the page would be to call
    1343             :              * WALInsertLockUpdateInsertingAt(CurrPos) followed by
    1344             :              * AdvanceXLInsertBuffer(...).  The page would be left initialized
    1345             :              * mostly to zeros, except for the page header (always the short
    1346             :              * variant, as this is never a segment's first page).
    1347             :              *
    1348             :              * The large vistas of zeros are good for compressibility, but the
    1349             :              * headers interrupting them every XLOG_BLCKSZ (with values that
    1350             :              * differ from page to page) are not.  The effect varies with
    1351             :              * compression tool, but bzip2 for instance compresses about an
    1352             :              * order of magnitude worse if those headers are left in place.
    1353             :              *
    1354             :              * Rather than complicating AdvanceXLInsertBuffer itself (which is
    1355             :              * called in heavily-loaded circumstances as well as this lightly-
    1356             :              * loaded one) with variant behavior, we just use GetXLogBuffer
    1357             :              * (which itself calls the two methods we need) to get the pointer
    1358             :              * and zero most of the page.  Then we just zero the page header.
                     :              *
                     :              * The only bytes written explicitly here are the first
                     :              * SizeOfXLogShortPHD bytes of each page.
    1359             :              */
    1360     1206180 :             currpos = GetXLogBuffer(CurrPos, tli);
    1361     4824720 :             MemSet(currpos, 0, SizeOfXLogShortPHD);
    1362             : 
    1363     1206180 :             CurrPos += XLOG_BLCKSZ;
    1364             :         }
    1365             :     }
    1366             :     else
    1367             :     {
    1368             :         /* Align the end position, so that the next record starts aligned */
    1369    29233052 :         CurrPos = MAXALIGN64(CurrPos);
    1370             :     }
    1371             : 
    1372    29234406 :     if (CurrPos != EndPos)
    1373           0 :         ereport(PANIC,
    1374             :                 errcode(ERRCODE_DATA_CORRUPTED),
    1375             :                 errmsg_internal("space reserved for WAL record does not match what was written"));
    1376    29234406 : }
    1377             : 
    1378             : /*
    1379             :  * Acquire a WAL insertion lock, for inserting to WAL.
                     :  *
                     :  * Exactly one of the NUM_XLOGINSERT_LOCKS locks is taken, in LW_EXCLUSIVE
                     :  * mode.  Its index is remembered in MyLockNo, which WALInsertLockRelease
                     :  * and WALInsertLockUpdateInsertingAt use to find the lock again.
    1380             :  */
    1381             : static void
    1382    29246108 : WALInsertLockAcquire(void)
    1383             : {
    1384             :     bool        immed;
    1385             : 
    1386             :     /*
    1387             :      * It doesn't matter which of the WAL insertion locks we acquire, so try
    1388             :      * the one we used last time.  If the system isn't particularly busy, it's
    1389             :      * a good bet that it's still available, and it's good to have some
    1390             :      * affinity to a particular lock so that you don't unnecessarily bounce
    1391             :      * cache lines between processes when there's no contention.
    1392             :      *
    1393             :      * If this is the first time through in this backend, pick a lock
    1394             :      * (semi-)randomly.  This allows the locks to be used evenly if you have a
    1395             :      * lot of very short connections.
                     :      *
                     :      * lockToTry is static, so the choice persists from one call to the
                     :      * next; -1 means that no lock has been chosen yet.
    1396             :      */
    1397             :     static int  lockToTry = -1;
    1398             : 
    1399    29246108 :     if (lockToTry == -1)
    1400       15494 :         lockToTry = MyProcNumber % NUM_XLOGINSERT_LOCKS;
    1401    29246108 :     MyLockNo = lockToTry;
    1402             : 
    1403             :     /*
    1404             :      * The insertingAt value is initially set to 0, as we don't know our
    1405             :      * insert location yet.
    1406             :      */
    1407    29246108 :     immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
    1408    29246108 :     if (!immed)
    1409             :     {
    1410             :         /*
    1411             :          * If we couldn't get the lock immediately, try another lock next
    1412             :          * time.  On a system with more insertion locks than concurrent
    1413             :          * inserters, this causes all the inserters to eventually migrate to a
    1414             :          * lock that no-one else is using.  On a system with more inserters
    1415             :          * than locks, it still helps to distribute the inserters evenly
    1416             :          * across the locks.
                     :          *
                     :          * Only lockToTry is changed here; MyLockNo keeps pointing at the
                     :          * lock that we hold now.
    1417             :          */
    1418       38164 :         lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
    1419             :     }
    1420    29246108 : }
    1421             : 
    1422             : /*
    1423             :  * Acquire all WAL insertion locks, to prevent other backends from inserting
    1424             :  * to WAL.
                     :  *
                     :  * The locks are taken in ascending index order, each in LW_EXCLUSIVE mode,
                     :  * and holdingAllLocks is set so that WALInsertLockRelease and
                     :  * WALInsertLockUpdateInsertingAt know that we hold all of them.
    1425             :  */
    1426             : static void
    1427        8380 : WALInsertLockAcquireExclusive(void)
    1428             : {
    1429             :     int         i;
    1430             : 
    1431             :     /*
    1432             :      * When holding all the locks, all but the last lock's insertingAt
    1433             :      * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
    1434             :      * XLogRecPtr value, to make sure that no-one blocks waiting on those.
    1435             :      */
    1436       67040 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
    1437             :     {
    1438       58660 :         LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
    1439       58660 :         LWLockUpdateVar(&WALInsertLocks[i].l.lock,
    1440       58660 :                         &WALInsertLocks[i].l.insertingAt,
    1441             :                         PG_UINT64_MAX);
    1442             :     }
    1443             :     /* Variable value reset to 0 at release; here i == NUM_XLOGINSERT_LOCKS - 1, i.e. the last lock, whose insertingAt is not touched */
    1444        8380 :     LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
    1445             : 
    1446        8380 :     holdingAllLocks = true;
    1447        8380 : }
    1448             : 
    1449             : /*
    1450             :  * Release our insertion lock (or locks, if we're holding them all).
    1451             :  *
    1452             :  * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
    1453             :  * next time the lock is acquired.
                     :  *
                     :  * Which case applies is decided by holdingAllLocks.  If it is set, every
                     :  * lock is released in ascending index order and the flag is cleared;
                     :  * otherwise only the lock identified by MyLockNo is released.
    1454             :  */
    1455             : static void
    1456    29254488 : WALInsertLockRelease(void)
    1457             : {
    1458    29254488 :     if (holdingAllLocks)
    1459             :     {
    1460             :         int         i;
    1461             : 
    1462       75420 :         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    1463       67040 :             LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
    1464       67040 :                                   &WALInsertLocks[i].l.insertingAt,
    1465             :                                   0);
    1466             : 
    1467        8380 :         holdingAllLocks = false;
    1468             :     }
    1469             :     else
    1470             :     {
    1471    29246108 :         LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
    1472    29246108 :                               &WALInsertLocks[MyLockNo].l.insertingAt,
    1473             :                               0);
    1474             :     }
    1475    29254488 : }
    1476             : 
    1477             : /*
    1478             :  * Update our insertingAt value, to let others know that we've finished
    1479             :  * inserting up to that point.
                     :  *
                     :  * The value is published with LWLockUpdateVar on the lock we hold: the
                     :  * lock identified by MyLockNo normally, or the last insertion lock when
                     :  * holdingAllLocks is set.
    1480             :  */
    1481             : static void
    1482     4759434 : WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
    1483             : {
    1484     4759434 :     if (holdingAllLocks)
    1485             :     {
    1486             :         /*
    1487             :          * We use the last lock to mark our actual position, see comments in
    1488             :          * WALInsertLockAcquireExclusive.
    1489             :          */
    1490     1203618 :         LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
    1491     1203618 :                         &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
    1492             :                         insertingAt);
    1493             :     }
    1494             :     else
    1495     3555816 :         LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
    1496     3555816 :                         &WALInsertLocks[MyLockNo].l.insertingAt,
    1497             :                         insertingAt);
    1498     4759434 : }
    1499             : 
    1500             : /*
    1501             :  * Wait for any WAL insertions < upto to finish.
    1502             :  *
    1503             :  * Returns the location of the oldest insertion that is still in-progress.
    1504             :  * Any WAL prior to that point has been fully copied into WAL buffers, and
    1505             :  * can be flushed out to disk. Because this waits for any insertions older
    1506             :  * than 'upto' to finish, the return value is always >= 'upto'.
    1507             :  *
    1508             :  * Note: When you are about to write out WAL, you must call this function
    1509             :  * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
    1510             :  * need to wait for an insertion to finish (or at least advance to next
    1511             :  * uninitialized page), and the inserter might need to evict an old WAL buffer
    1512             :  * to make room for a new one, which in turn requires WALWriteLock.
    1513             :  */
    1514             : static XLogRecPtr
    1515     4090448 : WaitXLogInsertionsToFinish(XLogRecPtr upto)
    1516             : {
    1517             :     uint64      bytepos;
    1518             :     XLogRecPtr  inserted;
    1519             :     XLogRecPtr  reservedUpto;
    1520             :     XLogRecPtr  finishedUpto;
    1521     4090448 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    1522             :     int         i;
    1523             : 
    1524     4090448 :     if (MyProc == NULL)
    1525           0 :         elog(PANIC, "cannot wait without a PGPROC structure");
    1526             : 
    1527             :     /*
    1528             :      * Check if there's any work to do.  Use a barrier to ensure we get the
    1529             :      * freshest value.
    1530             :      */
    1531     4090448 :     inserted = pg_atomic_read_membarrier_u64(&XLogCtl->logInsertResult);
    1532     4090448 :     if (upto <= inserted)
    1533     3305282 :         return inserted;
    1534             : 
    1535             :     /* Read the current insert position */
    1536      785166 :     SpinLockAcquire(&Insert->insertpos_lck);
    1537      785166 :     bytepos = Insert->CurrBytePos;
    1538      785166 :     SpinLockRelease(&Insert->insertpos_lck);
    1539      785166 :     reservedUpto = XLogBytePosToEndRecPtr(bytepos);
    1540             : 
    1541             :     /*
    1542             :      * No-one should request to flush a piece of WAL that hasn't even been
    1543             :      * reserved yet. However, it can happen if there is a block with a bogus
    1544             :      * LSN on disk, for example. XLogFlush checks for that situation and
    1545             :      * complains, but only after the flush. Here we just assume that to mean
    1546             :      * that all WAL that has been reserved needs to be finished. In this
    1547             :      * corner-case, the return value can be smaller than 'upto' argument.
    1548             :      */
    1549      785166 :     if (upto > reservedUpto)
    1550             :     {
    1551           0 :         ereport(LOG,
    1552             :                 (errmsg("request to flush past end of generated WAL; request %X/%X, current position %X/%X",
    1553             :                         LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto))));
    1554           0 :         upto = reservedUpto;
    1555             :     }
    1556             : 
    1557             :     /*
    1558             :      * Loop through all the locks, sleeping on any in-progress insert older
    1559             :      * than 'upto'.
    1560             :      *
    1561             :      * finishedUpto is our return value, indicating the point upto which all
    1562             :      * the WAL insertions have been finished. Initialize it to the head of
    1563             :      * reserved WAL, and as we iterate through the insertion locks, back it
    1564             :      * out for any insertion that's still in progress.
    1565             :      */
    1566      785166 :     finishedUpto = reservedUpto;
    1567     7066494 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    1568             :     {
    1569     6281328 :         XLogRecPtr  insertingat = InvalidXLogRecPtr;
    1570             : 
    1571             :         do
    1572             :         {
    1573             :             /*
    1574             :              * See if this insertion is in progress.  LWLockWaitForVar will
    1575             :              * wait for the lock to be released, or for the 'value' to be set
    1576             :              * by a LWLockUpdateVar call.  When a lock is initially acquired,
    1577             :              * its value is 0 (InvalidXLogRecPtr), which means that we don't
    1578             :              * know where it's inserting yet.  We will have to wait for it. If
    1579             :              * it's a small insertion, the record will most likely fit on the
    1580             :              * same page and the inserter will release the lock without ever
    1581             :              * calling LWLockUpdateVar.  But if it has to sleep, it will
    1582             :              * advertise the insertion point with LWLockUpdateVar before
    1583             :              * sleeping.
    1584             :              *
    1585             :              * In this loop we are only waiting for insertions that started
    1586             :              * before WaitXLogInsertionsToFinish was called.  The lack of
    1587             :              * memory barriers in the loop means that we might see locks as
    1588             :              * "unused" that have since become used.  This is fine because
    1589             :              * they only can be used for later insertions that we would not
    1590             :              * want to wait on anyway.  Not taking a lock to acquire the
    1591             :              * current insertingAt value means that we might see older
    1592             :              * insertingAt values.  This is also fine, because if we read a
    1593             :              * value too old, we will add ourselves to the wait queue, which
    1594             :              * contains atomic operations.
    1595             :              */
    1596     6393716 :             if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
    1597     6393716 :                                  &WALInsertLocks[i].l.insertingAt,
    1598             :                                  insertingat, &insertingat))
    1599             :             {
    1600             :                 /* the lock was free, so no insertion in progress */
    1601     4680066 :                 insertingat = InvalidXLogRecPtr;
    1602     4680066 :                 break;
    1603             :             }
    1604             : 
    1605             :             /*
    1606             :              * This insertion is still in progress. Have to wait, unless the
    1607             :              * inserter has proceeded past 'upto'.
    1608             :              */
    1609     1713650 :         } while (insertingat < upto);
    1610             : 
    1611     6281328 :         if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
    1612      604294 :             finishedUpto = insertingat;
    1613             :     }
    1614             : 
    1615             :     /*
    1616             :      * Advance the limit we know to have been inserted and return the freshest
    1617             :      * value we know of, which might be beyond what we requested if somebody
    1618             :      * is concurrently doing this with an 'upto' pointer ahead of us.
    1619             :      */
    1620      785166 :     finishedUpto = pg_atomic_monotonic_advance_u64(&XLogCtl->logInsertResult,
    1621             :                                                    finishedUpto);
    1622             : 
    1623      785166 :     return finishedUpto;
    1624             : }
    1625             : 
    1626             : /*
    1627             :  * Get a pointer to the right location in the WAL buffer containing the
    1628             :  * given XLogRecPtr.
    1629             :  *
    1630             :  * If the page is not initialized yet, it is initialized. That might require
    1631             :  * evicting an old dirty buffer from the buffer cache, which means I/O.
    1632             :  *
    1633             :  * The caller must ensure that the page containing the requested location
    1634             :  * isn't evicted yet, and won't be evicted. The way to ensure that is to
    1635             :  * hold onto a WAL insertion lock with the insertingAt position set to
    1636             :  * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
    1637             :  * to evict an old page from the buffer. (This means that once you call
    1638             :  * GetXLogBuffer() with a given 'ptr', you must not access anything before
    1639             :  * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
    1640             :  * later, because older buffers might be recycled already.)
    1641             :  */
    1642             : static char *
    1643    33990558 : GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli)
    1644             : {
    1645             :     int         idx;
    1646             :     XLogRecPtr  endptr;
    1647             :     static uint64 cachedPage = 0;
    1648             :     static char *cachedPos = NULL;
    1649             :     XLogRecPtr  expectedEndPtr;
    1650             : 
    1651             :     /*
    1652             :      * Fast path for the common case where we need to access the same page
    1653             :      * as last time; cachedPage/cachedPos remember the previous lookup.
    1654             :      */
    1655    33990558 :     if (ptr / XLOG_BLCKSZ == cachedPage)
    1656             :     {
    1657             :         Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
    1658             :         Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
    1659    28262786 :         return cachedPos + ptr % XLOG_BLCKSZ;
    1660             :     }
    1661             : 
    1662             :     /*
    1663             :      * The XLog buffer cache is organized so that a page is always loaded to a
    1664             :      * particular buffer.  That way we can easily calculate the buffer a given
    1665             :      * page must be loaded into, from the XLogRecPtr alone.
    1666             :      */
    1667     5727772 :     idx = XLogRecPtrToBufIdx(ptr);
    1668             : 
    1669             :     /*
    1670             :      * See what page is loaded in the buffer at the moment. It could be the
    1671             :      * page we're looking for, or something older. It can't be anything newer
    1672             :      * - that would imply the page we're looking for has already been written
    1673             :      * out to disk and evicted, and the caller is responsible for making sure
    1674             :      * that doesn't happen.
    1675             :      *
    1676             :      * We don't hold a lock while we read the value. If someone is just about
    1677             :      * to initialize or has just initialized the page, it's possible that we
    1678             :      * get InvalidXLogRecPtr. That's ok, we'll grab the mapping lock (in
    1679             :      * AdvanceXLInsertBuffer) and retry if we see anything other than the page
    1680             :      * we're looking for.
    1681             :      */
    1682     5727772 :     expectedEndPtr = ptr;
    1683     5727772 :     expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
    1684             : 
    1685     5727772 :     endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
    1686     5727772 :     if (expectedEndPtr != endptr)
    1687             :     {
    1688             :         XLogRecPtr  initializedUpto;
    1689             : 
    1690             :         /*
    1691             :          * Before calling AdvanceXLInsertBuffer(), which can block, let others
    1692             :          * know how far we're finished with inserting the record.
    1693             :          *
    1694             :          * NB: If 'ptr' points to just after the page header, advertise a
    1695             :          * position at the beginning of the page rather than 'ptr' itself. If
    1696             :          * there are no other insertions running, someone might try to flush
    1697             :          * up to our advertised location. If we advertised a position after
    1698             :          * the page header, someone might try to flush the page header, even
    1699             :          * though the page might not actually be initialized yet. As the first
    1700             :          * inserter on the page, we are effectively responsible for making
    1701             :          * sure that it's initialized, before we let insertingAt move past
    1702             :          * the page header.
    1703             :          */
    1704     4759434 :         if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
    1705       11444 :             XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ)
    1706       11444 :             initializedUpto = ptr - SizeOfXLogShortPHD; /* just past a short page header */
    1707     4747990 :         else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
    1708        1964 :                  XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ)
    1709        1128 :             initializedUpto = ptr - SizeOfXLogLongPHD; /* just past the long header on a segment's first page */
    1710             :         else
    1711     4746862 :             initializedUpto = ptr;
    1712             : 
    1713     4759434 :         WALInsertLockUpdateInsertingAt(initializedUpto);
    1714             : 
    1715     4759434 :         AdvanceXLInsertBuffer(ptr, tli, false);
    1716     4759434 :         endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]); /* re-check now that buffers were advanced */
    1717             : 
    1718     4759434 :         if (expectedEndPtr != endptr)
    1719           0 :             elog(PANIC, "could not find WAL buffer for %X/%X",
    1720             :                  LSN_FORMAT_ARGS(ptr));
    1721             :     }
    1722             :     else
    1723             :     {
    1724             :         /*
    1725             :          * Make sure the initialization of the page is visible to us, and
    1726             :          * won't arrive later to overwrite the WAL data we write on the page.
    1727             :          */
    1728      968338 :         pg_memory_barrier();
    1729             :     }
    1730             : 
    1731             :     /*
    1732             :      * Found the buffer holding this page. Remember it for the fast path and
    1733             :      * return a pointer to the right offset within the page.
    1734             :      */
    1735     5727772 :     cachedPage = ptr / XLOG_BLCKSZ;
    1736     5727772 :     cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
    1737             : 
    1738             :     Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
    1739             :     Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
    1740             : 
    1741     5727772 :     return cachedPos + ptr % XLOG_BLCKSZ;
    1742             : }
    1743             : 
    1744             : /*
    1745             :  * Read WAL data directly from WAL buffers, if available. Returns the number
    1746             :  * of bytes read successfully.
    1747             :  *
    1748             :  * Fewer than 'count' bytes may be read if some of the requested WAL data has
    1749             :  * already been evicted.
    1750             :  *
    1751             :  * No locks are taken.
    1752             :  *
    1753             :  * Caller should ensure that it reads no further than LogwrtResult.Write
    1754             :  * (which the caller should have updated when determining how far to read).
    1755             :  * 'tli' is only a safety check against reading from WAL buffers on a historical
    1756             :  * timeline: 0 is returned on a mismatch, and also while recovery is in progress.
    1757             :  */
    1758             : Size
    1759      201266 : WALReadFromBuffers(char *dstbuf, XLogRecPtr startptr, Size count,
    1760             :                    TimeLineID tli)
    1761             : {
    1762      201266 :     char       *pdst = dstbuf;
    1763      201266 :     XLogRecPtr  recptr = startptr;
    1764             :     XLogRecPtr  inserted;
    1765      201266 :     Size        nbytes = count;
    1766             : 
    1767      201266 :     if (RecoveryInProgress() || tli != GetWALInsertionTimeLine())
    1768        1836 :         return 0;
    1769             : 
    1770             :     Assert(!XLogRecPtrIsInvalid(startptr));
    1771             : 
    1772             :     /*
    1773             :      * Caller should ensure that the requested data has been inserted into WAL
    1774             :      * buffers before we try to read it; a request beyond that is an ERROR.
    1775             :      */
    1776      199430 :     inserted = pg_atomic_read_u64(&XLogCtl->logInsertResult);
    1777      199430 :     if (startptr + count > inserted)
    1778           0 :         ereport(ERROR,
    1779             :                 errmsg("cannot read past end of generated WAL: requested %X/%X, current position %X/%X",
    1780             :                        LSN_FORMAT_ARGS(startptr + count),
    1781             :                        LSN_FORMAT_ARGS(inserted)));
    1782             : 
    1783             :     /*
    1784             :      * Loop through the buffers without a lock. For each buffer, atomically
    1785             :      * read and verify the end pointer, then copy the data out, and finally
    1786             :      * re-read and re-verify the end pointer.
    1787             :      *
    1788             :      * Once a page is evicted, it never returns to the WAL buffers, so if the
    1789             :      * end pointer matches the expected end pointer before and after we copy
    1790             :      * the data, then the right page must have been present during the data
    1791             :      * copy. Read barriers are necessary to ensure that the data copy actually
    1792             :      * happens between the two verification steps.
    1793             :      *
    1794             :      * If either verification fails, we simply terminate the loop and return
    1795             :      * with the data that had been already copied out successfully.
    1796             :      */
    1797      226806 :     while (nbytes > 0)
    1798             :     {
    1799      218984 :         uint32      offset = recptr % XLOG_BLCKSZ;
    1800      218984 :         int         idx = XLogRecPtrToBufIdx(recptr);
    1801             :         XLogRecPtr  expectedEndPtr;
    1802             :         XLogRecPtr  endptr;
    1803             :         const char *page;
    1804             :         const char *psrc;
    1805             :         Size        npagebytes;
    1806             : 
    1807             :         /*
    1808             :          * Calculate the end pointer we expect in the xlblocks array if the
    1809             :          * correct page is present.
    1810             :          */
    1811      218984 :         expectedEndPtr = recptr + (XLOG_BLCKSZ - offset);
    1812             : 
    1813             :         /*
    1814             :          * First verification step: check that the correct page is present in
    1815             :          * the WAL buffers.
    1816             :          */
    1817      218984 :         endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
    1818      218984 :         if (expectedEndPtr != endptr)
    1819      191608 :             break;
    1820             : 
    1821             :         /*
    1822             :          * The correct page is present (or was at the time the endptr was
    1823             :          * read; must re-verify later). Calculate pointer to source data and
    1824             :          * determine how much data to read from this page.
    1825             :          */
    1826       27376 :         page = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
    1827       27376 :         psrc = page + offset;
    1828       27376 :         npagebytes = Min(nbytes, XLOG_BLCKSZ - offset);
    1829             : 
    1830             :         /*
    1831             :          * Ensure that the data copy and the first verification step are not
    1832             :          * reordered.
    1833             :          */
    1834       27376 :         pg_read_barrier();
    1835             : 
    1836             :         /* data copy */
    1837       27376 :         memcpy(pdst, psrc, npagebytes);
    1838             : 
    1839             :         /*
    1840             :          * Ensure that the data copy and the second verification step are not
    1841             :          * reordered.
    1842             :          */
    1843       27376 :         pg_read_barrier();
    1844             : 
    1845             :         /*
    1846             :          * Second verification step: check that the page we read from wasn't
    1847             :          * evicted while we were copying the data.
    1848             :          */
    1849       27376 :         endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
    1850       27376 :         if (expectedEndPtr != endptr)
    1851           0 :             break;
    1852             : 
    1853       27376 :         pdst += npagebytes;
    1854       27376 :         recptr += npagebytes;
    1855       27376 :         nbytes -= npagebytes;
    1856             :     }
    1857             : 
    1858             :     Assert(pdst - dstbuf <= count);
    1859             : 
    1860      199430 :     return pdst - dstbuf;
    1861             : }
    1862             : 
    1863             : /*
    1864             :  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
    1865             :  * is the position starting from the beginning of WAL, excluding all WAL
    1866             :  * page headers.  At a page boundary, the result points past the page header.
    1867             :  */
    1868             : static XLogRecPtr
    1869    58473954 : XLogBytePosToRecPtr(uint64 bytepos)
    1870             : {
    1871             :     uint64      fullsegs;
    1872             :     uint64      fullpages;
    1873             :     uint64      bytesleft;
    1874             :     uint32      seg_offset;
    1875             :     XLogRecPtr  result;
    1876             : 
    1877    58473954 :     fullsegs = bytepos / UsableBytesInSegment;
    1878    58473954 :     bytesleft = bytepos % UsableBytesInSegment;
    1879             : 
    1880    58473954 :     if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
    1881             :     {
    1882             :         /* fits on first page of segment, after its long page header */
    1883      102362 :         seg_offset = bytesleft + SizeOfXLogLongPHD;
    1884             :     }
    1885             :     else
    1886             :     {
    1887             :         /* account for the first page on segment with long header */
    1888    58371592 :         seg_offset = XLOG_BLCKSZ;
    1889    58371592 :         bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
    1890             : 
    1891    58371592 :         fullpages = bytesleft / UsableBytesInPage;
    1892    58371592 :         bytesleft = bytesleft % UsableBytesInPage;
    1893             : 
    1894    58371592 :         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD; /* always skip the short header */
    1895             :     }
    1896             : 
    1897    58473954 :     XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
    1898             : 
    1899    58473954 :     return result;
    1900             : }
    1901             : 
    1902             : /*
    1903             :  * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
    1904             :  * returns a pointer to the beginning of the page (i.e. before the page header),
    1905             :  * not to where the first xlog record on that page would go. This is used
    1906             :  * when converting a pointer to the end of a record.
    1907             :  */
    1908             : static XLogRecPtr
    1909    30021042 : XLogBytePosToEndRecPtr(uint64 bytepos)
    1910             : {
    1911             :     uint64      fullsegs;
    1912             :     uint64      fullpages;
    1913             :     uint64      bytesleft;
    1914             :     uint32      seg_offset;
    1915             :     XLogRecPtr  result;
    1916             : 
    1917    30021042 :     fullsegs = bytepos / UsableBytesInSegment;
    1918    30021042 :     bytesleft = bytepos % UsableBytesInSegment;
    1919             : 
    1920    30021042 :     if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
    1921             :     {
    1922             :         /* fits on first page of segment */
    1923      160912 :         if (bytesleft == 0)
    1924      107150 :             seg_offset = 0; /* segment boundary: stay before the long header */
    1925             :         else
    1926       53762 :             seg_offset = bytesleft + SizeOfXLogLongPHD;
    1927             :     }
    1928             :     else
    1929             :     {
    1930             :         /* account for the first page on segment with long header */
    1931    29860130 :         seg_offset = XLOG_BLCKSZ;
    1932    29860130 :         bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
    1933             : 
    1934    29860130 :         fullpages = bytesleft / UsableBytesInPage;
    1935    29860130 :         bytesleft = bytesleft % UsableBytesInPage;
    1936             : 
    1937    29860130 :         if (bytesleft == 0)
    1938       28756 :             seg_offset += fullpages * XLOG_BLCKSZ + bytesleft; /* page boundary: stay before the short header */
    1939             :         else
    1940    29831374 :             seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
    1941             :     }
    1942             : 
    1943    30021042 :     XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
    1944             : 
    1945    30021042 :     return result;
    1946             : }
    1947             : 
    1948             : /*
    1949             :  * Convert an XLogRecPtr to a "usable byte position", i.e. a position that does not count WAL page headers.
    1950             :  */
    1951             : static uint64
    1952        4810 : XLogRecPtrToBytePos(XLogRecPtr ptr)
    1953             : {
    1954             :     uint64      fullsegs;
    1955             :     uint32      fullpages;
    1956             :     uint32      offset;
    1957             :     uint64      result;
    1958             : 
    1959        4810 :     XLByteToSeg(ptr, fullsegs, wal_segment_size);
    1960             : 
    1961        4810 :     fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
    1962        4810 :     offset = ptr % XLOG_BLCKSZ;
    1963             : 
    1964        4810 :     if (fullpages == 0) /* ptr is on the first page of its segment (long header) */
    1965             :     {
    1966        1878 :         result = fullsegs * UsableBytesInSegment;
    1967        1878 :         if (offset > 0)
    1968             :         {
    1969             :             Assert(offset >= SizeOfXLogLongPHD);
    1970         490 :             result += offset - SizeOfXLogLongPHD;
    1971             :         }
    1972             :     }
    1973             :     else
    1974             :     {
    1975        2932 :         result = fullsegs * UsableBytesInSegment +
    1976        2932 :             (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
    1977        2932 :             (fullpages - 1) * UsableBytesInPage;    /* full pages */
    1978        2932 :         if (offset > 0)
    1979             :         {
    1980             :             Assert(offset >= SizeOfXLogShortPHD);
    1981        2914 :             result += offset - SizeOfXLogShortPHD;
    1982             :         }
    1983             :     }
    1984             : 
    1985        4810 :     return result;
    1986             : }
    1987             : 
    1988             : /*
    1989             :  * Initialize XLOG buffers, writing out old buffers if they still contain
    1990             :  * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
    1991             :  * true, initialize as many pages as we can without having to write out
    1992             :  * unwritten data. Any new pages are initialized to zeros, with pages headers
    1993             :  * initialized properly.
    1994             :  */
    1995             : static void
    1996     4768792 : AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
    1997             : {
    1998     4768792 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    1999             :     int         nextidx;
    2000             :     XLogRecPtr  OldPageRqstPtr;
    2001             :     XLogwrtRqst WriteRqst;
    2002     4768792 :     XLogRecPtr  NewPageEndPtr = InvalidXLogRecPtr;
    2003             :     XLogRecPtr  NewPageBeginPtr;
    2004             :     XLogPageHeader NewPage;
    2005             :     XLogRecPtr  ReservedPtr;
    2006     4768792 :     int         npages pg_attribute_unused() = 0;
    2007             : 
    2008             :     /*
    2009             :      * We must run the loop below inside a critical section, as we expect
    2010             :      * XLogCtl->InitializedUpTo to eventually keep up.  Most callers already
    2011             :      * run inside a critical section.  The exception is the WAL writer, which
    2012             :      * passes 'opportunistic == true', in which case we don't perform
    2013             :      * operations that could error out.
    2014             :      *
    2015             :      * Start an explicit critical section anyway though.
    2016             :      */
    2017             :     Assert(CritSectionCount > 0 || opportunistic);
    2018     4768792 :     START_CRIT_SECTION();
    2019             : 
    2020             :     /*--
    2021             :      * Loop till we get all the pages in WAL buffer before 'upto' reserved for
    2022             :      * initialization.  Multiple processes can initialize different buffers
    2023             :      * with this loop in parallel, as follows.
    2024             :      *
    2025             :      * 1. Reserve page for initialization using XLogCtl->InitializeReserved.
    2026             :      * 2. Initialize the reserved page.
    2027             :      * 3. Attempt to advance XLogCtl->InitializedUpTo.
    2028             :      */
    2029     4768792 :     ReservedPtr = pg_atomic_read_u64(&XLogCtl->InitializeReserved);
    2030    14526600 :     while (upto >= ReservedPtr || opportunistic)
    2031             :     {
    2032             :         Assert(ReservedPtr % XLOG_BLCKSZ == 0);
    2033             : 
    2034             :         /*
    2035             :          * Get ending-offset of the buffer page we need to replace.
    2036             :          *
    2037             :          * We don't lookup into xlblocks, but rather calculate position we
    2038             :          * must wait to be written. If it was written, xlblocks will have this
    2039             :          * position (or uninitialized)
    2040             :          */
    2041     9767166 :         if (ReservedPtr + XLOG_BLCKSZ > XLogCtl->InitializedFrom + XLOG_BLCKSZ * XLOGbuffers)
    2042     9180966 :             OldPageRqstPtr = ReservedPtr + XLOG_BLCKSZ - (XLogRecPtr) XLOG_BLCKSZ * XLOGbuffers;
    2043             :         else
    2044      586200 :             OldPageRqstPtr = InvalidXLogRecPtr;
    2045             : 
    2046     9767166 :         if (LogwrtResult.Write < OldPageRqstPtr && opportunistic)
    2047             :         {
    2048             :             /*
    2049             :              * If we just want to pre-initialize as much as we can without
    2050             :              * flushing, give up now.
    2051             :              */
    2052        9358 :             upto = ReservedPtr - 1;
    2053        9358 :             break;
    2054             :         }
    2055             : 
    2056             :         /*
    2057             :          * Attempt to reserve the page for initialization.  Failure means that
    2058             :          * this page got reserved by another process.
    2059             :          */
    2060     9757808 :         if (!pg_atomic_compare_exchange_u64(&XLogCtl->InitializeReserved,
    2061             :                                             &ReservedPtr,
    2062             :                                             ReservedPtr + XLOG_BLCKSZ))
    2063     4881358 :             continue;
    2064             : 
    2065             :         /*
    2066             :          * Wait till page gets correctly initialized up to OldPageRqstPtr.
    2067             :          */
    2068     4876450 :         nextidx = XLogRecPtrToBufIdx(ReservedPtr);
    2069     4876826 :         while (pg_atomic_read_u64(&XLogCtl->InitializedUpTo) < OldPageRqstPtr)
    2070         376 :             ConditionVariableSleep(&XLogCtl->InitializedUpToCondVar, WAIT_EVENT_WAL_BUFFER_INIT);
    2071     4876450 :         ConditionVariableCancelSleep();
    2072             :         Assert(pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]) == OldPageRqstPtr);
    2073             : 
    2074             :         /* Fall through if it's already written out. */
    2075     4876450 :         if (LogwrtResult.Write < OldPageRqstPtr)
    2076             :         {
    2077             :             /* Nope, got work to do. */
    2078             : 
    2079             :             /* Advance shared memory write request position */
    2080     3834994 :             SpinLockAcquire(&XLogCtl->info_lck);
    2081     3834994 :             if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
    2082     1110000 :                 XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
    2083     3834994 :             SpinLockRelease(&XLogCtl->info_lck);
    2084             : 
    2085             :             /*
    2086             :              * Acquire an up-to-date LogwrtResult value and see if we still
    2087             :              * need to write it or if someone else already did.
    2088             :              */
    2089     3834994 :             RefreshXLogWriteResult(LogwrtResult);
    2090     3834994 :             if (LogwrtResult.Write < OldPageRqstPtr)
    2091             :             {
    2092     3814612 :                 WaitXLogInsertionsToFinish(OldPageRqstPtr);
    2093             : 
    2094     3814612 :                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
    2095             : 
    2096     3814612 :                 RefreshXLogWriteResult(LogwrtResult);
    2097     3814612 :                 if (LogwrtResult.Write >= OldPageRqstPtr)
    2098             :                 {
    2099             :                     /* OK, someone wrote it already */
    2100       48442 :                     LWLockRelease(WALWriteLock);
    2101             :                 }
    2102             :                 else
    2103             :                 {
    2104             :                     /* Have to write it ourselves */
    2105             :                     TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
    2106     3766170 :                     WriteRqst.Write = OldPageRqstPtr;
    2107     3766170 :                     WriteRqst.Flush = 0;
    2108     3766170 :                     XLogWrite(WriteRqst, tli, false);
    2109     3766170 :                     LWLockRelease(WALWriteLock);
    2110     3766170 :                     pgWalUsage.wal_buffers_full++;
    2111             :                     TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
    2112             :                 }
    2113             :             }
    2114             :         }
    2115             : 
    2116             :         /*
    2117             :          * Now the next buffer slot is free and we can set it up to be the
    2118             :          * next output page.
    2119             :          */
    2120     4876450 :         NewPageBeginPtr = ReservedPtr;
    2121     4876450 :         NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
    2122             : 
    2123     4876450 :         NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
    2124             : 
    2125             :         /*
    2126             :          * Mark the xlblock with InvalidXLogRecPtr and issue a write barrier
    2127             :          * before initializing. Otherwise, the old page may be partially
    2128             :          * zeroed but look valid.
    2129             :          */
    2130     4876450 :         pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], InvalidXLogRecPtr);
    2131     4876450 :         pg_write_barrier();
    2132             : 
    2133             :         /*
    2134             :          * Be sure to re-zero the buffer so that bytes beyond what we've
    2135             :          * written will look like zeroes and not valid XLOG records...
    2136             :          */
    2137     4876450 :         MemSet(NewPage, 0, XLOG_BLCKSZ);
    2138             : 
    2139             :         /*
    2140             :          * Fill the new page's header
    2141             :          */
    2142     4876450 :         NewPage->xlp_magic = XLOG_PAGE_MAGIC;
    2143             : 
    2144             :         /* NewPage->xlp_info = 0; */ /* done by memset */
    2145     4876450 :         NewPage->xlp_tli = tli;
    2146     4876450 :         NewPage->xlp_pageaddr = NewPageBeginPtr;
    2147             : 
    2148             :         /* NewPage->xlp_rem_len = 0; */  /* done by memset */
    2149             : 
    2150             :         /*
    2151             :          * If online backup is not in progress, mark the header to indicate
    2152             :          * that WAL records beginning in this page have removable backup
    2153             :          * blocks.  This allows the WAL archiver to know whether it is safe to
    2154             :          * compress archived WAL data by transforming full-block records into
    2155             :          * the non-full-block format.  It is sufficient to record this at the
    2156             :          * page level because we force a page switch (in fact a segment
    2157             :          * switch) when starting a backup, so the flag will be off before any
    2158             :          * records can be written during the backup.  At the end of a backup,
    2159             :          * the last page will be marked as all unsafe when perhaps only part
    2160             :          * is unsafe, but at worst the archiver would miss the opportunity to
    2161             :          * compress a few records.
    2162             :          */
    2163     4876450 :         if (Insert->runningBackups == 0)
    2164     4626268 :             NewPage->xlp_info |= XLP_BKP_REMOVABLE;
    2165             : 
    2166             :         /*
    2167             :          * If first page of an XLOG segment file, make it a long header.
    2168             :          */
    2169     4876450 :         if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
    2170             :         {
    2171        3658 :             XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
    2172             : 
    2173        3658 :             NewLongPage->xlp_sysid = ControlFile->system_identifier;
    2174        3658 :             NewLongPage->xlp_seg_size = wal_segment_size;
    2175        3658 :             NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
    2176        3658 :             NewPage->xlp_info |= XLP_LONG_HEADER;
    2177             :         }
    2178             : 
    2179             :         /*
    2180             :          * Make sure the initialization of the page becomes visible to others
    2181             :          * before the xlblocks update. GetXLogBuffer() reads xlblocks without
    2182             :          * holding a lock.
    2183             :          */
    2184     4876450 :         pg_write_barrier();
    2185             : 
    2186             :         /*-----
    2187             :          * Update the value of XLogCtl->xlblocks[nextidx] and try to advance
    2188             :          * XLogCtl->InitializedUpTo in a lock-less manner.
    2189             :          *
    2190             :          * First, let's provide a formal proof of the algorithm.  Let there be
    2191             :          * 'n' processes with the following variables in shared memory:
    2192             :          *  f - an array of 'n' boolean flags,
    2193             :          *  v - atomic integer variable.
    2194             :          *
    2195             :          * Also, let
    2196             :          *  i - a number of a process,
    2197             :          *  j - local integer variable,
    2198             :          * CAS(var, oldval, newval) - compare-and-swap atomic operation
    2199             :          *                            returning true on success,
    2200             :          * write_barrier()/read_barrier() - memory barriers.
    2201             :          *
    2202             :          * The pseudocode for each process is the following.
    2203             :          *
    2204             :          *  j := i
    2205             :          *  f[i] := true
    2206             :          *  write_barrier()
    2207             :          *  while CAS(v, j, j + 1):
    2208             :          *      j := j + 1
    2209             :          *      read_barrier()
    2210             :          *      if not f[j]:
    2211             :          *          break
    2212             :          *
    2213             :          * Let's prove that v eventually reaches the value of n.
    2214             :          * 1. Prove by contradiction.  Assume v doesn't reach n and gets
    2215             :          *    stuck at k, where k < n.
    2216             :          * 2. Process k attempts CAS(v, k, k + 1).  If, as we assumed, v
    2217             :          *    gets stuck at k, then this CAS operation must fail.  Therefore,
    2218             :          *    v < k when process k attempts CAS(v, k, k + 1).
    2219             :          * 3. If, as we assumed, v gets stuck at k, then the value k of v
    2220             :          *    must be achieved by some process m, where m < k.  The process
    2221             :          *    m must observe f[k] == false.  Otherwise, it will later attempt
    2222             :          *    CAS(v, k, k + 1) with success.
    2223             :          * 4. Therefore, the corresponding read_barrier() (while j == k) on
    2224             :          *    process m was reached before write_barrier() of process k.  But then
    2225             :          *    process k attempts CAS(v, k, k + 1) after process m successfully
    2226             :          *    incremented v to k, and that CAS operation must succeed.
    2227             :          *    That leads to a contradiction.  So, there is no such k (k < n)
    2228             :          *    where v gets stuck.  Q.E.D.
    2229             :          *
    2230             :          * To apply this proof to the code below, we assume
    2231             :          * XLogCtl->InitializedUpTo will play the role of v with XLOG_BLCKSZ
    2232             :          * granularity.  We also assume setting XLogCtl->xlblocks[nextidx] to
    2233             :          * NewPageEndPtr to play the role of setting f[i] to true.  Also, note
    2234             :          * that processes can't concurrently map different xlog locations to
    2235             :          * the same nextidx because we previously requested that
    2236             :          * XLogCtl->InitializedUpTo >= OldPageRqstPtr.  So, a xlog buffer can
    2237             :          * be taken for initialization only once the previous initialization
    2238             :          * takes effect on XLogCtl->InitializedUpTo.
    2239             :          */
    2240             : 
    2241     4876450 :         pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], NewPageEndPtr);
    2242             : 
    2243     4876450 :         pg_write_barrier();
    2244             : 
    2245     4943212 :         while (pg_atomic_compare_exchange_u64(&XLogCtl->InitializedUpTo, &NewPageBeginPtr, NewPageEndPtr))
    2246             :         {
    2247     4884752 :             NewPageBeginPtr = NewPageEndPtr;
    2248     4884752 :             NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
    2249     4884752 :             nextidx = XLogRecPtrToBufIdx(NewPageBeginPtr);
    2250             : 
    2251     4884752 :             pg_read_barrier();
    2252             : 
    2253     4884752 :             if (pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]) != NewPageEndPtr)
    2254             :             {
    2255             :                 /*
    2256             :                  * Page at nextidx wasn't initialized yet, so we can't move
    2257             :                  * InitializedUpTo further. It will be moved by the backend
    2258             :                  * that initializes nextidx.
    2259             :                  */
    2260     4817990 :                 ConditionVariableBroadcast(&XLogCtl->InitializedUpToCondVar);
    2261     4817990 :                 break;
    2262             :             }
    2263             :         }
    2264             : 
    2265     4876450 :         npages++;
    2266             :     }
    2267             : 
    2268     4768792 :     END_CRIT_SECTION();
    2269             : 
    2270             :     /*
    2271             :      * All the pages in WAL buffer before 'upto' were reserved for
    2272             :      * initialization.  However, some pages might be reserved by concurrent
    2273             :      * processes.  Wait till they finish initialization.
    2274             :      */
    2275     5559728 :     while (upto >= pg_atomic_read_u64(&XLogCtl->InitializedUpTo))
    2276      790936 :         ConditionVariableSleep(&XLogCtl->InitializedUpToCondVar, WAIT_EVENT_WAL_BUFFER_INIT);
    2277     4768792 :     ConditionVariableCancelSleep();
    2278             : 
    2279     4768792 :     pg_read_barrier();
    2280             : 
    2281             : #ifdef WAL_DEBUG
    2282             :     if (XLOG_DEBUG && npages > 0)
    2283             :     {
    2284             :         elog(DEBUG1, "initialized %d pages, up to %X/%X",
    2285             :              npages, LSN_FORMAT_ARGS(NewPageEndPtr));
    2286             :     }
    2287             : #endif
    2288     4768792 : }
    2289             : 
    2290             : /*
    2291             :  * Recalculate CheckPointSegments from max_wal_size_mb and
    2292             :  * checkpoint_completion_target.  The result is never less than 1.
    2293             :  */
    2294             : static void
    2295       14744 : CalculateCheckpointSegments(void)
    2296             : {
    2297             :     double      target;         /* untruncated value for CheckPointSegments */
    2298             : 
    2299             :     /*-------
    2300             :      * Calculate the distance at which to trigger a checkpoint, to avoid
    2301             :      * exceeding max_wal_size_mb. This is based on two assumptions:
    2302             :      *
    2303             :      * a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
    2304             :      *    WAL for two checkpoint cycles to allow us to recover from the
    2305             :      *    secondary checkpoint if the first checkpoint failed, though we
    2306             :      *    only did this on the primary anyway, not on standby. Keeping just
    2307             :      *    one checkpoint simplifies processing and reduces disk space in
    2308             :      *    many smaller databases.)
    2309             :      * b) during checkpoint, we consume checkpoint_completion_target *
    2310             :      *    number of segments consumed between checkpoints.
    2311             :      *-------
    2312             :      */
    2313       14744 :     target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
    2314       14744 :         (1.0 + CheckPointCompletionTarget);
    2315             : 
    2316             :     /* round down */
    2317       14744 :     CheckPointSegments = (int) target;
    2318             : 
    2319       14744 :     if (CheckPointSegments < 1)     /* never go below one */
    2320          20 :         CheckPointSegments = 1;
    2321       14744 : }
    2322             : 
    2323             : void    /* presumably the GUC assign hook for max_wal_size -- confirm in the GUC tables */
    2324       10590 : assign_max_wal_size(int newval, void *extra)    /* 'extra' is not used here */
    2325             : {
    2326       10590 :     max_wal_size_mb = newval;       /* remember the new setting */
    2327       10590 :     CalculateCheckpointSegments();  /* CheckPointSegments is derived from it */
    2328       10590 : }
    2329             : 
    2330             : void    /* presumably the GUC assign hook for checkpoint_completion_target -- confirm in the GUC tables */
    2331        2208 : assign_checkpoint_completion_target(double newval, void *extra)    /* 'extra' is not used here */
    2332             : {
    2333        2208 :     CheckPointCompletionTarget = newval;    /* remember the new setting */
    2334        2208 :     CalculateCheckpointSegments();          /* CheckPointSegments is derived from it */
    2335        2208 : }
    2336             : 
    2337             : bool    /* true if *newval passes IsValidWalSegSize(), else false with an errdetail set */
    2338        4258 : check_wal_segment_size(int *newval, void **extra, GucSource source)    /* only *newval is examined */
    2339             : {
    2340        4258 :     if (!IsValidWalSegSize(*newval))
    2341             :     {
    2342           0 :         GUC_check_errdetail("The WAL segment size must be a power of two between 1 MB and 1 GB.");
    2343           0 :         return false;               /* reject the proposed value */
    2344             :     }
    2345             : 
    2346        4258 :     return true;                    /* accept the value unchanged */
    2347             : }
    2348             : 
    2349             : /*
    2350             :  * GUC check_hook for max_slot_wal_keep_size
    2351             :  *
    2352             :  * During binary upgrade, no value other than -1 is allowed for
    2353             :  * max_slot_wal_keep_size. See start_postmaster() in pg_upgrade for details.
    2354             :  */
    2355             : bool
    2356        2468 : check_max_slot_wal_keep_size(int *newval, void **extra, GucSource source)    /* only *newval is examined */
    2357             : {
    2358        2468 :     if (IsBinaryUpgrade && *newval != -1)
    2359             :     {
    2360           0 :         GUC_check_errdetail("\"%s\" must be set to -1 during binary upgrade mode.",
    2361             :                             "max_slot_wal_keep_size");
    2362           0 :         return false;               /* reject the proposed value */
    2363             :     }
    2364             : 
    2365        2468 :     return true;                    /* any value is accepted outside binary upgrade */
    2366             : }
    2367             : 
    2368             : /*
    2369             :  * At a checkpoint, how many WAL segments to recycle as preallocated future
    2370             :  * XLOG segments? Returns the highest segment that should be preallocated.
    2371             :  */
    2372             : static XLogSegNo
    2373        3368 : XLOGfileslop(XLogRecPtr lastredoptr)    /* lastredoptr: WAL position all limits are measured from */
    2374             : {
    2375             :     XLogSegNo   minSegNo;       /* lower limit, derived from min_wal_size_mb */
    2376             :     XLogSegNo   maxSegNo;       /* upper limit, derived from max_wal_size_mb */
    2377             :     double      distance;       /* estimated bytes until the end of the next checkpoint */
    2378             :     XLogSegNo   recycleSegNo;   /* result; the upper limit is applied last, so it wins */
    2379             : 
    2380             :     /*
    2381             :      * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
    2382             :      * correspond to. Always recycle enough segments to meet the minimum, and
    2383             :      * remove enough segments to stay below the maximum.
    2384             :      */
    2385        3368 :     minSegNo = lastredoptr / wal_segment_size +
    2386        3368 :         ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1;
    2387        3368 :     maxSegNo = lastredoptr / wal_segment_size +
    2388        3368 :         ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1;
    2389             : 
    2390             :     /*
    2391             :      * Between those limits, recycle enough segments to get us through to the
    2392             :      * estimated end of next checkpoint.
    2393             :      *
    2394             :      * To estimate where the next checkpoint will finish, assume that the
    2395             :      * system runs steadily consuming CheckPointDistanceEstimate bytes between
    2396             :      * every checkpoint.
    2397             :      */
    2398        3368 :     distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
    2399             :     /* add 10% for good measure. */
    2400        3368 :     distance *= 1.10;
    2401             : 
    2402        3368 :     recycleSegNo = (XLogSegNo) ceil(((double) lastredoptr + distance) /
    2403             :                                     wal_segment_size);    /* round up to a whole segment */
    2404             : 
    2405        3368 :     if (recycleSegNo < minSegNo)    /* enforce the lower limit first ... */
    2406        2370 :         recycleSegNo = minSegNo;
    2407        3368 :     if (recycleSegNo > maxSegNo)    /* ... then the upper limit */
    2408         766 :         recycleSegNo = maxSegNo;
    2409             : 
    2410        3368 :     return recycleSegNo;
    2411             : }
    2412             : 
    2413             : /*
    2414             :  * Check whether we've consumed enough xlog space that a checkpoint is needed.
    2415             :  *
    2416             :  * new_segno indicates a log file that has just been filled up (or read
    2417             :  * during recovery). We measure the distance from RedoRecPtr to new_segno
    2418             :  * and see if that exceeds CheckPointSegments.
    2419             :  *
    2420             :  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
    2421             :  */
    2422             : bool
    2423        9472 : XLogCheckpointNeeded(XLogSegNo new_segno)
    2424             : {
    2425             :     XLogSegNo   old_segno;      /* segment number computed from RedoRecPtr */
    2426             : 
    2427        9472 :     XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size);
    2428             : 
    2429        9472 :     if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))    /* at least CheckPointSegments - 1 segments ahead */
    2430        5836 :         return true;
    2431        3636 :     return false;
    2432             : }
    2433             : 
    2434             : /*
    2435             :  * Write and/or fsync the log at least as far as WriteRqst indicates.
    2436             :  *
    2437             :  * If flexible == true, we don't have to write as far as WriteRqst, but
    2438             :  * may stop at any convenient boundary (such as a cache or logfile boundary).
    2439             :  * This option allows us to avoid uselessly issuing multiple writes when a
    2440             :  * single one would do.
    2441             :  *
    2442             :  * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
    2443             :  * must be called before grabbing the lock, to make sure the data is ready to
    2444             :  * write.
    2445             :  */
    2446             : static void
    2447     4031198 : XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
    2448             : {
    2449             :     bool        ispartialpage;
    2450             :     bool        last_iteration;
    2451             :     bool        finishing_seg;
    2452             :     int         curridx;
    2453             :     int         npages;
    2454             :     int         startidx;
    2455             :     uint32      startoffset;
    2456             : 
    2457             :     /* We should always be inside a critical section here */
    2458             :     Assert(CritSectionCount > 0);
    2459             : 
    2460             :     /*
    2461             :      * Update local LogwrtResult (caller probably did this already, but...)
    2462             :      */
    2463     4031198 :     RefreshXLogWriteResult(LogwrtResult);
    2464             : 
    2465             :     /*
    2466             :      * Since successive pages in the xlog cache are consecutively allocated,
    2467             :      * we can usually gather multiple pages together and issue just one
    2468             :      * write() call.  npages is the number of pages we have determined can be
    2469             :      * written together; startidx is the cache block index of the first one,
    2470             :      * and startoffset is the file offset at which it should go. The latter
    2471             :      * two variables are only valid when npages > 0, but we must initialize
    2472             :      * all of them to keep the compiler quiet.
    2473             :      */
    2474     4031198 :     npages = 0;
    2475     4031198 :     startidx = 0;
    2476     4031198 :     startoffset = 0;
    2477             : 
    2478             :     /*
    2479             :      * Within the loop, curridx is the cache block index of the page to
    2480             :      * consider writing.  Begin at the buffer containing the next unwritten
    2481             :      * page, or last partially written page.
    2482             :      */
    2483     4031198 :     curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
    2484             : 
    2485     8792456 :     while (LogwrtResult.Write < WriteRqst.Write)
    2486             :     {
    2487             :         /*
    2488             :          * Make sure we're not ahead of the insert process.  This could happen
    2489             :          * if we're passed a bogus WriteRqst.Write that is past the end of the
    2490             :          * last page that's been initialized by AdvanceXLInsertBuffer.
    2491             :          */
    2492     5017502 :         XLogRecPtr  EndPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[curridx]);
    2493             : 
    2494     5017502 :         if (LogwrtResult.Write >= EndPtr)
    2495           0 :             elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
    2496             :                  LSN_FORMAT_ARGS(LogwrtResult.Write),
    2497             :                  LSN_FORMAT_ARGS(EndPtr));
    2498             : 
    2499             :         /* Advance LogwrtResult.Write to end of current buffer page */
    2500     5017502 :         LogwrtResult.Write = EndPtr;
    2501     5017502 :         ispartialpage = WriteRqst.Write < LogwrtResult.Write;
    2502             : 
    2503     5017502 :         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
    2504             :                              wal_segment_size))
    2505             :         {
    2506             :             /*
    2507             :              * Switch to new logfile segment.  We cannot have any pending
    2508             :              * pages here (since we dump what we have at segment end).
    2509             :              */
    2510             :             Assert(npages == 0);
    2511       26330 :             if (openLogFile >= 0)
    2512       12210 :                 XLogFileClose();
    2513       26330 :             XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
    2514             :                             wal_segment_size);
    2515       26330 :             openLogTLI = tli;
    2516             : 
    2517             :             /* create/use new log file */
    2518       26330 :             openLogFile = XLogFileInit(openLogSegNo, tli);
    2519       26330 :             ReserveExternalFD();
    2520             :         }
    2521             : 
    2522             :         /* Make sure we have the current logfile open */
    2523     5017502 :         if (openLogFile < 0)
    2524             :         {
    2525           0 :             XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
    2526             :                             wal_segment_size);
    2527           0 :             openLogTLI = tli;
    2528           0 :             openLogFile = XLogFileOpen(openLogSegNo, tli);
    2529           0 :             ReserveExternalFD();
    2530             :         }
    2531             : 
    2532             :         /* Add current page to the set of pending pages-to-dump */
    2533     5017502 :         if (npages == 0)
    2534             :         {
    2535             :             /* first of group */
    2536     4064632 :             startidx = curridx;
    2537     4064632 :             startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ,
    2538             :                                             wal_segment_size);
    2539             :         }
    2540     5017502 :         npages++;
    2541             : 
    2542             :         /*
    2543             :          * Dump the set if this will be the last loop iteration, or if we are
    2544             :          * at the last page of the cache area (since the next page won't be
    2545             :          * contiguous in memory), or if we are at the end of the logfile
    2546             :          * segment.
    2547             :          */
    2548     5017502 :         last_iteration = WriteRqst.Write <= LogwrtResult.Write;
    2549             : 
    2550     9786236 :         finishing_seg = !ispartialpage &&
    2551     4768734 :             (startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;
    2552             : 
    2553     5017502 :         if (last_iteration ||
    2554      988016 :             curridx == XLogCtl->XLogCacheBlck ||
    2555             :             finishing_seg)
    2556             :         {
    2557             :             char       *from;
    2558             :             Size        nbytes;
    2559             :             Size        nleft;
    2560             :             ssize_t     written;
    2561             :             instr_time  start;
    2562             : 
    2563             :             /* OK to write the page(s) */
    2564     4064632 :             from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
    2565     4064632 :             nbytes = npages * (Size) XLOG_BLCKSZ;
    2566     4064632 :             nleft = nbytes;
    2567             :             do
    2568             :             {
    2569     4064632 :                 errno = 0;
    2570             : 
    2571             :                 /*
    2572             :                  * Measure I/O timing to write WAL data, for pg_stat_io.
    2573             :                  */
    2574     4064632 :                 start = pgstat_prepare_io_time(track_wal_io_timing);
    2575             : 
    2576     4064632 :                 pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
    2577     4064632 :                 written = pg_pwrite(openLogFile, from, nleft, startoffset);
    2578     4064632 :                 pgstat_report_wait_end();
    2579             : 
    2580     4064632 :                 pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL,
    2581             :                                         IOOP_WRITE, start, 1, written);
    2582             : 
    2583     4064632 :                 if (written <= 0)
    2584             :                 {
    2585             :                     char        xlogfname[MAXFNAMELEN];
    2586             :                     int         save_errno;
    2587             : 
    2588           0 :                     if (errno == EINTR)
    2589           0 :                         continue;
    2590             : 
    2591           0 :                     save_errno = errno;
    2592           0 :                     XLogFileName(xlogfname, tli, openLogSegNo,
    2593             :                                  wal_segment_size);
    2594           0 :                     errno = save_errno;
    2595           0 :                     ereport(PANIC,
    2596             :                             (errcode_for_file_access(),
    2597             :                              errmsg("could not write to log file \"%s\" at offset %u, length %zu: %m",
    2598             :                                     xlogfname, startoffset, nleft)));
    2599             :                 }
    2600     4064632 :                 nleft -= written;
    2601     4064632 :                 from += written;
    2602     4064632 :                 startoffset += written;
    2603     4064632 :             } while (nleft > 0);
    2604             : 
    2605     4064632 :             npages = 0;
    2606             : 
    2607             :             /*
    2608             :              * If we just wrote the whole last page of a logfile segment,
    2609             :              * fsync the segment immediately.  This avoids having to go back
    2610             :              * and re-open prior segments when an fsync request comes along
    2611             :              * later. Doing it here ensures that one and only one backend will
    2612             :              * perform this fsync.
    2613             :              *
    2614             :              * This is also the right place to notify the Archiver that the
    2615             :              * segment is ready to copy to archival storage, and to update the
    2616             :              * timer for archive_timeout, and to signal for a checkpoint if
    2617             :              * too many logfile segments have been used since the last
    2618             :              * checkpoint.
    2619             :              */
    2620     4064632 :             if (finishing_seg)
    2621             :             {
    2622        3896 :                 issue_xlog_fsync(openLogFile, openLogSegNo, tli);
    2623             : 
    2624             :                 /* signal that we need to wakeup walsenders later */
    2625        3896 :                 WalSndWakeupRequest();
    2626             : 
    2627        3896 :                 LogwrtResult.Flush = LogwrtResult.Write;    /* end of page */
    2628             : 
    2629        3896 :                 if (XLogArchivingActive())
    2630         812 :                     XLogArchiveNotifySeg(openLogSegNo, tli);
    2631             : 
    2632        3896 :                 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
    2633        3896 :                 XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;
    2634             : 
    2635             :                 /*
    2636             :                  * Request a checkpoint if we've consumed too much xlog since
    2637             :                  * the last one.  For speed, we first check using the local
    2638             :                  * copy of RedoRecPtr, which might be out of date; if it looks
    2639             :                  * like a checkpoint is needed, forcibly update RedoRecPtr and
    2640             :                  * recheck.
    2641             :                  */
    2642        3896 :                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
    2643             :                 {
    2644         496 :                     (void) GetRedoRecPtr();
    2645         496 :                     if (XLogCheckpointNeeded(openLogSegNo))
    2646         404 :                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
    2647             :                 }
    2648             :             }
    2649             :         }
    2650             : 
    2651     5017502 :         if (ispartialpage)
    2652             :         {
    2653             :             /* Only asked to write a partial page */
    2654      248768 :             LogwrtResult.Write = WriteRqst.Write;
    2655      248768 :             break;
    2656             :         }
    2657     4768734 :         curridx = NextBufIdx(curridx);
    2658             : 
    2659             :         /* If flexible, break out of loop as soon as we wrote something */
    2660     4768734 :         if (flexible && npages == 0)
    2661        7476 :             break;
    2662             :     }
    2663             : 
    2664             :     Assert(npages == 0);
    2665             : 
    2666             :     /*
    2667             :      * If asked to flush, do so
    2668             :      */
    2669     4031198 :     if (LogwrtResult.Flush < WriteRqst.Flush &&
    2670      263632 :         LogwrtResult.Flush < LogwrtResult.Write)
    2671             :     {
    2672             :         /*
    2673             :          * Could get here without iterating above loop, in which case we might
    2674             :          * have no open file or the wrong one.  However, we do not need to
    2675             :          * fsync more than one file.
    2676             :          */
    2677      263496 :         if (wal_sync_method != WAL_SYNC_METHOD_OPEN &&
    2678      263496 :             wal_sync_method != WAL_SYNC_METHOD_OPEN_DSYNC)
    2679             :         {
    2680      263496 :             if (openLogFile >= 0 &&
    2681      263480 :                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
    2682             :                                  wal_segment_size))
    2683          50 :                 XLogFileClose();
    2684      263496 :             if (openLogFile < 0)
    2685             :             {
    2686          66 :                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
    2687             :                                 wal_segment_size);
    2688          66 :                 openLogTLI = tli;
    2689          66 :                 openLogFile = XLogFileOpen(openLogSegNo, tli);
    2690          66 :                 ReserveExternalFD();
    2691             :             }
    2692             : 
    2693      263496 :             issue_xlog_fsync(openLogFile, openLogSegNo, tli);
    2694             :         }
    2695             : 
    2696             :         /* signal that we need to wakeup walsenders later */
    2697      263496 :         WalSndWakeupRequest();
    2698             : 
    2699      263496 :         LogwrtResult.Flush = LogwrtResult.Write;
    2700             :     }
    2701             : 
    2702             :     /*
    2703             :      * Update shared-memory status
    2704             :      *
    2705             :      * We make sure that the shared 'request' values do not fall behind the
    2706             :      * 'result' values.  This is not absolutely essential, but it saves some
    2707             :      * code in a couple of places.
    2708             :      */
    2709     4031198 :     SpinLockAcquire(&XLogCtl->info_lck);
    2710     4031198 :     if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
    2711      232848 :         XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
    2712     4031198 :     if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
    2713      266468 :         XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
    2714     4031198 :     SpinLockRelease(&XLogCtl->info_lck);
    2715             : 
    2716             :     /*
    2717             :      * We write Write first, bar, then Flush.  When reading, the opposite must
    2718             :      * be done (with a matching barrier in between), so that we always see a
    2719             :      * Flush value that trails behind the Write value seen.
    2720             :      */
    2721     4031198 :     pg_atomic_write_u64(&XLogCtl->logWriteResult, LogwrtResult.Write);
    2722     4031198 :     pg_write_barrier();
    2723     4031198 :     pg_atomic_write_u64(&XLogCtl->logFlushResult, LogwrtResult.Flush);
    2724             : 
    2725             : #ifdef USE_ASSERT_CHECKING
    2726             :     {
    2727             :         XLogRecPtr  Flush;
    2728             :         XLogRecPtr  Write;
    2729             :         XLogRecPtr  Insert;
    2730             : 
    2731             :         Flush = pg_atomic_read_u64(&XLogCtl->logFlushResult);
    2732             :         pg_read_barrier();
    2733             :         Write = pg_atomic_read_u64(&XLogCtl->logWriteResult);
    2734             :         pg_read_barrier();
    2735             :         Insert = pg_atomic_read_u64(&XLogCtl->logInsertResult);
    2736             : 
    2737             :         /* WAL written to disk is always ahead of WAL flushed */
    2738             :         Assert(Write >= Flush);
    2739             : 
    2740             :         /* WAL inserted to buffers is always ahead of WAL written */
    2741             :         Assert(Insert >= Write);
    2742             :     }
    2743             : #endif
    2744     4031198 : }
    2745             : 
    2746             : /*
    2747             :  * Record the LSN for an asynchronous transaction commit/abort in shared memory
    2748             :  * (the stored value only ever advances) and nudge the WALWriter, by setting its latch, if there is work for it to do.
    2749             :  * (This should not be called for synchronous commits.)
    2750             :  */
    2751             : void
    2752       97964 : XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
    2753             : {
    2754       97964 :     XLogRecPtr  WriteRqstPtr = asyncXactLSN;
    2755             :     bool        sleeping;       /* WalWriterSleeping as read under info_lck */
    2756       97964 :     bool        wakeup = false;
    2757             :     XLogRecPtr  prevAsyncXactLSN;   /* shared asyncXactLSN before our update */
    2758             : 
    2759       97964 :     SpinLockAcquire(&XLogCtl->info_lck);
    2760       97964 :     sleeping = XLogCtl->WalWriterSleeping;
    2761       97964 :     prevAsyncXactLSN = XLogCtl->asyncXactLSN;
    2762       97964 :     if (XLogCtl->asyncXactLSN < asyncXactLSN)
    2763       96980 :         XLogCtl->asyncXactLSN = asyncXactLSN;   /* never move it backwards */
    2764       97964 :     SpinLockRelease(&XLogCtl->info_lck);
    2765             : 
    2766             :     /*
    2767             :      * If somebody else already called this function with a more aggressive
    2768             :      * LSN, they will have done what we needed (and perhaps more).
    2769             :      */
    2770       97964 :     if (asyncXactLSN <= prevAsyncXactLSN)
    2771         984 :         return;
    2772             : 
    2773             :     /*
    2774             :      * If the WALWriter is sleeping, kick it to make it come out of low-power
    2775             :      * mode, so that this async commit will reach disk within the expected
    2776             :      * amount of time.  Otherwise, determine whether it has enough WAL
    2777             :      * available to flush, the same way that XLogBackgroundFlush() does.
    2778             :      */
    2779       96980 :     if (sleeping)
    2780          52 :         wakeup = true;
    2781             :     else
    2782             :     {
    2783             :         int         flushblocks;    /* distance in XLOG_BLCKSZ-sized block numbers */
    2784             : 
    2785       96928 :         RefreshXLogWriteResult(LogwrtResult);   /* presumably reloads the local LogwrtResult copy -- confirm against the macro */
    2786             : 
    2787       96928 :         flushblocks =
    2788       96928 :             WriteRqstPtr / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
    2789             : 
    2790       96928 :         if (WalWriterFlushAfter == 0 || flushblocks >= WalWriterFlushAfter) /* a setting of 0 always wakes */
    2791        7444 :             wakeup = true;
    2792             :     }
    2793             : 
    2794       96980 :     if (wakeup)
    2795             :     {
    2796        7496 :         volatile PROC_HDR *procglobal = ProcGlobal;
    2797        7496 :         ProcNumber  walwriterProc = procglobal->walwriterProc;
    2798             : 
    2799        7496 :         if (walwriterProc != INVALID_PROC_NUMBER)   /* no valid walwriter proc number: nobody to wake */
    2800         444 :             SetLatch(&GetPGProcByNumber(walwriterProc)->procLatch);
    2801             :     }
    2802             : }
    2803             : 
    2804             : /*
    2805             :  * Record the LSN up to which we can remove WAL because it's not required by
    2806             :  * any replication slot.  The value is stored unconditionally (no check that it advances), under info_lck.
    2807             :  */
    2808             : void
    2809       48400 : XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
    2810             : {
    2811       48400 :     SpinLockAcquire(&XLogCtl->info_lck);
    2812       48400 :     XLogCtl->replicationSlotMinLSN = lsn;   /* overwrites whatever was there */
    2813       48400 :     SpinLockRelease(&XLogCtl->info_lck);
    2814       48400 : }
    2815             : 
    2816             : 
    2817             : /*
    2818             :  * Return the oldest LSN we must retain to satisfy the needs of some
    2819             :  * replication slot, i.e. the current XLogCtl->replicationSlotMinLSN, read under info_lck.
    2820             :  */
    2821             : static XLogRecPtr
    2822        4300 : XLogGetReplicationSlotMinimumLSN(void)
    2823             : {
    2824             :     XLogRecPtr  retval;     /* copy taken while holding the spinlock */
    2825             : 
    2826        4300 :     SpinLockAcquire(&XLogCtl->info_lck);
    2827        4300 :     retval = XLogCtl->replicationSlotMinLSN;
    2828        4300 :     SpinLockRelease(&XLogCtl->info_lck);
    2829             : 
    2830        4300 :     return retval;
    2831             : }
    2832             : 
    2833             : /*
    2834             :  * Advance minRecoveryPoint in control file.
    2835             :  *
    2836             :  * If we crash during recovery, we must reach this point again before the
    2837             :  * database is consistent.
    2838             :  *
    2839             :  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
    2840             :  * is only updated if it's not already greater than or equal to 'lsn'.  ControlFileLock is taken only when the quick local checks cannot rule out an update.
    2841             :  */
    2842             : static void
    2843      212894 : UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
    2844             : {
    2845             :     /* Quick check using our local copy of the variable; no lock is needed for this */
    2846      212894 :     if (!updateMinRecoveryPoint || (!force && lsn <= LocalMinRecoveryPoint))
    2847      199746 :         return;
    2848             : 
    2849             :     /*
    2850             :      * An invalid minRecoveryPoint means that we need to recover all the WAL,
    2851             :      * i.e., we're doing crash recovery.  We never modify the control file's
    2852             :      * value in that case, so we can short-circuit future checks here too. The
    2853             :      * local values of minRecoveryPoint and minRecoveryPointTLI should not be
    2854             :      * updated until crash recovery finishes.  We only do this for the startup
    2855             :      * process as it should not update its own reference of minRecoveryPoint
    2856             :      * until it has finished crash recovery to make sure that all WAL
    2857             :      * available is replayed in this case.  This also saves from extra locks
    2858             :      * taken on the control file from the startup process.
    2859             :      */
    2860       13148 :     if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery)
    2861             :     {
    2862          60 :         updateMinRecoveryPoint = false;     /* later calls return at the quick check above */
    2863          60 :         return;
    2864             :     }
    2865             : 
    2866       13088 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    2867             : 
    2868             :     /* update local copy from the control file, now that we hold the lock */
    2869       13088 :     LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    2870       13088 :     LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    2871             : 
    2872       13088 :     if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint))
    2873           6 :         updateMinRecoveryPoint = false;     /* control file value is invalid too: stop trying */
    2874       13082 :     else if (force || LocalMinRecoveryPoint < lsn)
    2875             :     {
    2876             :         XLogRecPtr  newMinRecoveryPoint;
    2877             :         TimeLineID  newMinRecoveryPointTLI;
    2878             : 
    2879             :         /*
    2880             :          * To avoid having to update the control file too often, we update it
    2881             :          * all the way to the last record being replayed, even though 'lsn'
    2882             :          * would suffice for correctness.  This also allows the 'force' case
    2883             :          * to not need a valid 'lsn' value.
    2884             :          *
    2885             :          * Another important reason for doing it this way is that the passed
    2886             :          * 'lsn' value could be bogus, i.e., past the end of available WAL, if
    2887             :          * the caller got it from a corrupted heap page.  Accepting such a
    2888             :          * value as the min recovery point would prevent us from coming up at
    2889             :          * all.  Instead, we just log a warning and continue with recovery.
    2890             :          * (See also the comments about corrupt LSNs in XLogFlush.)
    2891             :          */
    2892       10570 :         newMinRecoveryPoint = GetCurrentReplayRecPtr(&newMinRecoveryPointTLI);
    2893       10570 :         if (!force && newMinRecoveryPoint < lsn)
    2894           0 :             elog(WARNING,
    2895             :                  "xlog min recovery request %X/%X is past current point %X/%X",
    2896             :                  LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(newMinRecoveryPoint));
    2897             : 
    2898             :         /* update control file, but only if that moves minRecoveryPoint forward */
    2899       10570 :         if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
    2900             :         {
    2901        9872 :             ControlFile->minRecoveryPoint = newMinRecoveryPoint;
    2902        9872 :             ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
    2903        9872 :             UpdateControlFile();
    2904        9872 :             LocalMinRecoveryPoint = newMinRecoveryPoint;    /* keep local copy in step with the control file */
    2905        9872 :             LocalMinRecoveryPointTLI = newMinRecoveryPointTLI;
    2906             : 
    2907        9872 :             ereport(DEBUG2,
    2908             :                     (errmsg_internal("updated min recovery point to %X/%X on timeline %u",
    2909             :                                      LSN_FORMAT_ARGS(newMinRecoveryPoint),
    2910             :                                      newMinRecoveryPointTLI)));
    2911             :         }
    2912             :     }
    2913       13088 :     LWLockRelease(ControlFileLock);
    2914             : }
    2915             : 
    2916             : /*
    2917             :  * Ensure that all XLOG data through the given position ('record') is flushed to disk.
    2918             :  *
    2919             :  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
    2920             :  * already held, and we try to avoid acquiring it if possible.  An ERROR is raised at the end if 'record' is still not flushed.
    2921             :  */
    2922             : void
    2923     1369768 : XLogFlush(XLogRecPtr record)
    2924             : {
    2925             :     XLogRecPtr  WriteRqstPtr;
    2926             :     XLogwrtRqst WriteRqst;
    2927     1369768 :     TimeLineID  insertTLI = XLogCtl->InsertTimeLineID;
    2928             : 
    2929             :     /*
    2930             :      * During REDO, we are reading not writing WAL.  Therefore, instead of
    2931             :      * trying to flush the WAL, we should update minRecoveryPoint instead. We
    2932             :      * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
    2933             :      * to act this way too, and because when it tries to write the
    2934             :      * end-of-recovery checkpoint, it should indeed flush.
    2935             :      */
    2936     1369768 :     if (!XLogInsertAllowed())
    2937             :     {
    2938      211972 :         UpdateMinRecoveryPoint(record, false);  /* not forced: only acts if 'record' is ahead */
    2939     1089848 :         return;
    2940             :     }
    2941             : 
    2942             :     /* Quick exit if already known flushed, judging by our local LogwrtResult copy */
    2943     1157796 :     if (record <= LogwrtResult.Flush)
    2944      877876 :         return;
    2945             : 
    2946             : #ifdef WAL_DEBUG
    2947             :     if (XLOG_DEBUG)
    2948             :         elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
    2949             :              LSN_FORMAT_ARGS(record),
    2950             :              LSN_FORMAT_ARGS(LogwrtResult.Write),
    2951             :              LSN_FORMAT_ARGS(LogwrtResult.Flush));
    2952             : #endif
    2953             : 
    2954      279920 :     START_CRIT_SECTION();
    2955             : 
    2956             :     /*
    2957             :      * Since fsync is usually a horribly expensive operation, we try to
    2958             :      * piggyback as much data as we can on each fsync: if we see any more data
    2959             :      * entered into the xlog buffer, we'll write and fsync that too, so that
    2960             :      * the final value of LogwrtResult.Flush is as large as possible. This
    2961             :      * gives us some chance of avoiding another fsync immediately after.
    2962             :      */
    2963             : 
    2964             :     /* initialize to given target; may increase below */
    2965      279920 :     WriteRqstPtr = record;
    2966             : 
    2967             :     /*
    2968             :      * Now wait until we get the write lock, or someone else does the flush
    2969             :      * for us.
    2970             :      */
    2971             :     for (;;)
    2972        6690 :     {
    2973             :         XLogRecPtr  insertpos;
    2974             : 
    2975             :         /* done already? */
    2976      286610 :         RefreshXLogWriteResult(LogwrtResult);
    2977      286610 :         if (record <= LogwrtResult.Flush)
    2978       20132 :             break;
    2979             : 
    2980             :         /*
    2981             :          * Before actually performing the write, wait for all in-flight
    2982             :          * insertions to the pages we're about to write to finish.
    2983             :          */
    2984      266478 :         SpinLockAcquire(&XLogCtl->info_lck);
    2985      266478 :         if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
    2986       16082 :             WriteRqstPtr = XLogCtl->LogwrtRqst.Write;   /* raise our target to the shared write request */
    2987      266478 :         SpinLockRelease(&XLogCtl->info_lck);
    2988      266478 :         insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
    2989             : 
    2990             :         /*
    2991             :          * Try to get the write lock. If we can't get it immediately, wait
    2992             :          * until it's released, and recheck if we still need to do the flush
    2993             :          * or if the backend that held the lock did it for us already. This
    2994             :          * helps to maintain a good rate of group committing when the system
    2995             :          * is bottlenecked by the speed of fsyncing.
    2996             :          */
    2997      266478 :         if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
    2998             :         {
    2999             :             /*
    3000             :              * The lock is now free, but we didn't acquire it yet. Before we
    3001             :              * do, loop back to check if someone else flushed the record for
    3002             :              * us already.
    3003             :              */
    3004        6690 :             continue;
    3005             :         }
    3006             : 
    3007             :         /* Got the lock; recheck whether request is satisfied */
    3008      259788 :         RefreshXLogWriteResult(LogwrtResult);
    3009      259788 :         if (record <= LogwrtResult.Flush)
    3010             :         {
    3011        3892 :             LWLockRelease(WALWriteLock);
    3012        3892 :             break;
    3013             :         }
    3014             : 
    3015             :         /*
    3016             :          * Sleep before flush! By adding a delay here, we may give further
    3017             :          * backends the opportunity to join the backlog of group commit
    3018             :          * followers; this can significantly improve transaction throughput,
    3019             :          * at the risk of increasing transaction latency.
    3020             :          *
    3021             :          * We do not sleep if enableFsync is not turned on, nor if there are
    3022             :          * fewer than CommitSiblings other backends with active transactions.
    3023             :          */
    3024      255896 :         if (CommitDelay > 0 && enableFsync &&
    3025           0 :             MinimumActiveBackends(CommitSiblings))
    3026             :         {
    3027           0 :             pg_usleep(CommitDelay);
    3028             : 
    3029             :             /*
    3030             :              * Re-check how far we can now flush the WAL. It's generally not
    3031             :              * safe to call WaitXLogInsertionsToFinish while holding
    3032             :              * WALWriteLock, because an in-progress insertion might need to
    3033             :              * also grab WALWriteLock to make progress. But we know that all
    3034             :              * the insertions up to insertpos have already finished, because
    3035             :              * that's what the earlier WaitXLogInsertionsToFinish() returned.
    3036             :              * We're only calling it again to allow insertpos to be moved
    3037             :              * further forward, not to actually wait for anyone.
    3038             :              */
    3039           0 :             insertpos = WaitXLogInsertionsToFinish(insertpos);
    3040             :         }
    3041             : 
    3042             :         /* try to write/flush later additions to XLOG as well */
    3043      255896 :         WriteRqst.Write = insertpos;
    3044      255896 :         WriteRqst.Flush = insertpos;
    3045             : 
    3046      255896 :         XLogWrite(WriteRqst, insertTLI, false); /* third argument is XLogWrite's 'flexible' flag */
    3047             : 
    3048      255896 :         LWLockRelease(WALWriteLock);
    3049             :         /* done */
    3050      255896 :         break;
    3051             :     }
    3052             : 
    3053      279920 :     END_CRIT_SECTION();
    3054             : 
    3055             :     /* wake up walsenders now that we've released heavily contended locks */
    3056      279920 :     WalSndWakeupProcessRequests(true, !RecoveryInProgress());
    3057             : 
    3058             :     /*
    3059             :      * If we still haven't flushed to the request point then we have a
    3060             :      * problem; most likely, the requested flush point is past end of XLOG.
    3061             :      * This has been seen to occur when a disk page has a corrupted LSN.
    3062             :      *
    3063             :      * Formerly we treated this as a PANIC condition, but that hurts the
    3064             :      * system's robustness rather than helping it: we do not want to take down
    3065             :      * the whole system due to corruption on one data page.  In particular, if
    3066             :      * the bad page is encountered again during recovery then we would be
    3067             :      * unable to restart the database at all!  (This scenario actually
    3068             :      * happened in the field several times with 7.1 releases.)  As of 8.4, bad
    3069             :      * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
    3070             :      * the only time we can reach here during recovery is while flushing the
    3071             :      * end-of-recovery checkpoint record, and we don't expect that to have a
    3072             :      * bad LSN.
    3073             :      *
    3074             :      * Note that for calls from xact.c, the ERROR will be promoted to PANIC
    3075             :      * since xact.c calls this routine inside a critical section.  However,
    3076             :      * calls from bufmgr.c are not within critical sections and so we will not
    3077             :      * force a restart for a bad LSN on a data page.
    3078             :      */
    3079      279920 :     if (LogwrtResult.Flush < record)
    3080           0 :         elog(ERROR,
    3081             :              "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
    3082             :              LSN_FORMAT_ARGS(record),
    3083             :              LSN_FORMAT_ARGS(LogwrtResult.Flush));
    3084             : }
    3085             : 
    3086             : /*
    3087             :  * Write & flush xlog, but without specifying exactly where to.
    3088             :  *
    3089             :  * We normally write only completed blocks; but if there is nothing to do on
    3090             :  * that basis, we check for unwritten async commits in the current incomplete
    3091             :  * block, and write through the latest one of those.  Thus, if async commits
    3092             :  * are not being used, we will write complete blocks only.
    3093             :  *
    3094             :  * If, based on the above, there's anything to write we do so immediately. But
    3095             :  * to avoid calling fsync, fdatasync et al. at a rate that'd impact
    3096             :  * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
    3097             :  * more than wal_writer_flush_after unflushed blocks.
    3098             :  *
    3099             :  * We can guarantee that async commits reach disk after at most three
    3100             :  * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
    3101             :  * to write "flexibly", meaning it can stop at the end of the buffer ring;
    3102             :  * this makes a difference only with very high load or long wal_writer_delay,
    3103             :  * but imposes one extra cycle for the worst case for async commits.)
    3104             :  *
    3105             :  * This routine is invoked periodically by the background walwriter process.
    3106             :  *
    3107             :  * Returns true if there was any work to do, even if we skipped flushing due
    3108             :  * to wal_writer_delay/wal_writer_flush_after.
    3109             :  */
    3110             : bool
    3111       32520 : XLogBackgroundFlush(void)
    3112             : {
    3113             :     XLogwrtRqst WriteRqst;
    3114       32520 :     bool        flexible = true;
    3115             :     static TimestampTz lastflush;
    3116             :     TimestampTz now;
    3117             :     int         flushblocks;
    3118             :     TimeLineID  insertTLI;
    3119             : 
    3120             :     /* XLOG doesn't need flushing during recovery */
    3121       32520 :     if (RecoveryInProgress())
    3122        1136 :         return false;
    3123             : 
    3124             :     /*
    3125             :      * Since we're not in recovery, InsertTimeLineID is set and can't change,
    3126             :      * so we can read it without a lock.
    3127             :      */
    3128       31384 :     insertTLI = XLogCtl->InsertTimeLineID;
    3129             : 
    3130             :     /* read updated LogwrtRqst */
    3131       31384 :     SpinLockAcquire(&XLogCtl->info_lck);
    3132       31384 :     WriteRqst = XLogCtl->LogwrtRqst;
    3133       31384 :     SpinLockRelease(&XLogCtl->info_lck);
    3134             : 
    3135             :     /* back off to last completed page boundary */
    3136       31384 :     WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
    3137             : 
    3138             :     /* if we have already flushed that far, consider async commit records */
    3139       31384 :     RefreshXLogWriteResult(LogwrtResult);
    3140       31384 :     if (WriteRqst.Write <= LogwrtResult.Flush)
    3141             :     {
    3142       23664 :         SpinLockAcquire(&XLogCtl->info_lck);
    3143       23664 :         WriteRqst.Write = XLogCtl->asyncXactLSN;
    3144       23664 :         SpinLockRelease(&XLogCtl->info_lck);
    3145       23664 :         flexible = false;       /* ensure it all gets written */
    3146             :     }
    3147             : 
    3148             :     /*
    3149             :      * If already known flushed, we're done. Just need to check if we are
    3150             :      * holding an open file handle to a logfile that's no longer in use,
    3151             :      * preventing the file from being deleted.
    3152             :      */
    3153       31384 :     if (WriteRqst.Write <= LogwrtResult.Flush)
    3154             :     {
    3155       22026 :         if (openLogFile >= 0)
    3156             :         {
    3157       15856 :             if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
    3158             :                                  wal_segment_size))
    3159             :             {
    3160         346 :                 XLogFileClose();
    3161             :             }
    3162             :         }
    3163       22026 :         return false;
    3164             :     }
    3165             : 
    3166             :     /*
    3167             :      * Determine how far to flush WAL, based on the wal_writer_delay and
    3168             :      * wal_writer_flush_after GUCs.
    3169             :      *
    3170             :      * Note that XLogSetAsyncXactLSN() performs similar calculation based on
    3171             :      * wal_writer_flush_after, to decide when to wake us up.  Make sure the
    3172             :      * logic is the same in both places if you change this.
    3173             :      */
    3174        9358 :     now = GetCurrentTimestamp();
    3175        9358 :     flushblocks =
    3176        9358 :         WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
    3177             : 
    3178        9358 :     if (WalWriterFlushAfter == 0 || lastflush == 0)
    3179             :     {
    3180             :         /* first call, or block based limits disabled */
    3181         520 :         WriteRqst.Flush = WriteRqst.Write;
    3182         520 :         lastflush = now;
    3183             :     }
    3184        8838 :     else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
    3185             :     {
    3186             :         /*
    3187             :          * Flush the writes at least every WalWriterDelay ms. This is
    3188             :          * important to bound the amount of time it takes for an asynchronous
    3189             :          * commit to hit disk.
    3190             :          */
    3191        8496 :         WriteRqst.Flush = WriteRqst.Write;
    3192        8496 :         lastflush = now;
    3193             :     }
    3194         342 :     else if (flushblocks >= WalWriterFlushAfter)
    3195             :     {
    3196             :         /* exceeded wal_writer_flush_after blocks, flush */
    3197         304 :         WriteRqst.Flush = WriteRqst.Write;
    3198         304 :         lastflush = now;
    3199             :     }
    3200             :     else
    3201             :     {
    3202             :         /* no flushing, this time round */
    3203          38 :         WriteRqst.Flush = 0;
    3204             :     }
    3205             : 
    3206             : #ifdef WAL_DEBUG
    3207             :     if (XLOG_DEBUG)
    3208             :         elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
    3209             :              LSN_FORMAT_ARGS(WriteRqst.Write),
    3210             :              LSN_FORMAT_ARGS(WriteRqst.Flush),
    3211             :              LSN_FORMAT_ARGS(LogwrtResult.Write),
    3212             :              LSN_FORMAT_ARGS(LogwrtResult.Flush));
    3213             : #endif
    3214             : 
    3215        9358 :     START_CRIT_SECTION();
    3216             : 
    3217             :     /* now wait for any in-progress insertions to finish and get write lock */
    3218        9358 :     WaitXLogInsertionsToFinish(WriteRqst.Write);
    3219        9358 :     LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
    3220        9358 :     RefreshXLogWriteResult(LogwrtResult);
    3221        9358 :     if (WriteRqst.Write > LogwrtResult.Write ||
    3222         282 :         WriteRqst.Flush > LogwrtResult.Flush)
    3223             :     {
    3224        9132 :         XLogWrite(WriteRqst, insertTLI, flexible);
    3225             :     }
    3226        9358 :     LWLockRelease(WALWriteLock);
    3227             : 
    3228        9358 :     END_CRIT_SECTION();
    3229             : 
    3230             :     /* wake up walsenders now that we've released heavily contended locks */
    3231        9358 :     WalSndWakeupProcessRequests(true, !RecoveryInProgress());
    3232             : 
    3233             :     /*
    3234             :      * Great, done. To take some work off the critical path, try to initialize
    3235             :      * as many of the no-longer-needed WAL buffers for future use as we can.
    3236             :      */
    3237        9358 :     AdvanceXLInsertBuffer(InvalidXLogRecPtr, insertTLI, true);
    3238             : 
    3239             :     /*
    3240             :      * If we determined that we need to write data, but somebody else
    3241             :      * wrote/flushed already, it should be considered as being active, to
    3242             :      * avoid hibernating too early.
    3243             :      */
    3244        9358 :     return true;
    3245             : }
    3246             : 
    3247             : /*
    3248             :  * Test whether XLOG data has been flushed up to (at least) the given position.
    3249             :  *
                     :  * record: the WAL position to test.
                     :  *
    3250             :  * Returns true if a flush is still needed.  (It may be that someone else
    3251             :  * is already in process of flushing that far, however.)
                     :  *
                     :  * In both the recovery and the normal case, the locally cached position is
                     :  * tested first and shared state is consulted only if that test fails.  In
                     :  * recovery the answer can be a conservative "true" when ControlFileLock
                     :  * cannot be acquired immediately.
    3252             :  */
    3253             : bool
    3254    18219862 : XLogNeedsFlush(XLogRecPtr record)
    3255             : {
    3256             :     /*
    3257             :      * During recovery, we don't flush WAL but update minRecoveryPoint
    3258             :      * instead. So "needs flush" is taken to mean whether minRecoveryPoint
    3259             :      * would need to be updated.
    3260             :      */
    3261    18219862 :     if (RecoveryInProgress())
    3262             :     {
    3263             :         /*
    3264             :          * An invalid minRecoveryPoint means that we need to recover all the
    3265             :          * WAL, i.e., we're doing crash recovery.  We never modify the control
    3266             :          * file's value in that case, so we can short-circuit future checks
    3267             :          * here too.  This triggers a quick exit path for the startup process,
    3268             :          * which cannot update its local copy of minRecoveryPoint as long as
    3269             :          * it has not replayed all WAL available when doing crash recovery.
    3270             :          */
    3271     1241798 :         if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery)
    3272           0 :             updateMinRecoveryPoint = false;
    3273             : 
    3274             :         /* Quick exit if already known to be updated or cannot be updated */
    3275     1241798 :         if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
    3276     1218130 :             return false;
    3277             : 
    3278             :         /*
    3279             :          * Update local copy of minRecoveryPoint. But if the lock is busy,
    3280             :          * just return a conservative guess.
                     :          *
                     :          * The lock is only taken in shared mode and only conditionally, so
                     :          * this function never blocks on ControlFileLock.
    3281             :          */
    3282       23668 :         if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
    3283           0 :             return true;
    3284       23668 :         LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    3285       23668 :         LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    3286       23668 :         LWLockRelease(ControlFileLock);
    3287             : 
    3288             :         /*
    3289             :          * Check minRecoveryPoint for any other process than the startup
    3290             :          * process doing crash recovery, which should not update the control
    3291             :          * file value if crash recovery is still running.
    3292             :          */
    3293       23668 :         if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint))
    3294           0 :             updateMinRecoveryPoint = false;
    3295             : 
    3296             :         /* check again, now against the freshly copied value */
    3297       23668 :         if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
    3298         160 :             return false;
    3299             :         else
    3300       23508 :             return true;
    3301             :     }
    3302             : 
    3303             :     /* Quick exit if already known flushed (per our cached LogwrtResult) */
    3304    16978064 :     if (record <= LogwrtResult.Flush)
    3305    16536106 :         return false;
    3306             : 
    3307             :     /* read LogwrtResult and update local state */
    3308      441958 :     RefreshXLogWriteResult(LogwrtResult);
    3309             : 
    3310             :     /* check again, now against the refreshed flush position */
    3311      441958 :     if (record <= LogwrtResult.Flush)
    3312        6902 :         return false;
    3313             : 
    3314      435056 :     return true;
    3315             : }
    3316             : 
    3317             : /*
    3318             :  * Try to make a given XLOG file segment exist.
    3319             :  *
    3320             :  * logsegno: identify segment.
    3321             :  *
                     :  * logtli: timeline the segment belongs to; must be nonzero (asserted).
                     :  *
    3322             :  * *added: on return, true if this call raised the number of extant segments.
    3323             :  *
    3324             :  * path: on return, this char[MAXPGPATH] has the path to the logsegno file.
    3325             :  *
    3326             :  * Returns -1 or FD of opened file.  A -1 here is not an error; a caller
    3327             :  * wanting an open segment should attempt to open "path", which usually will
    3328             :  * succeed.  (This is weird, but it's efficient for the callers.)
                     :  *
                     :  * An FD is returned only when the segment already existed; whenever a new
                     :  * file had to be created and filled, the result is -1.  Failures while
                     :  * opening, writing, syncing or closing are reported with ereport(ERROR).
    3329             :  */
    3330             : static int
    3331       28538 : XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
    3332             :                      bool *added, char *path)
    3333             : {
    3334             :     char        tmppath[MAXPGPATH];
    3335             :     XLogSegNo   installed_segno;
    3336             :     XLogSegNo   max_segno;
    3337             :     int         fd;
    3338             :     int         save_errno;
    3339       28538 :     int         open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY;
    3340             :     instr_time  io_start;
    3341             : 
    3342             :     Assert(logtli != 0);
    3343             : 
    3344       28538 :     XLogFilePath(path, logtli, logsegno, wal_segment_size);
    3345             : 
    3346             :     /*
    3347             :      * Try to use existent file (checkpoint maker may have created it already)
                     :      *
                     :      * Only ENOENT is tolerated below; any other open failure is an ERROR.
    3348             :      */
    3349       28538 :     *added = false;
    3350       28538 :     fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
    3351       28538 :                        get_sync_bit(wal_sync_method));
    3352       28538 :     if (fd < 0)
    3353             :     {
    3354        2920 :         if (errno != ENOENT)
    3355           0 :             ereport(ERROR,
    3356             :                     (errcode_for_file_access(),
    3357             :                      errmsg("could not open file \"%s\": %m", path)));
    3358             :     }
    3359             :     else
    3360       25618 :         return fd;
    3361             : 
    3362             :     /*
    3363             :      * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
    3364             :      * another process is doing the same thing.  If so, we will end up
    3365             :      * pre-creating an extra log segment.  That seems OK, and better than
    3366             :      * holding the lock throughout this lengthy process.
                     :      *
                     :      * The new segment is built under a per-process temporary name
                     :      * (xlogtemp.<pid> in XLOGDIR); any leftover file of that name is
                     :      * unlinked first, since the file is opened with O_CREAT | O_EXCL.
    3367             :      */
    3368        2920 :     elog(DEBUG2, "creating and filling new WAL file");
    3369             : 
    3370        2920 :     snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
    3371             : 
    3372        2920 :     unlink(tmppath);
    3373             : 
    3374        2920 :     if (io_direct_flags & IO_DIRECT_WAL_INIT)
    3375           0 :         open_flags |= PG_O_DIRECT;
    3376             : 
    3377             :     /* do not use get_sync_bit() here --- want to fsync only at end of fill */
    3378        2920 :     fd = BasicOpenFile(tmppath, open_flags);
    3379        2920 :     if (fd < 0)
    3380           0 :         ereport(ERROR,
    3381             :                 (errcode_for_file_access(),
    3382             :                  errmsg("could not create file \"%s\": %m", tmppath)));
    3383             : 
    3384             :     /* Measure I/O timing when initializing segment */
    3385        2920 :     io_start = pgstat_prepare_io_time(track_wal_io_timing);
    3386             : 
    3387        2920 :     pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
    3388        2920 :     save_errno = 0;
    3389        2920 :     if (wal_init_zero)
    3390             :     {
    3391             :         ssize_t     rc;
    3392             : 
    3393             :         /*
    3394             :          * Zero-fill the file.  With this setting, we do this the hard way to
    3395             :          * ensure that all the file space has really been allocated.  On
    3396             :          * platforms that allow "holes" in files, just seeking to the end
    3397             :          * doesn't allocate intermediate space.  This way, we know that we
    3398             :          * have all the space and (after the fsync below) that all the
    3399             :          * indirect blocks are down on disk.  Therefore, fdatasync(2) or
    3400             :          * O_DSYNC will be sufficient to sync future writes to the log file.
    3401             :          */
    3402        2920 :         rc = pg_pwrite_zeros(fd, wal_segment_size, 0);
    3403             : 
    3404        2920 :         if (rc < 0)
    3405           0 :             save_errno = errno;
    3406             :     }
    3407             :     else
    3408             :     {
    3409             :         /*
    3410             :          * Otherwise, seeking to the end and writing a solitary byte is
    3411             :          * enough.
    3412             :          */
    3413           0 :         errno = 0;
    3414           0 :         if (pg_pwrite(fd, "\0", 1, wal_segment_size - 1) != 1)
    3415             :         {
    3416             :             /* if write didn't set errno, assume no disk space */
    3417           0 :             save_errno = errno ? errno : ENOSPC;
    3418             :         }
    3419             :     }
    3420        2920 :     pgstat_report_wait_end();
    3421             : 
    3422             :     /*
    3423             :      * A full segment worth of data is written when using wal_init_zero. One
    3424             :      * byte is written when not using it.
                     :      *
                     :      * That byte count is what gets reported to the I/O statistics here; the
                     :      * report is made before save_errno is examined, i.e. also on failure.
    3425             :      */
    3426        2920 :     pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT, IOOP_WRITE,
    3427             :                             io_start, 1,
    3428        2920 :                             wal_init_zero ? wal_segment_size : 1);
    3429             : 
    3430        2920 :     if (save_errno)
    3431             :     {
    3432             :         /*
    3433             :          * If we fail to make the file, delete it to release disk space
                     :          *
                     :          * errno is restored afterwards so that %m in the message below
                     :          * reports the write failure, not whatever unlink/close left behind.
    3434             :          */
    3435           0 :         unlink(tmppath);
    3436             : 
    3437           0 :         close(fd);
    3438             : 
    3439           0 :         errno = save_errno;
    3440             : 
    3441           0 :         ereport(ERROR,
    3442             :                 (errcode_for_file_access(),
    3443             :                  errmsg("could not write to file \"%s\": %m", tmppath)));
    3444             :     }
    3445             : 
    3446             :     /* Measure I/O timing when flushing segment */
    3447        2920 :     io_start = pgstat_prepare_io_time(track_wal_io_timing);
    3448             : 
    3449        2920 :     pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
    3450        2920 :     if (pg_fsync(fd) != 0)
    3451             :     {
    3452           0 :         save_errno = errno;
    3453           0 :         close(fd);
    3454           0 :         errno = save_errno;
    3455           0 :         ereport(ERROR,
    3456             :                 (errcode_for_file_access(),
    3457             :                  errmsg("could not fsync file \"%s\": %m", tmppath)));
    3458             :     }
    3459        2920 :     pgstat_report_wait_end();
    3460             : 
    3461        2920 :     pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT,
    3462             :                             IOOP_FSYNC, io_start, 1, 0);
    3463             : 
    3464        2920 :     if (close(fd) != 0)
    3465           0 :         ereport(ERROR,
    3466             :                 (errcode_for_file_access(),
    3467             :                  errmsg("could not close file \"%s\": %m", tmppath)));
    3468             : 
    3469             :     /*
    3470             :      * Now move the segment into place with its final name.  Cope with
    3471             :      * possibility that someone else has created the file while we were
    3472             :      * filling ours: if so, use ours to pre-create a future log segment.
    3473             :      */
    3474        2920 :     installed_segno = logsegno;
    3475             : 
    3476             :     /*
    3477             :      * XXX: What should we use as max_segno? We used to use XLOGfileslop when
    3478             :      * that was a constant, but that was always a bit dubious: normally, at a
    3479             :      * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
    3480             :      * here, it was the offset from the insert location. We can't do the
    3481             :      * normal XLOGfileslop calculation here because we don't have access to
    3482             :      * the prior checkpoint's redo location. So somewhat arbitrarily, just use
    3483             :      * CheckPointSegments.
    3484             :      */
    3485        2920 :     max_segno = logsegno + CheckPointSegments;
    3486        2920 :     if (InstallXLogFileSegment(&installed_segno, tmppath, true, max_segno,
    3487             :                                logtli))
    3488             :     {
    3489        2920 :         *added = true;
    3490        2920 :         elog(DEBUG2, "done creating and filling new WAL file");
    3491             :     }
    3492             :     else
    3493             :     {
    3494             :         /*
    3495             :          * No need for any more future segments, or InstallXLogFileSegment()
    3496             :          * failed to rename the file into place. If the rename failed, a
    3497             :          * caller opening the file may fail.
    3498             :          */
    3499           0 :         unlink(tmppath);
    3500           0 :         elog(DEBUG2, "abandoned new WAL file");
    3501             :     }
    3502             : 
    3503        2920 :     return -1;
    3504             : }
    3505             : 
    3506             : /*
    3507             :  * Create a new XLOG file segment, or open a pre-existing one.
    3508             :  *
    3509             :  * logsegno: identify segment to be created/opened.
    3510             :  *
                     :  * logtli: timeline the segment belongs to; must be nonzero (asserted).
                     :  *
    3511             :  * Returns FD of opened file.
    3512             :  *
                     :  * The work is delegated to XLogFileInitInternal().  When that returns -1
                     :  * instead of an FD, the segment is opened here by the path it filled in.
                     :  *
    3513             :  * Note: errors here are ERROR not PANIC because we might or might not be
    3514             :  * inside a critical section (eg, during checkpoint there is no reason to
    3515             :  * take down the system on failure).  They will promote to PANIC if we are
    3516             :  * in a critical section.
    3517             :  */
    3518             : int
    3519       28098 : XLogFileInit(XLogSegNo logsegno, TimeLineID logtli)
    3520             : {
    3521             :     bool        ignore_added;
    3522             :     char        path[MAXPGPATH];
    3523             :     int         fd;
    3524             : 
    3525             :     Assert(logtli != 0);
    3526             : 
    3527       28098 :     fd = XLogFileInitInternal(logsegno, logtli, &ignore_added, path);
    3528       28098 :     if (fd >= 0)
    3529       25374 :         return fd;
    3530             : 
    3531             :     /* Now open original target segment (might not be file I just made) */
    3532        2724 :     fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
    3533        2724 :                        get_sync_bit(wal_sync_method));
    3534        2724 :     if (fd < 0)
    3535           0 :         ereport(ERROR,
    3536             :                 (errcode_for_file_access(),
    3537             :                  errmsg("could not open file \"%s\": %m", path)));
    3538        2724 :     return fd;
    3539             : }
    3540             : 
    3541             : /*
    3542             :  * Create a new XLOG file segment by copying a pre-existing one.
    3543             :  *
    3544             :  * destTLI, destsegno: identify segment to be created.
    3545             :  *
    3546             :  * srcTLI, srcsegno: identify segment to be copied (could be from
    3547             :  *      a different timeline)
    3548             :  *
    3549             :  * upto: how much of the source file to copy (the rest is filled with
    3550             :  *      zeros)
    3551             :  *
                     :  * The copy is built under a per-process temporary name, fsync'd, and then
                     :  * installed over any existing destination segment.  Failures are reported
                     :  * with ereport(ERROR) (the fsync failure level is chosen by
                     :  * data_sync_elevel()).
                     :  *
    3552             :  * Currently this is only used during recovery, and so there are no locking
    3553             :  * considerations.  But we should be just as careful as XLogFileInit to avoid
    3554             :  * emplacing a bogus file.
    3555             :  */
    3556             : static void
    3557          80 : XLogFileCopy(TimeLineID destTLI, XLogSegNo destsegno,
    3558             :              TimeLineID srcTLI, XLogSegNo srcsegno,
    3559             :              int upto)
    3560             : {
    3561             :     char        path[MAXPGPATH];
    3562             :     char        tmppath[MAXPGPATH];
    3563             :     PGAlignedXLogBlock buffer;
    3564             :     int         srcfd;
    3565             :     int         fd;
    3566             :     int         nbytes;
    3567             : 
    3568             :     /*
    3569             :      * Open the source file
    3570             :      */
    3571          80 :     XLogFilePath(path, srcTLI, srcsegno, wal_segment_size);
    3572          80 :     srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
    3573          80 :     if (srcfd < 0)
    3574           0 :         ereport(ERROR,
    3575             :                 (errcode_for_file_access(),
    3576             :                  errmsg("could not open file \"%s\": %m", path)));
    3577             : 
    3578             :     /*
    3579             :      * Copy into a temp file name.
    3580             :      */
    3581          80 :     snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
    3582             : 
    3583          80 :     unlink(tmppath);
    3584             : 
    3585             :     /* do not use get_sync_bit() here --- want to fsync only at end of fill */
    3586          80 :     fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
    3587          80 :     if (fd < 0)
    3588           0 :         ereport(ERROR,
    3589             :                 (errcode_for_file_access(),
    3590             :                  errmsg("could not create file \"%s\": %m", tmppath)));
    3591             : 
    3592             :     /*
    3593             :      * Do the data copying.
                     :      *
                     :      * The destination is always written out to the full wal_segment_size,
                     :      * one buffer at a time; only the first "upto" bytes come from the source.
    3594             :      */
    3595      163920 :     for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer))
    3596             :     {
    3597             :         int         nread;
    3598             : 
    3599      163840 :         nread = upto - nbytes;
    3600             : 
    3601             :         /*
    3602             :          * The part that is not read from the source file is filled with
    3603             :          * zeros.
                     :          *
                     :          * The comparison must be done in int: nread goes negative once
                     :          * nbytes has passed upto, and comparing it against the unsigned
                     :          * sizeof() result would convert it to a huge positive value, skip
                     :          * the memset, and leave the tail of the last block read in the
                     :          * buffer to be written out again for every remaining block.
    3604             :          */
    3605      163840 :         if (nread < (int) sizeof(buffer))
    3606          80 :             memset(buffer.data, 0, sizeof(buffer));
    3607             : 
    3608      163840 :         if (nread > 0)
    3609             :         {
    3610             :             int         r;
    3611             : 
    3612        5434 :             if (nread > sizeof(buffer))
    3613        5354 :                 nread = sizeof(buffer);
    3614        5434 :             pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
    3615        5434 :             r = read(srcfd, buffer.data, nread);
    3616        5434 :             if (r != nread)
    3617             :             {
    3618           0 :                 if (r < 0)
    3619           0 :                     ereport(ERROR,
    3620             :                             (errcode_for_file_access(),
    3621             :                              errmsg("could not read file \"%s\": %m",
    3622             :                                     path)));
    3623             :                 else
    3624           0 :                     ereport(ERROR,
    3625             :                             (errcode(ERRCODE_DATA_CORRUPTED),
    3626             :                              errmsg("could not read file \"%s\": read %d of %zu",
    3627             :                                     path, r, (Size) nread)));
    3628             :             }
    3629        5434 :             pgstat_report_wait_end();
    3630             :         }
    3631      163840 :         errno = 0;
    3632      163840 :         pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
    3633      163840 :         if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer))
    3634             :         {
    3635           0 :             int         save_errno = errno;
    3636             : 
    3637             :             /*
    3638             :              * If we fail to make the file, delete it to release disk space
    3639             :              */
    3640           0 :             unlink(tmppath);
    3641             :             /* if write didn't set errno, assume problem is no disk space */
    3642           0 :             errno = save_errno ? save_errno : ENOSPC;
    3643             : 
    3644           0 :             ereport(ERROR,
    3645             :                     (errcode_for_file_access(),
    3646             :                      errmsg("could not write to file \"%s\": %m", tmppath)));
    3647             :         }
    3648      163840 :         pgstat_report_wait_end();
    3649             :     }
    3650             : 
    3651          80 :     pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
    3652          80 :     if (pg_fsync(fd) != 0)
    3653           0 :         ereport(data_sync_elevel(ERROR),
    3654             :                 (errcode_for_file_access(),
    3655             :                  errmsg("could not fsync file \"%s\": %m", tmppath)));
    3656          80 :     pgstat_report_wait_end();
    3657             : 
    3658          80 :     if (CloseTransientFile(fd) != 0)
    3659           0 :         ereport(ERROR,
    3660             :                 (errcode_for_file_access(),
    3661             :                  errmsg("could not close file \"%s\": %m", tmppath)));
    3662             : 
    3663          80 :     if (CloseTransientFile(srcfd) != 0)
    3664           0 :         ereport(ERROR,
    3665             :                 (errcode_for_file_access(),
    3666             :                  errmsg("could not close file \"%s\": %m", path)));
    3667             : 
    3668             :     /*
    3669             :      * Now move the segment into place with its final name.
                     :      *
                     :      * find_free is false, so the file is installed exactly at destsegno and
                     :      * max_segno (passed as 0) is ignored.
    3670             :      */
    3671          80 :     if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, destTLI))
    3672           0 :         elog(ERROR, "InstallXLogFileSegment should not have failed");
    3673          80 : }
    3674             : 
    3675             : /*
    3676             :  * Install a new XLOG segment file as a current or future log segment.
    3677             :  *
    3678             :  * This is used both to install a newly-created segment (which has a temp
    3679             :  * filename while it's being created) and to recycle an old segment.
    3680             :  *
    3681             :  * *segno: identify segment to install as (or first possible target).
    3682             :  * When find_free is true, this is modified on return to indicate the
    3683             :  * actual installation location or last segment searched.
    3684             :  *
    3685             :  * tmppath: initial name of file to install.  It will be renamed into place.
    3686             :  *
    3687             :  * find_free: if true, install the new segment at the first empty segno
    3688             :  * number at or after the passed number.  If false, install the new segment
    3689             :  * exactly where specified, deleting any existing segment file there.
    3690             :  *
    3691             :  * max_segno: maximum segment number to install the new file as.  Fail if no
    3692             :  * free slot is found between *segno and max_segno. (Ignored when find_free
    3693             :  * is false.)
    3694             :  *
    3695             :  * tli: The timeline on which the new segment should be installed.
                     :  * Must be nonzero (asserted).
    3696             :  *
    3697             :  * Returns true if the file was installed successfully.  false indicates that
    3698             :  * max_segno limit was exceeded, the startup process has disabled this
    3699             :  * function for now, or an error occurred while renaming the file into place.
                     :  *
                     :  * ControlFileLock is held exclusively from the "disabled" check through the
                     :  * slot search and the rename, and is released on every return path.  On a
                     :  * false return the file at tmppath is left for the caller to deal with.
    3700             :  */
    3701             : static bool
    3702        5988 : InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
    3703             :                        bool find_free, XLogSegNo max_segno, TimeLineID tli)
    3704             : {
    3705             :     char        path[MAXPGPATH];
    3706             :     struct stat stat_buf;
    3707             : 
    3708             :     Assert(tli != 0);
    3709             : 
    3710        5988 :     XLogFilePath(path, tli, *segno, wal_segment_size);
    3711             : 
    3712        5988 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    3713        5988 :     if (!XLogCtl->InstallXLogFileSegmentActive)
    3714             :     {
    3715           0 :         LWLockRelease(ControlFileLock);
    3716           0 :         return false;
    3717             :     }
    3718             : 
    3719        5988 :     if (!find_free)
    3720             :     {
    3721             :         /* Force installation: get rid of any pre-existing segment file */
    3722          80 :         durable_unlink(path, DEBUG1);
    3723             :     }
    3724             :     else
    3725             :     {
    3726             :         /* Find a free slot to put it in (a slot is free if stat() fails) */
    3727        8436 :         while (stat(path, &stat_buf) == 0)
    3728             :         {
    3729        2784 :             if ((*segno) >= max_segno)
    3730             :             {
    3731             :                 /* Failed to find a free slot within specified range */
    3732         256 :                 LWLockRelease(ControlFileLock);
    3733         256 :                 return false;
    3734             :             }
    3735        2528 :             (*segno)++;
    3736        2528 :             XLogFilePath(path, tli, *segno, wal_segment_size);
    3737             :         }
    3738             :     }
    3739             : 
    3740             :     Assert(access(path, F_OK) != 0 && errno == ENOENT);
    3741        5732 :     if (durable_rename(tmppath, path, LOG) != 0)
    3742             :     {
    3743           0 :         LWLockRelease(ControlFileLock);
    3744             :         /* durable_rename already emitted log message */
    3745           0 :         return false;
    3746             :     }
    3747             : 
    3748        5732 :     LWLockRelease(ControlFileLock);
    3749             : 
    3750        5732 :     return true;
    3751             : }
    3752             : 
    3753             : /*
    3754             :  * Open a pre-existing logfile segment for writing.
                     :  *
                     :  * segno, tli: identify the segment to open.
                     :  *
                     :  * Returns the FD of the opened file.  The file is opened read/write with
                     :  * the sync bit selected by get_sync_bit(wal_sync_method); no O_CREAT is
                     :  * passed, so the segment must already exist.  Failure to open is reported
                     :  * with ereport(PANIC), unlike XLogFileInit() which uses ERROR.
    3755             :  */
    3756             : int
    3757          66 : XLogFileOpen(XLogSegNo segno, TimeLineID tli)
    3758             : {
    3759             :     char        path[MAXPGPATH];
    3760             :     int         fd;
    3761             : 
    3762          66 :     XLogFilePath(path, tli, segno, wal_segment_size);
    3763             : 
    3764          66 :     fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
    3765          66 :                        get_sync_bit(wal_sync_method));
    3766          66 :     if (fd < 0)
    3767           0 :         ereport(PANIC,
    3768             :                 (errcode_for_file_access(),
    3769             :                  errmsg("could not open file \"%s\": %m", path)));
    3770             : 
    3771          66 :     return fd;
    3772             : }
    3773             : 
    3774             : /*
    3775             :  * Close the current logfile segment for writing.
                     :  *
                     :  * Operates on openLogFile, which must currently be open (asserted).  On
                     :  * return openLogFile is -1 and ReleaseExternalFD() has been called.  A
                     :  * close() failure is reported with ereport(PANIC), naming the segment via
                     :  * openLogTLI and openLogSegNo.
    3776             :  */
    3777             : static void
    3778       12606 : XLogFileClose(void)
    3779             : {
    3780             :     Assert(openLogFile >= 0);
    3781             : 
    3782             :     /*
    3783             :      * WAL segment files will not be re-read in normal operation, so we advise
    3784             :      * the OS to release any cached pages.  But do not do so if WAL archiving
    3785             :      * or streaming is active, because archiver and walsender process could
    3786             :      * use the cache to read the WAL segment.
                     :      *
                     :      * The advice is also skipped when IO_DIRECT_WAL is set in
                     :      * io_direct_flags, and its result is deliberately ignored.
    3787             :      */
    3788             : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
    3789       12606 :     if (!XLogIsNeeded() && (io_direct_flags & IO_DIRECT_WAL) == 0)
    3790        3058 :         (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
    3791             : #endif
    3792             : 
    3793       12606 :     if (close(openLogFile) != 0)
    3794             :     {
    3795             :         char        xlogfname[MAXFNAMELEN];
    3796           0 :         int         save_errno = errno;
    3797             : 
    3798           0 :         XLogFileName(xlogfname, openLogTLI, openLogSegNo, wal_segment_size);
    3799           0 :         errno = save_errno;
    3800           0 :         ereport(PANIC,
    3801             :                 (errcode_for_file_access(),
    3802             :                  errmsg("could not close file \"%s\": %m", xlogfname)));
    3803             :     }
    3804             : 
    3805       12606 :     openLogFile = -1;
    3806       12606 :     ReleaseExternalFD();
    3807       12606 : }
    3808             : 
    3809             : /*
    3810             :  * Preallocate log files beyond the specified log endpoint.
    3811             :  *
    3812             :  * XXX this is currently extremely conservative, since it forces only one
    3813             :  * future log segment to exist, and even that only if we are 75% done with
    3814             :  * the current one.  This is only appropriate for very low-WAL-volume systems.
    3815             :  * High-volume systems will be OK once they've built up a sufficient set of
    3816             :  * recycled log segments, but the startup transient is likely to include
    3817             :  * a lot of segment creations by foreground processes, which is not so good.
    3818             :  *
    3819             :  * XLogFileInitInternal() can ereport(ERROR).  All known causes indicate big
    3820             :  * trouble; for example, a full filesystem is one cause.  The checkpoint WAL
    3821             :  * and/or ControlFile updates already completed.  If a RequestCheckpoint()
    3822             :  * initiated the present checkpoint and an ERROR ends this function, the
    3823             :  * command that called RequestCheckpoint() fails.  That's not ideal, but it's
    3824             :  * not worth contorting more functions to use caller-specified elevel values.
    3825             :  * (With or without RequestCheckpoint(), an ERROR forestalls some inessential
    3826             :  * reporting and resource reclamation.)
                     :  *
                     :  * 'endptr' is the log endpoint to preallocate beyond; 'tli' is the
                     :  * timeline handed to XLogFileInitInternal() for the new segment.
                     :  *
                     :  * Nothing is done when XLogCtl->InstallXLogFileSegmentActive is false.
                     :  * CheckpointStats.ckpt_segs_added is bumped when XLogFileInitInternal()
                     :  * reports (through 'added') that it added a segment.
    3827             :  */
    3828             : static void
    3829        3910 : PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli)
    3830             : {
    3831             :     XLogSegNo   _logSegNo;
    3832             :     int         lf;
    3833             :     bool        added;
    3834             :     char        path[MAXPGPATH];
    3835             :     uint64      offset;
    3836             : 
    3837        3910 :     if (!XLogCtl->InstallXLogFileSegmentActive)
    3838          14 :         return;                 /* unlocked check says no */
    3839             :     /* offset = position, within its segment, of the byte just before endptr */
    3840        3896 :     XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
    3841        3896 :     offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
    3842        3896 :     if (offset >= (uint32) (0.75 * wal_segment_size))
    3843             :     {
    3844         440 :         _logSegNo++;
    3845         440 :         lf = XLogFileInitInternal(_logSegNo, tli, &added, path);
    3846         440 :         if (lf >= 0)
    3847         244 :             close(lf);          /* the descriptor itself is not used here */
    3848         440 :         if (added)
    3849         196 :             CheckpointStats.ckpt_segs_added++;
    3850             :     }
    3851             : }
    3852             : 
    3853             : /*
    3854             :  * Throws an error if the given log segment has already been removed or
    3855             :  * recycled. The caller should only pass a segment that it knows to have
    3856             :  * existed while the server has been running, as this function always
    3857             :  * succeeds if no WAL segments have been removed since startup.
    3858             :  * 'tli' is only used in the error message.
    3859             :  *
    3860             :  * Note: this function guarantees to keep errno unchanged on return.
    3861             :  * This supports callers that use this to possibly deliver a better
    3862             :  * error message about a missing file, while still being able to throw
    3863             :  * a normal file-access error afterwards, if this does return.
                     :  *
                     :  * The test is "segno <= XLogCtl->lastRemovedSegNo", with the shared
                     :  * value read while holding info_lck.  The caller's errno is captured on
                     :  * entry and put back both before the ERROR is raised and before a
                     :  * normal return.
    3864             :  */
    3865             : void
    3866      235016 : CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
    3867             : {
    3868      235016 :     int         save_errno = errno;
    3869             :     XLogSegNo   lastRemovedSegNo;
    3870             : 
    3871      235016 :     SpinLockAcquire(&XLogCtl->info_lck);
    3872      235016 :     lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
    3873      235016 :     SpinLockRelease(&XLogCtl->info_lck);
    3874             : 
    3875      235016 :     if (segno <= lastRemovedSegNo)
    3876             :     {
    3877             :         char        filename[MAXFNAMELEN];
    3878             : 
    3879           0 :         XLogFileName(filename, tli, segno, wal_segment_size);
    3880           0 :         errno = save_errno;
    3881           0 :         ereport(ERROR,
    3882             :                 (errcode_for_file_access(),
    3883             :                  errmsg("requested WAL segment %s has already been removed",
    3884             :                         filename)));
    3885             :     }
    3886      235016 :     errno = save_errno;
    3887      235016 : }
    3888             : 
    3889             : /*
    3890             :  * Return the last WAL segment removed, or 0 if no segment has been removed
    3891             :  * since startup.
    3892             :  *
    3893             :  * NB: the result can be out of date arbitrarily fast, the caller has to deal
    3894             :  * with that.
                     :  *
                     :  * The value is a copy of XLogCtl->lastRemovedSegNo taken while holding
                     :  * info_lck; the lock is released before returning.
    3895             :  */
    3896             : XLogSegNo
    3897        2110 : XLogGetLastRemovedSegno(void)
    3898             : {
    3899             :     XLogSegNo   lastRemovedSegNo;
    3900             : 
    3901        2110 :     SpinLockAcquire(&XLogCtl->info_lck);
    3902        2110 :     lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
    3903        2110 :     SpinLockRelease(&XLogCtl->info_lck);
    3904             : 
    3905        2110 :     return lastRemovedSegNo;
    3906             : }
    3907             : 
    3908             : /*
    3909             :  * Return the oldest WAL segment on the given TLI that still exists in
    3910             :  * XLOGDIR, or 0 if none.
                     :  *
                     :  * This walks every entry of XLOGDIR.  Entries whose names fail
                     :  * IsXLogFileName(), or whose parsed timeline differs from 'tli', are
                     :  * skipped; the smallest segment number among the rest is returned.
                     :  * Since 0 doubles as the "nothing seen yet" marker, it is replaced by
                     :  * the first matching segment found.
    3911             :  */
    3912             : XLogSegNo
    3913          10 : XLogGetOldestSegno(TimeLineID tli)
    3914             : {
    3915             :     DIR        *xldir;
    3916             :     struct dirent *xlde;
    3917          10 :     XLogSegNo   oldest_segno = 0;
    3918             : 
    3919          10 :     xldir = AllocateDir(XLOGDIR);
    3920          66 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    3921             :     {
    3922             :         TimeLineID  file_tli;
    3923             :         XLogSegNo   file_segno;
    3924             : 
    3925             :         /* Ignore files that are not XLOG segments. */
    3926          56 :         if (!IsXLogFileName(xlde->d_name))
    3927          40 :             continue;
    3928             : 
    3929             :         /* Parse filename to get TLI and segno. */
    3930          16 :         XLogFromFileName(xlde->d_name, &file_tli, &file_segno,
    3931             :                          wal_segment_size);
    3932             : 
    3933             :         /* Ignore anything that's not from the TLI of interest. */
    3934          16 :         if (tli != file_tli)
    3935           0 :             continue;
    3936             : 
    3937             :         /* If it's the oldest so far, update oldest_segno. */
    3938          16 :         if (oldest_segno == 0 || file_segno < oldest_segno)
    3939          14 :             oldest_segno = file_segno;
    3940             :     }
    3941             : 
    3942          10 :     FreeDir(xldir);
    3943          10 :     return oldest_segno;
    3944             : }
    3945             : 
    3946             : /*
    3947             :  * Update the last removed segno pointer in shared memory, to reflect that the
    3948             :  * given XLOG file has been removed.
                     :  *
                     :  * 'filename' is parsed with XLogFromFileName(); the timeline it yields
                     :  * is not used.  XLogCtl->lastRemovedSegNo is changed, under info_lck,
                     :  * only when the parsed segment number is greater than the stored one,
                     :  * so this function never moves the stored value backwards.
    3949             :  */
    3950             : static void
    3951        5226 : UpdateLastRemovedPtr(char *filename)
    3952             : {
    3953             :     uint32      tli;
    3954             :     XLogSegNo   segno;
    3955             : 
    3956        5226 :     XLogFromFileName(filename, &tli, &segno, wal_segment_size);
    3957             : 
    3958        5226 :     SpinLockAcquire(&XLogCtl->info_lck);
    3959        5226 :     if (segno > XLogCtl->lastRemovedSegNo)
    3960        2166 :         XLogCtl->lastRemovedSegNo = segno;
    3961        5226 :     SpinLockRelease(&XLogCtl->info_lck);
    3962        5226 : }
    3963             : 
    3964             : /*
    3965             :  * Remove all temporary log files in pg_wal
    3966             :  *
    3967             :  * This is called at the beginning of recovery after a previous crash,
    3968             :  * at a point where no other processes write fresh WAL data.
                     :  *
                     :  * A file counts as temporary when its name starts with "xlogtemp.".
                     :  * Removal is best-effort: the result of unlink() is not checked.
    3969             :  */
    3970             : static void
    3971         348 : RemoveTempXlogFiles(void)
    3972             : {
    3973             :     DIR        *xldir;
    3974             :     struct dirent *xlde;
    3975             : 
    3976         348 :     elog(DEBUG2, "removing all temporary WAL segments");
    3977             : 
    3978         348 :     xldir = AllocateDir(XLOGDIR);
    3979        2294 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    3980             :     {
    3981             :         char        path[MAXPGPATH];
    3982             : 
    3983        1946 :         if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0)
    3984        1946 :             continue;
    3985             :         /* unlink() is unchecked, so the message below is logged either way */
    3986           0 :         snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
    3987           0 :         unlink(path);
    3988           0 :         elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
    3989             :     }
    3990         348 :     FreeDir(xldir);
    3991         348 : }
    3992             : 
    3993             : /*
    3994             :  * Recycle or remove all log files older or equal to passed segno.
    3995             :  *
    3996             :  * endptr is current (or recent) end of xlog, and lastredoptr is the
    3997             :  * redo pointer of the last checkpoint. These are used to determine
    3998             :  * whether we want to recycle rather than delete no-longer-wanted log files.
    3999             :  *
    4000             :  * insertTLI is the current timeline for XLOG insertion. Any recycled
    4001             :  * segments should be reused for this timeline.
                     :  *
                     :  * Both complete and ".partial"-style segment names (IsXLogFileName() or
                     :  * IsPartialXLogFileName()) are considered.  A candidate is handed to
                     :  * RemoveXlogFile() only if XLogArchiveCheckDone() returns true for it;
                     :  * XLogCtl->lastRemovedSegNo is advanced first via UpdateLastRemovedPtr().
    4002             :  */
    4003             : static void
    4004        3368 : RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr,
    4005             :                    TimeLineID insertTLI)
    4006             : {
    4007             :     DIR        *xldir;
    4008             :     struct dirent *xlde;
    4009             :     char        lastoff[MAXFNAMELEN];
    4010             :     XLogSegNo   endlogSegNo;
    4011             :     XLogSegNo   recycleSegNo;
    4012             : 
    4013             :     /* Initialize info about where to try to recycle to */
    4014        3368 :     XLByteToSeg(endptr, endlogSegNo, wal_segment_size);
    4015        3368 :     recycleSegNo = XLOGfileslop(lastredoptr);
    4016             : 
    4017             :     /*
    4018             :      * Construct a filename of the last segment to be removed (names that
                     :      * compare less than or equal to it are removal candidates). The timeline ID
    4019             :      * doesn't matter, we ignore that in the comparison. (During recovery,
    4020             :      * InsertTimeLineID isn't set, so we can't use that.)
    4021             :      */
    4022        3368 :     XLogFileName(lastoff, 0, segno, wal_segment_size);
    4023             : 
    4024        3368 :     elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
    4025             :          lastoff);
    4026             : 
    4027        3368 :     xldir = AllocateDir(XLOGDIR);
    4028             : 
    4029       66164 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    4030             :     {
    4031             :         /* Ignore files that are not XLOG segments */
    4032       62796 :         if (!IsXLogFileName(xlde->d_name) &&
    4033       14436 :             !IsPartialXLogFileName(xlde->d_name))
    4034       14428 :             continue;
    4035             : 
    4036             :         /*
    4037             :          * We ignore the timeline part of the XLOG segment identifiers in
    4038             :          * deciding whether a segment is still needed.  This ensures that we
    4039             :          * won't prematurely remove a segment from a parent timeline. We could
    4040             :          * probably be a little more proactive about removing segments of
    4041             :          * non-parent timelines, but that would be a whole lot more
    4042             :          * complicated.
    4043             :          *
    4044             :          * We use the alphanumeric sorting property of the filenames to decide
    4045             :          * which ones are earlier than the lastoff segment.
                     :          *
                     :          * The "+ 8" on both names skips the leading 8 characters, i.e. the
                     :          * timeline part, so only the remainder is compared.
    4046             :          */
    4047       48368 :         if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
    4048             :         {
    4049       30968 :             if (XLogArchiveCheckDone(xlde->d_name))
    4050             :             {
    4051             :                 /* Update the last removed location in shared memory first */
    4052        5226 :                 UpdateLastRemovedPtr(xlde->d_name);
    4053             : 
    4054        5226 :                 RemoveXlogFile(xlde, recycleSegNo, &endlogSegNo, insertTLI);
    4055             :             }
    4056             :         }
    4057             :     }
    4058             : 
    4059        3368 :     FreeDir(xldir);
    4060        3368 : }
    4061             : 
    4062             : /*
    4063             :  * Recycle or remove WAL files that are not part of the given timeline's
    4064             :  * history.
    4065             :  *
    4066             :  * This is called during recovery, whenever we switch to follow a new
    4067             :  * timeline, and at the end of recovery when we create a new timeline. We
    4068             :  * wouldn't otherwise care about extra WAL files lying in pg_wal, but they
    4069             :  * might be leftover pre-allocated or recycled WAL segments on the old timeline
    4070             :  * that we haven't used yet, and contain garbage. If we just leave them in
    4071             :  * pg_wal, they will eventually be archived, and we can't let that happen.
    4072             :  * Files that belong to our timeline history are valid, because we have
    4073             :  * successfully replayed them, but from others we can't be sure.
    4074             :  *
    4075             :  * 'switchpoint' is the current point in WAL where we switch to new timeline,
    4076             :  * and 'newTLI' is the new timeline we switch to.
                     :  *
                     :  * Only names accepted by IsXLogFileName() are examined, and a matching
                     :  * file is left alone if XLogArchiveIsReady() returns true for it.
    4077             :  */
    4078             : void
    4079         120 : RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
    4080             : {
    4081             :     DIR        *xldir;
    4082             :     struct dirent *xlde;
    4083             :     char        switchseg[MAXFNAMELEN];
    4084             :     XLogSegNo   endLogSegNo;
    4085             :     XLogSegNo   switchLogSegNo;
    4086             :     XLogSegNo   recycleSegNo;
    4087             : 
    4088             :     /*
    4089             :      * Initialize info about where to begin the work.  This will recycle,
    4090             :      * somewhat arbitrarily, 10 future segments.
    4091             :      */
    4092         120 :     XLByteToPrevSeg(switchpoint, switchLogSegNo, wal_segment_size);
    4093         120 :     XLByteToSeg(switchpoint, endLogSegNo, wal_segment_size);
    4094         120 :     recycleSegNo = endLogSegNo + 10;
    4095             : 
    4096             :     /*
    4097             :      * Construct a filename of the last segment to be kept.
    4098             :      */
    4099         120 :     XLogFileName(switchseg, newTLI, switchLogSegNo, wal_segment_size);
    4100             : 
    4101         120 :     elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
    4102             :          switchseg);
    4103             : 
    4104         120 :     xldir = AllocateDir(XLOGDIR);
    4105             : 
    4106        1144 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    4107             :     {
    4108             :         /* Ignore files that are not XLOG segments */
    4109        1024 :         if (!IsXLogFileName(xlde->d_name))
    4110         636 :             continue;
    4111             : 
    4112             :         /*
    4113             :          * Remove files that are on a timeline older than the new one we're
    4114             :          * switching to, but with a segment number >= the first segment on the
    4115             :          * new timeline.
                     :          *
                     :          * In terms of the code: the first 8 characters of the name (the
                     :          * timeline part) must sort before those of switchseg, and the rest
                     :          * of the name must sort strictly after the rest of switchseg, which
                     :          * names the last segment to be kept.
    4116             :          */
    4117         388 :         if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
    4118         250 :             strcmp(xlde->d_name + 8, switchseg + 8) > 0)
    4119             :         {
    4120             :             /*
    4121             :              * If the file has already been marked as .ready, however, don't
    4122             :              * remove it yet. It should be OK to remove it - files that are
    4123             :              * not part of our timeline history are not required for recovery
    4124             :              * - but seems safer to let them be archived and removed later.
    4125             :              */
    4126          32 :             if (!XLogArchiveIsReady(xlde->d_name))
    4127          32 :                 RemoveXlogFile(xlde, recycleSegNo, &endLogSegNo, newTLI);
    4128             :         }
    4129             :     }
    4130             : 
    4131         120 :     FreeDir(xldir);
    4132         120 : }
    4133             : 
    4134             : /*
    4135             :  * Recycle or remove a log file that's no longer needed.
    4136             :  *
    4137             :  * segment_de is the dirent structure of the segment to recycle or remove.
    4138             :  * recycleSegNo is the segment number to recycle up to.  endlogSegNo is
    4139             :  * the segment number of the current (or recent) end of WAL.
    4140             :  *
    4141             :  * endlogSegNo gets incremented if the segment is recycled so as it is not
    4142             :  * checked again with future callers of this function.
    4143             :  *
    4144             :  * insertTLI is the current timeline for XLOG insertion. Any recycled segments
    4145             :  * should be used for this timeline.
                     :  *
                     :  * CheckpointStats.ckpt_segs_recycled or ckpt_segs_removed is incremented
                     :  * according to which path succeeded, and XLogArchiveCleanup() is then
                     :  * called for the segment name.  If the removal path fails (the rename on
                     :  * Windows, or durable_unlink()), the function returns early: no counter
                     :  * is bumped and XLogArchiveCleanup() is not called.
    4146             :  */
    4147             : static void
    4148        5258 : RemoveXlogFile(const struct dirent *segment_de,
    4149             :                XLogSegNo recycleSegNo, XLogSegNo *endlogSegNo,
    4150             :                TimeLineID insertTLI)
    4151             : {
    4152             :     char        path[MAXPGPATH];
    4153             : #ifdef WIN32
    4154             :     char        newpath[MAXPGPATH];
    4155             : #endif
    4156        5258 :     const char *segname = segment_de->d_name;
    4157             : 
    4158        5258 :     snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);
    4159             : 
    4160             :     /*
    4161             :      * Before deleting the file, see if it can be recycled as a future log
    4162             :      * segment. Only recycle normal files, because we don't want to recycle
    4163             :      * symbolic links pointing to a separate archive directory.
                     :      *
                     :      * The conditions are evaluated in order and short-circuit, so
                     :      * InstallXLogFileSegment() is attempted only when recycling is enabled
                     :      * (wal_recycle), *endlogSegNo has not passed recycleSegNo, the
                     :      * InstallXLogFileSegmentActive flag is set, and the entry is a regular
                     :      * file.
    4164             :      */
    4165        5258 :     if (wal_recycle &&
    4166        5258 :         *endlogSegNo <= recycleSegNo &&
    4167        6626 :         XLogCtl->InstallXLogFileSegmentActive && /* callee rechecks this */
    4168        5976 :         get_dirent_type(path, segment_de, false, DEBUG2) == PGFILETYPE_REG &&
    4169        2988 :         InstallXLogFileSegment(endlogSegNo, path,
    4170             :                                true, recycleSegNo, insertTLI))
    4171             :     {
    4172        2732 :         ereport(DEBUG2,
    4173             :                 (errmsg_internal("recycled write-ahead log file \"%s\"",
    4174             :                                  segname)));
    4175        2732 :         CheckpointStats.ckpt_segs_recycled++;
    4176             :         /* Needn't recheck that slot on future iterations */
    4177        2732 :         (*endlogSegNo)++;
    4178             :     }
    4179             :     else
    4180             :     {
    4181             :         /* No need for any more future segments, or recycling failed ... */
    4182             :         int         rc;
    4183             : 
    4184        2526 :         ereport(DEBUG2,
    4185             :                 (errmsg_internal("removing write-ahead log file \"%s\"",
    4186             :                                  segname)));
    4187             : 
    4188             : #ifdef WIN32
    4189             : 
    4190             :         /*
    4191             :          * On Windows, if another process (e.g another backend) holds the file
    4192             :          * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
    4193             :          * will still show up in directory listing until the last handle is
    4194             :          * closed. To avoid confusing the lingering deleted file for a live
    4195             :          * WAL file that needs to be archived, rename it before deleting it.
    4196             :          *
    4197             :          * If another process holds the file open without FILE_SHARE_DELETE
    4198             :          * flag, rename will fail. We'll try again at the next checkpoint.
    4199             :          */
    4200             :         snprintf(newpath, MAXPGPATH, "%s.deleted", path);
    4201             :         if (rename(path, newpath) != 0)
    4202             :         {
    4203             :             ereport(LOG,
    4204             :                     (errcode_for_file_access(),
    4205             :                      errmsg("could not rename file \"%s\": %m",
    4206             :                             path)));
    4207             :             return;
    4208             :         }
    4209             :         rc = durable_unlink(newpath, LOG);
    4210             : #else
    4211        2526 :         rc = durable_unlink(path, LOG);
    4212             : #endif
    4213        2526 :         if (rc != 0)
    4214             :         {
    4215             :             /* Message already logged by durable_unlink() */
    4216           0 :             return;
    4217             :         }
    4218        2526 :         CheckpointStats.ckpt_segs_removed++;
    4219             :     }
    4220             : 
    4221        5258 :     XLogArchiveCleanup(segname);
    4222             : }
    4223             : 
    4224             : /*
    4225             :  * Verify whether pg_wal, pg_wal/archive_status, and pg_wal/summaries exist.
    4226             :  * If the latter do not exist, recreate them.
    4227             :  *
    4228             :  * It is not the goal of this function to verify the contents of these
    4229             :  * directories, but to help in cases where someone has performed a cluster
    4230             :  * copy for PITR purposes but omitted pg_wal from the copy.
    4231             :  *
    4232             :  * We could also recreate pg_wal if it doesn't exist, but a deliberate
    4233             :  * policy decision was made not to.  It is fairly common for pg_wal to be
    4234             :  * a symlink, and if that was the DBA's intent then automatically making a
    4235             :  * plain directory would result in degraded performance with no notice.
                     :  *
                     :  * Every failure here is reported at FATAL level: pg_wal missing or not a
                     :  * directory, a subdirectory path that exists but is not a directory, or
                     :  * a failed MakePGDirectory().  Creating a missing subdirectory is
                     :  * announced with a LOG message first.  The archive_status and summaries
                     :  * checks are intentionally identical, including the use of
                     :  * errcode_for_file_access() in their error reports.
    4236             :  */
    4237             : static void
    4238        1846 : ValidateXLOGDirectoryStructure(void)
    4239             : {
    4240             :     char        path[MAXPGPATH];
    4241             :     struct stat stat_buf;
    4242             : 
    4243             :     /* Check for pg_wal; if it doesn't exist, error out */
    4244        1846 :     if (stat(XLOGDIR, &stat_buf) != 0 ||
    4245        1846 :         !S_ISDIR(stat_buf.st_mode))
    4246           0 :         ereport(FATAL,
    4247             :                 (errcode_for_file_access(),
    4248             :                  errmsg("required WAL directory \"%s\" does not exist",
    4249             :                         XLOGDIR)));
    4250             : 
    4251             :     /* Check for archive_status */
    4252        1846 :     snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
    4253        1846 :     if (stat(path, &stat_buf) == 0)
    4254             :     {
    4255             :         /* Check for weird cases where it exists but isn't a directory */
    4256        1844 :         if (!S_ISDIR(stat_buf.st_mode))
    4257           0 :             ereport(FATAL,
    4258             :                     (errcode_for_file_access(),
    4259             :                      errmsg("required WAL directory \"%s\" does not exist",
    4260             :                             path)));
    4261             :     }
    4262             :     else
    4263             :     {
    4264           2 :         ereport(LOG,
    4265             :                 (errmsg("creating missing WAL directory \"%s\"", path)));
    4266           2 :         if (MakePGDirectory(path) < 0)
    4267           0 :             ereport(FATAL,
    4268             :                     (errcode_for_file_access(),
    4269             :                      errmsg("could not create missing directory \"%s\": %m",
    4270             :                             path)));
    4271             :     }
    4272             : 
    4273             :     /* Check for summaries */
    4274        1846 :     snprintf(path, MAXPGPATH, XLOGDIR "/summaries");
    4275        1846 :     if (stat(path, &stat_buf) == 0)
    4276             :     {
    4277             :         /* Check for weird cases where it exists but isn't a directory */
    4278        1844 :         if (!S_ISDIR(stat_buf.st_mode))
    4279           0 :             ereport(FATAL,
    4280             :                     (errcode_for_file_access(),
    4281             :                      errmsg("required WAL directory \"%s\" does not exist", path)));
    4282             :     }
    4283             :     else
    4284             :     {
    4285           2 :         ereport(LOG,
    4286             :                 (errmsg("creating missing WAL directory \"%s\"", path)));
    4287           2 :         if (MakePGDirectory(path) < 0)
    4288           0 :             ereport(FATAL,
    4289             :                     (errcode_for_file_access(),
    4290             :                      errmsg("could not create missing directory \"%s\": %m", path)));
    4291             :     }
    4292        1846 : }
    4293             : 
    4294             : /*
    4295             :  * Remove previous backup history files.  This also retries creation of
    4296             :  * .ready files for any backup history files for which XLogArchiveNotify
    4297             :  * failed earlier.
                     :  *
                     :  * Every XLOGDIR entry whose name satisfies IsBackupHistoryFileName() is
                     :  * checked with XLogArchiveCheckDone(); when that returns true the file
                     :  * is unlinked (the unlink() result is not checked) and
                     :  * XLogArchiveCleanup() is called for it.
    4298             :  */
    4299             : static void
    4300         296 : CleanupBackupHistory(void)
    4301             : {
    4302             :     DIR        *xldir;
    4303             :     struct dirent *xlde;
    4304             :     char        path[MAXPGPATH + sizeof(XLOGDIR)];
    4305             : 
    4306         296 :     xldir = AllocateDir(XLOGDIR);
    4307             : 
    4308        2978 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    4309             :     {
    4310        2386 :         if (IsBackupHistoryFileName(xlde->d_name))
    4311             :         {
    4312         312 :             if (XLogArchiveCheckDone(xlde->d_name))
    4313             :             {
    4314         246 :                 elog(DEBUG2, "removing WAL backup history file \"%s\"",
    4315             :                      xlde->d_name);
    4316         246 :                 snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
    4317         246 :                 unlink(path);
    4318         246 :                 XLogArchiveCleanup(xlde->d_name);
    4319             :             }
    4320             :         }
    4321             :     }
    4322             : 
    4323         296 :     FreeDir(xldir);
    4324         296 : }
    4325             : 
    4326             : /*
    4327             :  * I/O routines for pg_control
    4328             :  *
    4329             :  * *ControlFile is a buffer in shared memory that holds an image of the
    4330             :  * contents of pg_control.  WriteControlFile() initializes pg_control
    4331             :  * given a preloaded buffer, ReadControlFile() loads the buffer from
    4332             :  * the pg_control file (during postmaster or standalone-backend startup),
    4333             :  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
    4334             :  * InitControlFile() fills the buffer with initial values.
    4335             :  *
    4336             :  * For simplicity, WriteControlFile() initializes the fields of pg_control
    4337             :  * that are related to checking backend/database compatibility, and
    4338             :  * ReadControlFile() verifies they are correct.  We could split out the
    4339             :  * I/O and compatibility-check functions, but there seems no need currently.
    4340             :  */
    4341             : /*
                     :  * InitControlFile
                     :  *      Fill the *ControlFile image with initial values.
                     :  *
                     :  * sysidentifier is stored as the system identifier and
                     :  * data_checksum_version as the data checksum version.  The structure is
                     :  * zeroed first, so every field that is not assigned below starts out as
                     :  * zero.  The current values of the parameters listed at the bottom are
                     :  * copied into the image as well.
                     :  *
                     :  * This function only modifies the in-memory image; it performs no file
                     :  * I/O.  Failure to obtain random bytes for the mock authentication nonce
                     :  * is reported at PANIC level.
                     :  */
    4342             : static void
    4343         102 : InitControlFile(uint64 sysidentifier, uint32 data_checksum_version)
    4344             : {
    4345             :     char        mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
    4346             : 
    4347             :     /*
    4348             :      * Generate a random nonce. This is used for authentication requests that
    4349             :      * will fail because the user does not exist. The nonce is used to create
    4350             :      * a genuine-looking password challenge for the non-existent user, in lieu
    4351             :      * of an actual stored password.
                     :      *
                     :      * The nonce is built in a local buffer here and copied into the
                     :      * control file image only after that image has been zeroed below.
    4352             :      */
    4353         102 :     if (!pg_strong_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
    4354           0 :         ereport(PANIC,
    4355             :                 (errcode(ERRCODE_INTERNAL_ERROR),
    4356             :                  errmsg("could not generate secret authorization token")));
    4357             :     /* Start from an all-zero image; fields not set below remain zero. */
    4358         102 :     memset(ControlFile, 0, sizeof(ControlFileData));
    4359             :     /* Initialize pg_control status fields */
    4360         102 :     ControlFile->system_identifier = sysidentifier;
    4361         102 :     memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
    4362         102 :     ControlFile->state = DB_SHUTDOWNED;
    4363         102 :     ControlFile->unloggedLSN = FirstNormalUnloggedLSN;
    4364             : 
    4365             :     /* Set important parameter values for use when replaying WAL */
    4366         102 :     ControlFile->MaxConnections = MaxConnections;
    4367         102 :     ControlFile->max_worker_processes = max_worker_processes;
    4368         102 :     ControlFile->max_wal_senders = max_wal_senders;
    4369         102 :     ControlFile->max_prepared_xacts = max_prepared_xacts;
    4370         102 :     ControlFile->max_locks_per_xact = max_locks_per_xact;
    4371         102 :     ControlFile->wal_level = wal_level;
    4372         102 :     ControlFile->wal_log_hints = wal_log_hints;
    4373         102 :     ControlFile->track_commit_timestamp = track_commit_timestamp;
    4374         102 :     ControlFile->data_checksum_version = data_checksum_version;
    4375         102 : }
    4376             : /*
                     :  * WriteControlFile
                     :  *      Create pg_control on disk from the current *ControlFile image.
                     :  *
                     :  * Fills in the version and compatibility-check fields of the image,
                     :  * computes the CRC over all bytes that precede the crc field, and then
                     :  * writes a zero-padded buffer of PG_CONTROL_FILE_SIZE bytes to
                     :  * XLOG_CONTROL_FILE, fsyncs it and closes it.
                     :  *
                     :  * The file is opened with O_CREAT | O_EXCL, i.e. it is expected not to
                     :  * exist yet.  A failure to create, write, fsync or close the file is
                     :  * reported at PANIC level.
                     :  *
                     :  * Note that xlog_seg_size is taken from the wal_segment_size variable,
                     :  * whereas the other size fields set here come from compile-time
                     :  * constants.
                     :  */
    4377             : static void
    4378         102 : WriteControlFile(void)
    4379             : {
    4380             :     int         fd;
    4381             :     char        buffer[PG_CONTROL_FILE_SIZE];   /* need not be aligned */
    4382             : 
    4383             :     /*
    4384             :      * Initialize version and compatibility-check fields
    4385             :      */
    4386         102 :     ControlFile->pg_control_version = PG_CONTROL_VERSION;
    4387         102 :     ControlFile->catalog_version_no = CATALOG_VERSION_NO;
    4388             : 
    4389         102 :     ControlFile->maxAlign = MAXIMUM_ALIGNOF;
    4390         102 :     ControlFile->floatFormat = FLOATFORMAT_VALUE;
    4391             : 
    4392         102 :     ControlFile->blcksz = BLCKSZ;
    4393         102 :     ControlFile->relseg_size = RELSEG_SIZE;
    4394         102 :     ControlFile->xlog_blcksz = XLOG_BLCKSZ;
    4395         102 :     ControlFile->xlog_seg_size = wal_segment_size;
    4396             : 
    4397         102 :     ControlFile->nameDataLen = NAMEDATALEN;
    4398         102 :     ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
    4399             : 
    4400         102 :     ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
    4401         102 :     ControlFile->loblksize = LOBLKSIZE;
    4402             : 
    4403         102 :     ControlFile->float8ByVal = FLOAT8PASSBYVAL;
    4404             : 
    4405             :     /*
    4406             :      * Initialize the default 'char' signedness.
    4407             :      *
    4408             :      * The signedness of the char type is implementation-defined. For instance
    4409             :      * on x86 architecture CPUs, the char data type is typically treated as
    4410             :      * signed by default, whereas on aarch architecture CPUs, it is typically
    4411             :      * treated as unsigned by default. In v17 or earlier, we accidentally let
    4412             :      * C implementation signedness affect persistent data. This led to
    4413             :      * inconsistent results when comparing char data across different
    4414             :      * platforms.
    4415             :      *
    4416             :      * This flag can be used as a hint to ensure consistent behavior for
    4417             :      * pre-v18 data files that store data sorted by the 'char' type on disk,
    4418             :      * especially in cross-platform replication scenarios.
    4419             :      *
    4420             :      * Newly created database clusters unconditionally set the default char
    4421             :      * signedness to true. pg_upgrade changes this flag for clusters that were
    4422             :      * initialized on signedness=false platforms. As a result,
    4423             :      * signedness=false setting will become rare over time. If we had known
    4424             :      * about this problem during the last development cycle that forced initdb
    4425             :      * (v8.3), we would have made all clusters signed or all clusters
    4426             :      * unsigned. Making pg_upgrade the only source of signedness=false will
    4427             :      * cause the population of database clusters to converge toward that
    4428             :      * retrospective ideal.
    4429             :      */
    4430         102 :     ControlFile->default_char_signedness = true;
    4431             : 
    4432             :     /* Contents are protected with a CRC */
    4433         102 :     INIT_CRC32C(ControlFile->crc);
    4434         102 :     COMP_CRC32C(ControlFile->crc,
    4435             :                 ControlFile,
    4436             :                 offsetof(ControlFileData, crc));
    4437         102 :     FIN_CRC32C(ControlFile->crc);
    4438             : 
    4439             :     /*
    4440             :      * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
    4441             :      * the excess over sizeof(ControlFileData).  This reduces the odds of
    4442             :      * premature-EOF errors when reading pg_control.  We'll still fail when we
    4443             :      * check the contents of the file, but hopefully with a more specific
    4444             :      * error than "couldn't read pg_control".
    4445             :      */
    4446         102 :     memset(buffer, 0, PG_CONTROL_FILE_SIZE);
    4447         102 :     memcpy(buffer, ControlFile, sizeof(ControlFileData));
    4448             : 
    4449         102 :     fd = BasicOpenFile(XLOG_CONTROL_FILE,
    4450             :                        O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
    4451         102 :     if (fd < 0)
    4452           0 :         ereport(PANIC,
    4453             :                 (errcode_for_file_access(),
    4454             :                  errmsg("could not create file \"%s\": %m",
    4455             :                         XLOG_CONTROL_FILE)));
    4456             :     /* Clear errno first so that a short write which leaves it unset can be detected below. */
    4457         102 :     errno = 0;
    4458         102 :     pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
    4459         102 :     if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
    4460             :     {
    4461             :         /* if write didn't set errno, assume problem is no disk space */
    4462           0 :         if (errno == 0)
    4463           0 :             errno = ENOSPC;
    4464           0 :         ereport(PANIC,
    4465             :                 (errcode_for_file_access(),
    4466             :                  errmsg("could not write to file \"%s\": %m",
    4467             :                         XLOG_CONTROL_FILE)));
    4468             :     }
    4469         102 :     pgstat_report_wait_end();
    4470             : 
    4471         102 :     pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
    4472         102 :     if (pg_fsync(fd) != 0)
    4473           0 :         ereport(PANIC,
    4474             :                 (errcode_for_file_access(),
    4475             :                  errmsg("could not fsync file \"%s\": %m",
    4476             :                         XLOG_CONTROL_FILE)));
    4477         102 :     pgstat_report_wait_end();
    4478             : 
    4479         102 :     if (close(fd) != 0)
    4480           0 :         ereport(PANIC,
    4481             :                 (errcode_for_file_access(),
    4482             :                  errmsg("could not close file \"%s\": %m",
    4483             :                         XLOG_CONTROL_FILE)));
    4484         102 : }
    4485             : /*
                     :  * ReadControlFile
                     :  *      Load pg_control into *ControlFile and verify that it matches
                     :  *      this server executable.
                     :  *
                     :  * Reads sizeof(ControlFileData) bytes from XLOG_CONTROL_FILE and then
                     :  * checks, in this order: the pg_control format version, the CRC, and each
                     :  * compatibility field.  A failed open or read is reported at PANIC level;
                     :  * a version, CRC or compatibility mismatch at FATAL level; an invalid WAL
                     :  * segment size, or a min_wal_size/max_wal_size smaller than two segments,
                     :  * at ERROR level.
                     :  *
                     :  * After the checks pass, this function also sets wal_segment_size (both
                     :  * the variable and the GUC), computes UsableBytesInSegment, calls
                     :  * CalculateCheckpointSegments(), and sets the data_checksums GUC.
                     :  *
                     :  * NOTE(review): the result of close() is ignored here, and the file is
                     :  * opened O_RDWR although this function only reads from it -- confirm
                     :  * that both are intentional.
                     :  */
    4486             : static void
    4487        1946 : ReadControlFile(void)
    4488             : {
    4489             :     pg_crc32c   crc;
    4490             :     int         fd;
    4491             :     char        wal_segsz_str[20];
    4492             :     int         r;
    4493             : 
    4494             :     /*
    4495             :      * Read data...
    4496             :      */
    4497        1946 :     fd = BasicOpenFile(XLOG_CONTROL_FILE,
    4498             :                        O_RDWR | PG_BINARY);
    4499        1946 :     if (fd < 0)
    4500           0 :         ereport(PANIC,
    4501             :                 (errcode_for_file_access(),
    4502             :                  errmsg("could not open file \"%s\": %m",
    4503             :                         XLOG_CONTROL_FILE)));
    4504             : 
    4505        1946 :     pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
    4506        1946 :     r = read(fd, ControlFile, sizeof(ControlFileData));
    4507        1946 :     if (r != sizeof(ControlFileData))
    4508             :     {
    4509           0 :         if (r < 0)
    4510           0 :             ereport(PANIC,
    4511             :                     (errcode_for_file_access(),
    4512             :                      errmsg("could not read file \"%s\": %m",
    4513             :                             XLOG_CONTROL_FILE)));
    4514             :         else
    4515           0 :             ereport(PANIC,
    4516             :                     (errcode(ERRCODE_DATA_CORRUPTED),
    4517             :                      errmsg("could not read file \"%s\": read %d of %zu",
    4518             :                             XLOG_CONTROL_FILE, r, sizeof(ControlFileData))));
    4519             :     }
    4520        1946 :     pgstat_report_wait_end();
    4521             : 
    4522        1946 :     close(fd);
    4523             : 
    4524             :     /*
    4525             :      * Check for expected pg_control format version.  If this is wrong, the
    4526             :      * CRC check will likely fail because we'll be checking the wrong number
    4527             :      * of bytes.  Complaining about wrong version will probably be more
    4528             :      * enlightening than complaining about wrong CRC.
    4529             :      */
    4530             :     /*
                     :      * First catch a mismatching version whose low 16 bits are all zero
                     :      * while its high 16 bits are not.  The hint below treats that pattern
                     :      * as a sign of mismatched byte ordering, so both the decimal and the
                     :      * hexadecimal form of each value are reported.
                     :      */
    4531        1946 :     if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
    4532           0 :         ereport(FATAL,
    4533             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4534             :                  errmsg("database files are incompatible with server"),
    4535             :                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
    4536             :                            " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
    4537             :                            ControlFile->pg_control_version, ControlFile->pg_control_version,
    4538             :                            PG_CONTROL_VERSION, PG_CONTROL_VERSION),
    4539             :                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
    4540             : 
    4541        1946 :     if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
    4542           0 :         ereport(FATAL,
    4543             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4544             :                  errmsg("database files are incompatible with server"),
    4545             :                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
    4546             :                            " but the server was compiled with PG_CONTROL_VERSION %d.",
    4547             :                            ControlFile->pg_control_version, PG_CONTROL_VERSION),
    4548             :                  errhint("It looks like you need to initdb.")));
    4549             : 
    4550             :     /* Now check the CRC. */
    4551        1946 :     INIT_CRC32C(crc);
    4552        1946 :     COMP_CRC32C(crc,
    4553             :                 ControlFile,
    4554             :                 offsetof(ControlFileData, crc));
    4555        1946 :     FIN_CRC32C(crc);
    4556             : 
    4557        1946 :     if (!EQ_CRC32C(crc, ControlFile->crc))
    4558           0 :         ereport(FATAL,
    4559             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4560             :                  errmsg("incorrect checksum in control file")));
    4561             : 
    4562             :     /*
    4563             :      * Do compatibility checking immediately.  If the database isn't
    4564             :      * compatible with the backend executable, we want to abort before we can
    4565             :      * possibly do any damage.
    4566             :      */
    4567        1946 :     if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
    4568           0 :         ereport(FATAL,
    4569             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4570             :                  errmsg("database files are incompatible with server"),
    4571             :         /* translator: %s is a variable name and %d is its value */
    4572             :                  errdetail("The database cluster was initialized with %s %d,"
    4573             :                            " but the server was compiled with %s %d.",
    4574             :                            "CATALOG_VERSION_NO", ControlFile->catalog_version_no,
    4575             :                            "CATALOG_VERSION_NO", CATALOG_VERSION_NO),
    4576             :                  errhint("It looks like you need to initdb.")));
    4577        1946 :     if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
    4578           0 :         ereport(FATAL,
    4579             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4580             :                  errmsg("database files are incompatible with server"),
    4581             :         /* translator: %s is a variable name and %d is its value */
    4582             :                  errdetail("The database cluster was initialized with %s %d,"
    4583             :                            " but the server was compiled with %s %d.",
    4584             :                            "MAXALIGN", ControlFile->maxAlign,
    4585             :                            "MAXALIGN", MAXIMUM_ALIGNOF),
    4586             :                  errhint("It looks like you need to initdb.")));
    4587        1946 :     if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
    4588           0 :         ereport(FATAL,
    4589             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4590             :                  errmsg("database files are incompatible with server"),
    4591             :                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
    4592             :                  errhint("It looks like you need to initdb.")));
    4593        1946 :     if (ControlFile->blcksz != BLCKSZ)
    4594           0 :         ereport(FATAL,
    4595             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4596             :                  errmsg("database files are incompatible with server"),
    4597             :         /* translator: %s is a variable name and %d is its value */
    4598             :                  errdetail("The database cluster was initialized with %s %d,"
    4599             :                            " but the server was compiled with %s %d.",
    4600             :                            "BLCKSZ", ControlFile->blcksz,
    4601             :                            "BLCKSZ", BLCKSZ),
    4602             :                  errhint("It looks like you need to recompile or initdb.")));
    4603        1946 :     if (ControlFile->relseg_size != RELSEG_SIZE)
    4604           0 :         ereport(FATAL,
    4605             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4606             :                  errmsg("database files are incompatible with server"),
    4607             :         /* translator: %s is a variable name and %d is its value */
    4608             :                  errdetail("The database cluster was initialized with %s %d,"
    4609             :                            " but the server was compiled with %s %d.",
    4610             :                            "RELSEG_SIZE", ControlFile->relseg_size,
    4611             :                            "RELSEG_SIZE", RELSEG_SIZE),
    4612             :                  errhint("It looks like you need to recompile or initdb.")));
    4613        1946 :     if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
    4614           0 :         ereport(FATAL,
    4615             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4616             :                  errmsg("database files are incompatible with server"),
    4617             :         /* translator: %s is a variable name and %d is its value */
    4618             :                  errdetail("The database cluster was initialized with %s %d,"
    4619             :                            " but the server was compiled with %s %d.",
    4620             :                            "XLOG_BLCKSZ", ControlFile->xlog_blcksz,
    4621             :                            "XLOG_BLCKSZ", XLOG_BLCKSZ),
    4622             :                  errhint("It looks like you need to recompile or initdb.")));
    4623        1946 :     if (ControlFile->nameDataLen != NAMEDATALEN)
    4624           0 :         ereport(FATAL,
    4625             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4626             :                  errmsg("database files are incompatible with server"),
    4627             :         /* translator: %s is a variable name and %d is its value */
    4628             :                  errdetail("The database cluster was initialized with %s %d,"
    4629             :                            " but the server was compiled with %s %d.",
    4630             :                            "NAMEDATALEN", ControlFile->nameDataLen,
    4631             :                            "NAMEDATALEN", NAMEDATALEN),
    4632             :                  errhint("It looks like you need to recompile or initdb.")));
    4633        1946 :     if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
    4634           0 :         ereport(FATAL,
    4635             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4636             :                  errmsg("database files are incompatible with server"),
    4637             :         /* translator: %s is a variable name and %d is its value */
    4638             :                  errdetail("The database cluster was initialized with %s %d,"
    4639             :                            " but the server was compiled with %s %d.",
    4640             :                            "INDEX_MAX_KEYS", ControlFile->indexMaxKeys,
    4641             :                            "INDEX_MAX_KEYS", INDEX_MAX_KEYS),
    4642             :                  errhint("It looks like you need to recompile or initdb.")));
    4643        1946 :     if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
    4644           0 :         ereport(FATAL,
    4645             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4646             :                  errmsg("database files are incompatible with server"),
    4647             :         /* translator: %s is a variable name and %d is its value */
    4648             :                  errdetail("The database cluster was initialized with %s %d,"
    4649             :                            " but the server was compiled with %s %d.",
    4650             :                            "TOAST_MAX_CHUNK_SIZE", ControlFile->toast_max_chunk_size,
    4651             :                            "TOAST_MAX_CHUNK_SIZE", (int) TOAST_MAX_CHUNK_SIZE),
    4652             :                  errhint("It looks like you need to recompile or initdb.")));
    4653        1946 :     if (ControlFile->loblksize != LOBLKSIZE)
    4654           0 :         ereport(FATAL,
    4655             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4656             :                  errmsg("database files are incompatible with server"),
    4657             :         /* translator: %s is a variable name and %d is its value */
    4658             :                  errdetail("The database cluster was initialized with %s %d,"
    4659             :                            " but the server was compiled with %s %d.",
    4660             :                            "LOBLKSIZE", ControlFile->loblksize,
    4661             :                            "LOBLKSIZE", (int) LOBLKSIZE),
    4662             :                  errhint("It looks like you need to recompile or initdb.")));
    4663             : 
    4664             : #ifdef USE_FLOAT8_BYVAL
    4665        1946 :     if (ControlFile->float8ByVal != true)
    4666           0 :         ereport(FATAL,
    4667             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4668             :                  errmsg("database files are incompatible with server"),
    4669             :                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
    4670             :                            " but the server was compiled with USE_FLOAT8_BYVAL."),
    4671             :                  errhint("It looks like you need to recompile or initdb.")));
    4672             : #else
    4673             :     if (ControlFile->float8ByVal != false)
    4674             :         ereport(FATAL,
    4675             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4676             :                  errmsg("database files are incompatible with server"),
    4677             :                  errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
    4678             :                            " but the server was compiled without USE_FLOAT8_BYVAL."),
    4679             :                  errhint("It looks like you need to recompile or initdb.")));
    4680             : #endif
    4681             :     /* The WAL segment size is taken from the control file and validated below. */
    4682        1946 :     wal_segment_size = ControlFile->xlog_seg_size;
    4683             : 
    4684        1946 :     if (!IsValidWalSegSize(wal_segment_size))
    4685           0 :         ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4686             :                         errmsg_plural("invalid WAL segment size in control file (%d byte)",
    4687             :                                       "invalid WAL segment size in control file (%d bytes)",
    4688             :                                       wal_segment_size,
    4689             :                                       wal_segment_size),
    4690             :                         errdetail("The WAL segment size must be a power of two between 1 MB and 1 GB.")));
    4691             :     /* Expose the value read from pg_control through the wal_segment_size GUC. */
    4692        1946 :     snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
    4693        1946 :     SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
    4694             :                     PGC_S_DYNAMIC_DEFAULT);
    4695             : 
    4696             :     /* check and update variables dependent on wal_segment_size */
    4697        1946 :     if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2)
    4698           0 :         ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4699             :         /* translator: both %s are GUC names */
    4700             :                         errmsg("\"%s\" must be at least twice \"%s\"",
    4701             :                                "min_wal_size", "wal_segment_size")));
    4702             : 
    4703        1946 :     if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2)
    4704           0 :         ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4705             :         /* translator: both %s are GUC names */
    4706             :                         errmsg("\"%s\" must be at least twice \"%s\"",
    4707             :                                "max_wal_size", "wal_segment_size")));
    4708             :     /* Per-page usable bytes times pages per segment, less the extra size of the long page header. */
    4709        1946 :     UsableBytesInSegment =
    4710        1946 :         (wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) -
    4711             :         (SizeOfXLogLongPHD - SizeOfXLogShortPHD);
    4712             : 
    4713        1946 :     CalculateCheckpointSegments();
    4714             : 
    4715             :     /* Make the initdb settings visible as GUC variables, too */
    4716        1946 :     SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
    4717             :                     PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
    4718        1946 : }
    4719             : 
    4720             : /*
    4721             :  * Utility wrapper to update the control file.  Note that the control
    4722             :  * file gets flushed.
                     :  *
                     :  * Calls update_controlfile() with DataDir, the *ControlFile image, and
                     :  * true as the last argument (presumably the flag that requests the flush
                     :  * mentioned above -- see update_controlfile() to confirm).
    4723             :  */
    4724             : static void
    4725       17828 : UpdateControlFile(void)
    4726             : {
    4727       17828 :     update_controlfile(DataDir, ControlFile, true);
    4728       17828 : }
    4729             : 
    4730             : /*
    4731             :  * Returns the unique system identifier from control file.
                     :  *
                     :  * The value is read from ControlFile->system_identifier; ControlFile must
                     :  * already point to the control file image (this is what the Assert
                     :  * checks).
    4732             :  */
    4733             : uint64
    4734        2708 : GetSystemIdentifier(void)
    4735             : {
    4736             :     Assert(ControlFile != NULL);
    4737        2708 :     return ControlFile->system_identifier;
    4738             : }
    4739             : 
    4740             : /*
    4741             :  * Returns the random nonce from control file.
                     :  *
                     :  * The returned pointer refers to the mock_authentication_nonce field
                     :  * inside *ControlFile itself, not to a copy.  InitControlFile() fills
                     :  * that field with MOCK_AUTH_NONCE_LEN random bytes.
                     :  *
                     :  * NOTE(review): the data is presumably not NUL-terminated, so callers
                     :  * should use MOCK_AUTH_NONCE_LEN rather than strlen() -- confirm.
    4742             :  */
    4743             : char *
    4744           2 : GetMockAuthenticationNonce(void)
    4745             : {
    4746             :     Assert(ControlFile != NULL);
    4747           2 :     return ControlFile->mock_authentication_nonce;
    4748             : }
    4749             : 
    4750             : /*
    4751             :  * Are checksums enabled for data pages?
                     :  *
                     :  * Returns true exactly when ControlFile->data_checksum_version is greater
                     :  * than zero.  ControlFile must already point to the control file image
                     :  * (this is what the Assert checks).
    4752             :  */
    4753             : bool
    4754    20480240 : DataChecksumsEnabled(void)
    4755             : {
    4756             :     Assert(ControlFile != NULL);
    4757    20480240 :     return (ControlFile->data_checksum_version > 0);
    4758             : }
    4759             : 
    4760             : /*
    4761             :  * Return true if the cluster was initialized on a platform where the
    4762             :  * default signedness of char is "signed". This function exists for code
    4763             :  * that deals with pre-v18 data files that store data sorted by the 'char'
    4764             :  * type on disk (e.g., GIN and GiST indexes). See the comments in
    4765             :  * WriteControlFile() for details.
                     :  *
                     :  * The value returned is ControlFile->default_char_signedness.
                     :  *
                     :  * NOTE(review): unlike the neighboring accessors, this one does not
                     :  * Assert that ControlFile is non-NULL before dereferencing it -- confirm
                     :  * whether that is intentional.
    4766             :  */
    4767             : bool
    4768           6 : GetDefaultCharSignedness(void)
    4769             : {
    4770           6 :     return ControlFile->default_char_signedness;
    4771             : }
    4772             : 
    4773             : /*
    4774             :  * Returns a fake LSN for unlogged relations.
    4775             :  *
    4776             :  * Each call generates an LSN that is greater than any previous value
    4777             :  * returned. The current counter value is saved and restored across clean
    4778             :  * shutdowns, but like unlogged relations, does not survive a crash. This can
    4779             :  * be used in lieu of real LSN values returned by XLogInsert, if you need an
    4780             :  * LSN-like increasing sequence of numbers without writing any WAL.
                     :  *
                     :  * The counter is XLogCtl->unloggedLSN and is advanced by one per call
                     :  * with an atomic fetch-and-add; going by that primitive's name, the value
                     :  * returned is presumably the counter's value before the increment.
    4781             :  */
    4782             : XLogRecPtr
    4783          66 : GetFakeLSNForUnloggedRel(void)
    4784             : {
    4785          66 :     return pg_atomic_fetch_add_u64(&XLogCtl->unloggedLSN, 1);
    4786             : }
    4787             : 
    4788             : /*
    4789             :  * Auto-tune the number of XLOG buffers.
    4790             :  *
    4791             :  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
    4792             :  * a maximum of one XLOG segment (there is little reason to think that more
    4793             :  * is helpful, at least so long as we force an fsync when switching log files)
    4794             :  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
    4795             :  * 9.1, when auto-tuning was added).
    4796             :  *
    4797             :  * This should not be called until NBuffers has received its final value.
                     :  *
                     :  * Concretely, the result is NBuffers / 32, capped at
                     :  * wal_segment_size / XLOG_BLCKSZ and then raised to at least 8.  Because
                     :  * the lower bound is applied last, it wins if one segment holds fewer
                     :  * than 8 blocks.
    4798             :  */
    4799             : static int
    4800        2130 : XLOGChooseNumBuffers(void)
    4801             : {
    4802             :     int         xbuffers;
    4803             : 
    4804        2130 :     xbuffers = NBuffers / 32;
    4805        2130 :     if (xbuffers > (wal_segment_size / XLOG_BLCKSZ))
    4806          54 :         xbuffers = (wal_segment_size / XLOG_BLCKSZ);
    4807        2130 :     if (xbuffers < 8)
    4808         778 :         xbuffers = 8;
    4809        2130 :     return xbuffers;
    4810             : }
    4811             : 
    4812             : /*
    4813             :  * GUC check_hook for wal_buffers
                     :  *
                     :  * *newval is the proposed setting and may be rewritten in place: -1 is
                     :  * replaced by the auto-tuned value (unless XLOGbuffers itself is still
                     :  * -1, in which case -1 is left alone), and any resulting value below 4
                     :  * is raised to 4.  The extra and source arguments are not used.
                     :  *
                     :  * Always returns true, i.e. this hook never rejects a value.
    4814             :  */
    4815             : bool
    4816        4338 : check_wal_buffers(int *newval, void **extra, GucSource source)
    4817             : {
    4818             :     /*
    4819             :      * -1 indicates a request for auto-tune.
    4820             :      */
    4821        4338 :     if (*newval == -1)
    4822             :     {
    4823             :         /*
    4824             :          * If we haven't yet changed the boot_val default of -1, just let it
    4825             :          * be.  We'll fix it when XLOGShmemSize is called.
    4826             :          */
    4827        2208 :         if (XLOGbuffers == -1)
    4828        2208 :             return true;
    4829             : 
    4830             :         /* Otherwise, substitute the auto-tune value */
    4831           0 :         *newval = XLOGChooseNumBuffers();
    4832             :     }
    4833             : 
    4834             :     /*
    4835             :      * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
    4836             :      * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
    4837             :      * the case, we just silently treat such values as a request for the
    4838             :      * minimum.  (We could throw an error instead, but that doesn't seem very
    4839             :      * helpful.)
    4840             :      */
    4841        2130 :     if (*newval < 4)
    4842           0 :         *newval = 4;
    4843             : 
    4844        2130 :     return true;
    4845             : }
    4846             : 
    4847             : /*
    4848             :  * GUC check_hook for wal_consistency_checking
    4849             :  */
    4850             : bool
    4851        4000 : check_wal_consistency_checking(char **newval, void **extra, GucSource source)
    4852             : {
    4853             :     char       *rawstring;
    4854             :     List       *elemlist;
    4855             :     ListCell   *l;
    4856             :     bool        newwalconsistency[RM_MAX_ID + 1];
    4857             : 
    4858             :     /* Initialize the array */
    4859      132000 :     MemSet(newwalconsistency, 0, (RM_MAX_ID + 1) * sizeof(bool));
    4860             : 
    4861             :     /* Need a modifiable copy of string */
    4862        4000 :     rawstring = pstrdup(*newval);
    4863             : 
    4864             :     /* Parse string into list of identifiers */
    4865        4000 :     if (!SplitIdentifierString(rawstring, ',', &elemlist))
    4866             :     {
    4867             :         /* syntax error in list */
    4868           0 :         GUC_check_errdetail("List syntax is invalid.");
    4869           0 :         pfree(rawstring);
    4870           0 :         list_free(elemlist);
    4871           0 :         return false;
    4872             :     }
    4873             : 
    4874        4902 :     foreach(l, elemlist)
    4875             :     {
    4876         902 :         char       *tok = (char *) lfirst(l);
    4877             :         int         rmid;
    4878             : 
    4879             :         /* Check for 'all'. */
    4880         902 :         if (pg_strcasecmp(tok, "all") == 0)
    4881             :         {
    4882      230786 :             for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
    4883      229888 :                 if (RmgrIdExists(rmid) && GetRmgr(rmid).rm_mask != NULL)
    4884        8980 :                     newwalconsistency[rmid] = true;
    4885             :         }
    4886             :         else
    4887             :         {
    4888             :             /* Check if the token matches any known resource manager. */
    4889           4 :             bool        found = false;
    4890             : 
    4891          72 :             for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
    4892             :             {
    4893         108 :                 if (RmgrIdExists(rmid) && GetRmgr(rmid).rm_mask != NULL &&
    4894          36 :                     pg_strcasecmp(tok, GetRmgr(rmid).rm_name) == 0)
    4895             :                 {
    4896           4 :                     newwalconsistency[rmid] = true;
    4897           4 :                     found = true;
    4898           4 :                     break;
    4899             :                 }
    4900             :             }
    4901           4 :             if (!found)
    4902             :             {
    4903             :                 /*
    4904             :                  * During startup, it might be a not-yet-loaded custom
    4905             :                  * resource manager.  Defer checking until
    4906             :                  * InitializeWalConsistencyChecking().
    4907             :                  */
    4908           0 :                 if (!process_shared_preload_libraries_done)
    4909             :                 {
    4910           0 :                     check_wal_consistency_checking_deferred = true;
    4911             :                 }
    4912             :                 else
    4913             :                 {
    4914           0 :                     GUC_check_errdetail("Unrecognized key word: \"%s\".", tok);
    4915           0 :                     pfree(rawstring);
    4916           0 :                     list_free(elemlist);
    4917           0 :                     return false;
    4918             :                 }
    4919             :             }
    4920             :         }
    4921             :     }
    4922             : 
    4923        4000 :     pfree(rawstring);
    4924        4000 :     list_free(elemlist);
    4925             : 
    4926             :     /* assign new value */
    4927        4000 :     *extra = guc_malloc(LOG, (RM_MAX_ID + 1) * sizeof(bool));
    4928        4000 :     if (!*extra)
    4929           0 :         return false;
    4930        4000 :     memcpy(*extra, newwalconsistency, (RM_MAX_ID + 1) * sizeof(bool));
    4931        4000 :     return true;
    4932             : }
    4933             : 
    4934             : /*
    4935             :  * GUC assign_hook for wal_consistency_checking
    4936             :  */
    4937             : void
    4938        3998 : assign_wal_consistency_checking(const char *newval, void *extra)
    4939             : {
    4940             :     /*
    4941             :      * If some checks were deferred, it's possible that the checks will fail
    4942             :      * later during InitializeWalConsistencyChecking(). But in that case, the
    4943             :      * postmaster will exit anyway, so it's safe to proceed with the
    4944             :      * assignment.
    4945             :      *
    4946             :      * Any built-in resource managers specified are assigned immediately,
    4947             :      * which affects WAL created before shared_preload_libraries are
    4948             :      * processed. Any custom resource managers specified won't be assigned
    4949             :      * until after shared_preload_libraries are processed, but that's OK
    4950             :      * because WAL for a custom resource manager can't be written before the
    4951             :      * module is loaded anyway.
    4952             :      */
    4953        3998 :     wal_consistency_checking = extra;
    4954        3998 : }
    4955             : 
    4956             : /*
    4957             :  * InitializeWalConsistencyChecking: run after loading custom resource managers
    4958             :  *
    4959             :  * If any unknown resource managers were specified in the
    4960             :  * wal_consistency_checking GUC, processing was deferred.  Now that
    4961             :  * shared_preload_libraries have been loaded, process wal_consistency_checking
    4962             :  * again.
    4963             :  */
    4964             : void
    4965        1824 : InitializeWalConsistencyChecking(void)
    4966             : {
    4967             :     Assert(process_shared_preload_libraries_done);
    4968             : 
    4969        1824 :     if (check_wal_consistency_checking_deferred)
    4970             :     {
    4971             :         struct config_generic *guc;
    4972             : 
    4973           0 :         guc = find_option("wal_consistency_checking", false, false, ERROR);
    4974             : 
    4975           0 :         check_wal_consistency_checking_deferred = false;
    4976             : 
    4977           0 :         set_config_option_ext("wal_consistency_checking",
    4978             :                               wal_consistency_checking_string,
    4979             :                               guc->scontext, guc->source, guc->srole,
    4980             :                               GUC_ACTION_SET, true, ERROR, false);
    4981             : 
    4982             :         /* checking should not be deferred again */
    4983             :         Assert(!check_wal_consistency_checking_deferred);
    4984             :     }
    4985        1824 : }
    4986             : 
    4987             : /*
    4988             :  * GUC show_hook for archive_command
    4989             :  */
    4990             : const char *
    4991        3556 : show_archive_command(void)
    4992             : {
    4993        3556 :     if (XLogArchivingActive())
    4994           4 :         return XLogArchiveCommand;
    4995             :     else
    4996        3552 :         return "(disabled)";
    4997             : }
    4998             : 
    4999             : /*
    5000             :  * GUC show_hook for in_hot_standby
    5001             :  */
    5002             : const char *
    5003       29150 : show_in_hot_standby(void)
    5004             : {
    5005             :     /*
    5006             :      * We display the actual state based on shared memory, so that this GUC
    5007             :      * reports up-to-date state if examined intra-query.  The underlying
    5008             :      * variable (in_hot_standby_guc) changes only when we transmit a new value
    5009             :      * to the client.
    5010             :      */
    5011       29150 :     return RecoveryInProgress() ? "on" : "off";
    5012             : }
    5013             : 
    5014             : /*
    5015             :  * Read the control file, set respective GUCs.
    5016             :  *
    5017             :  * This is to be called during startup, including a crash recovery cycle,
    5018             :  * unless in bootstrap mode, where no control file yet exists.  As there's no
    5019             :  * usable shared memory yet (its sizing can depend on the contents of the
    5020             :  * control file!), first store the contents in local memory. XLOGShmemInit()
    5021             :  * will then copy it to shared memory later.
    5022             :  *
    5023             :  * reset just controls whether previous contents are to be expected (in the
    5024             :  * reset case, there's a dangling pointer into old shared memory), or not.
    5025             :  */
    5026             : void
    5027        1844 : LocalProcessControlFile(bool reset)
    5028             : {
    5029             :     Assert(reset || ControlFile == NULL);
    5030        1844 :     ControlFile = palloc(sizeof(ControlFileData));
    5031        1844 :     ReadControlFile();
    5032        1844 : }
    5033             : 
    5034             : /*
    5035             :  * Get the wal_level from the control file. For a standby, this value should be
    5036             :  * considered as its active wal_level, because it may be different from what
    5037             :  * was originally configured on standby.
    5038             :  */
    5039             : WalLevel
    5040         138 : GetActiveWalLevelOnStandby(void)
    5041             : {
    5042         138 :     return ControlFile->wal_level;
    5043             : }
    5044             : 
    5045             : /*
    5046             :  * Initialization of shared memory for XLOG
    5047             :  */
    5048             : Size
    5049        6096 : XLOGShmemSize(void)
    5050             : {
    5051             :     Size        size;
    5052             : 
    5053             :     /*
    5054             :      * If the value of wal_buffers is -1, use the preferred auto-tune value.
    5055             :      * This isn't an amazingly clean place to do this, but we must wait till
    5056             :      * NBuffers has received its final value, and must do it before using the
    5057             :      * value of XLOGbuffers to do anything important.
    5058             :      *
    5059             :      * We prefer to report this value's source as PGC_S_DYNAMIC_DEFAULT.
    5060             :      * However, if the DBA explicitly set wal_buffers = -1 in the config file,
    5061             :      * then PGC_S_DYNAMIC_DEFAULT will fail to override that and we must force
    5062             :      * the matter with PGC_S_OVERRIDE.
    5063             :      */
    5064        6096 :     if (XLOGbuffers == -1)
    5065             :     {
    5066             :         char        buf[32];
    5067             : 
    5068        2130 :         snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
    5069        2130 :         SetConfigOption("wal_buffers", buf, PGC_POSTMASTER,
    5070             :                         PGC_S_DYNAMIC_DEFAULT);
    5071        2130 :         if (XLOGbuffers == -1)  /* failed to apply it? */
    5072           0 :             SetConfigOption("wal_buffers", buf, PGC_POSTMASTER,
    5073             :                             PGC_S_OVERRIDE);
    5074             :     }
    5075             :     Assert(XLOGbuffers > 0);
    5076             : 
    5077             :     /* XLogCtl */
    5078        6096 :     size = sizeof(XLogCtlData);
    5079             : 
    5080             :     /* WAL insertion locks, plus alignment */
    5081        6096 :     size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
    5082             :     /* xlblocks array */
    5083        6096 :     size = add_size(size, mul_size(sizeof(pg_atomic_uint64), XLOGbuffers));
    5084             :     /* extra alignment padding for XLOG I/O buffers */
    5085        6096 :     size = add_size(size, Max(XLOG_BLCKSZ, PG_IO_ALIGN_SIZE));
    5086             :     /* and the buffers themselves */
    5087        6096 :     size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
    5088             : 
    5089             :     /*
    5090             :      * Note: we don't count ControlFileData, it comes out of the "slop factor"
    5091             :      * added by CreateSharedMemoryAndSemaphores.  This lets us use this
    5092             :      * routine again below to compute the actual allocation size.
    5093             :      */
    5094             : 
    5095        6096 :     return size;
    5096             : }
    5097             : 
    5098             : void
    5099        2134 : XLOGShmemInit(void)
    5100             : {
    5101             :     bool        foundCFile,
    5102             :                 foundXLog;
    5103             :     char       *allocptr;
    5104             :     int         i;
    5105             :     ControlFileData *localControlFile;
    5106             : 
    5107             : #ifdef WAL_DEBUG
    5108             : 
    5109             :     /*
    5110             :      * Create a memory context for WAL debugging that's exempt from the normal
    5111             :      * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
    5112             :      * an allocation fails, but wal_debug is not for production use anyway.
    5113             :      */
    5114             :     if (walDebugCxt == NULL)
    5115             :     {
    5116             :         walDebugCxt = AllocSetContextCreate(TopMemoryContext,
    5117             :                                             "WAL Debug",
    5118             :                                             ALLOCSET_DEFAULT_SIZES);
    5119             :         MemoryContextAllowInCriticalSection(walDebugCxt, true);
    5120             :     }
    5121             : #endif
    5122             : 
    5123             : 
    5124        2134 :     XLogCtl = (XLogCtlData *)
    5125        2134 :         ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
    5126             : 
    5127        2134 :     localControlFile = ControlFile;
    5128        2134 :     ControlFile = (ControlFileData *)
    5129        2134 :         ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
    5130             : 
    5131        2134 :     if (foundCFile || foundXLog)
    5132             :     {
    5133             :         /* both should be present or neither */
    5134             :         Assert(foundCFile && foundXLog);
    5135             : 
    5136             :         /* Initialize local copy of WALInsertLocks */
    5137           0 :         WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
    5138             : 
    5139           0 :         if (localControlFile)
    5140           0 :             pfree(localControlFile);
    5141           0 :         return;
    5142             :     }
    5143        2134 :     memset(XLogCtl, 0, sizeof(XLogCtlData));
    5144             : 
    5145             :     /*
    5146             :      * Already have read control file locally, unless in bootstrap mode. Move
    5147             :      * contents into shared memory.
    5148             :      */
    5149        2134 :     if (localControlFile)
    5150             :     {
    5151        1828 :         memcpy(ControlFile, localControlFile, sizeof(ControlFileData));
    5152        1828 :         pfree(localControlFile);
    5153             :     }
    5154             : 
    5155             :     /*
    5156             :      * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
    5157             :      * multiple of the alignment for same, so no extra alignment padding is
    5158             :      * needed here.
    5159             :      */
    5160        2134 :     allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
    5161        2134 :     XLogCtl->xlblocks = (pg_atomic_uint64 *) allocptr;
    5162        2134 :     allocptr += sizeof(pg_atomic_uint64) * XLOGbuffers;
    5163             : 
    5164      629808 :     for (i = 0; i < XLOGbuffers; i++)
    5165             :     {
    5166      627674 :         pg_atomic_init_u64(&XLogCtl->xlblocks[i], InvalidXLogRecPtr);
    5167             :     }
    5168             : 
    5169             :     /* WAL insertion locks. Ensure they're aligned to the full padded size */
    5170        2134 :     allocptr += sizeof(WALInsertLockPadded) -
    5171        2134 :         ((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
    5172        2134 :     WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
    5173             :         (WALInsertLockPadded *) allocptr;
    5174        2134 :     allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;
    5175             : 
    5176       19206 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    5177             :     {
    5178       17072 :         LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
    5179       17072 :         pg_atomic_init_u64(&WALInsertLocks[i].l.insertingAt, InvalidXLogRecPtr);
    5180       17072 :         WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
    5181             :     }
    5182             : 
    5183             :     /*
    5184             :      * Align the start of the page buffers to a full xlog block size boundary.
    5185             :      * This simplifies some calculations in XLOG insertion. It is also
    5186             :      * required for O_DIRECT.
    5187             :      */
    5188        2134 :     allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
    5189        2134 :     XLogCtl->pages = allocptr;
    5190        2134 :     memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
    5191             : 
    5192             :     /*
    5193             :      * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
    5194             :      * in additional info.)
    5195             :      */
    5196        2134 :     XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
    5197        2134 :     XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
    5198        2134 :     XLogCtl->InstallXLogFileSegmentActive = false;
    5199        2134 :     XLogCtl->WalWriterSleeping = false;
    5200             : 
    5201        2134 :     SpinLockInit(&XLogCtl->Insert.insertpos_lck);
    5202        2134 :     SpinLockInit(&XLogCtl->info_lck);
    5203        2134 :     pg_atomic_init_u64(&XLogCtl->logInsertResult, InvalidXLogRecPtr);
    5204        2134 :     pg_atomic_init_u64(&XLogCtl->logWriteResult, InvalidXLogRecPtr);
    5205        2134 :     pg_atomic_init_u64(&XLogCtl->logFlushResult, InvalidXLogRecPtr);
    5206        2134 :     pg_atomic_init_u64(&XLogCtl->unloggedLSN, InvalidXLogRecPtr);
    5207             : 
    5208        2134 :     pg_atomic_init_u64(&XLogCtl->InitializeReserved, InvalidXLogRecPtr);
    5209        2134 :     pg_atomic_init_u64(&XLogCtl->InitializedUpTo, InvalidXLogRecPtr);
    5210        2134 :     ConditionVariableInit(&XLogCtl->InitializedUpToCondVar);
    5211             : }
    5212             : 
    5213             : /*
    5214             :  * This func must be called ONCE on system install.  It creates pg_control
    5215             :  * and the initial XLOG segment.
    5216             :  */
    5217             : void
    5218         102 : BootStrapXLOG(uint32 data_checksum_version)
    5219             : {
    5220             :     CheckPoint  checkPoint;
    5221             :     char       *buffer;
    5222             :     XLogPageHeader page;
    5223             :     XLogLongPageHeader longpage;
    5224             :     XLogRecord *record;
    5225             :     char       *recptr;
    5226             :     uint64      sysidentifier;
    5227             :     struct timeval tv;
    5228             :     pg_crc32c   crc;
    5229             : 
    5230             :     /* allow ordinary WAL segment creation, like StartupXLOG() would */
    5231         102 :     SetInstallXLogFileSegmentActive();
    5232             : 
    5233             :     /*
    5234             :      * Select a hopefully-unique system identifier code for this installation.
    5235             :      * We use the result of gettimeofday(), including the fractional seconds
    5236             :      * field, as being about as unique as we can easily get.  (Think not to
    5237             :      * use random(), since it hasn't been seeded and there's no portable way
    5238             :      * to seed it other than the system clock value...)  The upper half of the
    5239             :      * uint64 value is just the tv_sec part, while the lower half contains the
    5240             :      * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
    5241             :      * PID for a little extra uniqueness.  A person knowing this encoding can
    5242             :      * determine the initialization time of the installation, which could
    5243             :      * perhaps be useful sometimes.
    5244             :      */
    5245         102 :     gettimeofday(&tv, NULL);
    5246         102 :     sysidentifier = ((uint64) tv.tv_sec) << 32;
    5247         102 :     sysidentifier |= ((uint64) tv.tv_usec) << 12;
    5248         102 :     sysidentifier |= getpid() & 0xFFF;
    5249             : 
    5250             :     /* page buffer must be aligned suitably for O_DIRECT */
    5251         102 :     buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
    5252         102 :     page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
    5253         102 :     memset(page, 0, XLOG_BLCKSZ);
    5254             : 
    5255             :     /*
    5256             :      * Set up information for the initial checkpoint record
    5257             :      *
    5258             :      * The initial checkpoint record is written to the beginning of the WAL
    5259             :      * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
    5260             :      * used, so that we can use 0/0 to mean "before any valid WAL segment".
    5261             :      */
    5262         102 :     checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD;
    5263         102 :     checkPoint.ThisTimeLineID = BootstrapTimeLineID;
    5264         102 :     checkPoint.PrevTimeLineID = BootstrapTimeLineID;
    5265         102 :     checkPoint.fullPageWrites = fullPageWrites;
    5266         102 :     checkPoint.wal_level = wal_level;
    5267             :     checkPoint.nextXid =
    5268         102 :         FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
    5269         102 :     checkPoint.nextOid = FirstGenbkiObjectId;
    5270         102 :     checkPoint.nextMulti = FirstMultiXactId;
    5271         102 :     checkPoint.nextMultiOffset = 0;
    5272         102 :     checkPoint.oldestXid = FirstNormalTransactionId;
    5273         102 :     checkPoint.oldestXidDB = Template1DbOid;
    5274         102 :     checkPoint.oldestMulti = FirstMultiXactId;
    5275         102 :     checkPoint.oldestMultiDB = Template1DbOid;
    5276         102 :     checkPoint.oldestCommitTsXid = InvalidTransactionId;
    5277         102 :     checkPoint.newestCommitTsXid = InvalidTransactionId;
    5278         102 :     checkPoint.time = (pg_time_t) time(NULL);
    5279         102 :     checkPoint.oldestActiveXid = InvalidTransactionId;
    5280             : 
    5281         102 :     TransamVariables->nextXid = checkPoint.nextXid;
    5282         102 :     TransamVariables->nextOid = checkPoint.nextOid;
    5283         102 :     TransamVariables->oidCount = 0;
    5284         102 :     MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
    5285         102 :     AdvanceOldestClogXid(checkPoint.oldestXid);
    5286         102 :     SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
    5287         102 :     SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
    5288         102 :     SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
    5289             : 
    5290             :     /* Set up the XLOG page header */
    5291         102 :     page->xlp_magic = XLOG_PAGE_MAGIC;
    5292         102 :     page->xlp_info = XLP_LONG_HEADER;
    5293         102 :     page->xlp_tli = BootstrapTimeLineID;
    5294         102 :     page->xlp_pageaddr = wal_segment_size;
    5295         102 :     longpage = (XLogLongPageHeader) page;
    5296         102 :     longpage->xlp_sysid = sysidentifier;
    5297         102 :     longpage->xlp_seg_size = wal_segment_size;
    5298         102 :     longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
    5299             : 
    5300             :     /* Insert the initial checkpoint record */
    5301         102 :     recptr = ((char *) page + SizeOfXLogLongPHD);
    5302         102 :     record = (XLogRecord *) recptr;
    5303         102 :     record->xl_prev = 0;
    5304         102 :     record->xl_xid = InvalidTransactionId;
    5305         102 :     record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
    5306         102 :     record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
    5307         102 :     record->xl_rmid = RM_XLOG_ID;
    5308         102 :     recptr += SizeOfXLogRecord;
    5309             :     /* fill the XLogRecordDataHeaderShort struct */
    5310         102 :     *(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
    5311         102 :     *(recptr++) = sizeof(checkPoint);
    5312         102 :     memcpy(recptr, &checkPoint, sizeof(checkPoint));
    5313         102 :     recptr += sizeof(checkPoint);
    5314             :     Assert(recptr - (char *) record == record->xl_tot_len);
    5315             : 
    5316         102 :     INIT_CRC32C(crc);
    5317         102 :     COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
    5318         102 :     COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
    5319         102 :     FIN_CRC32C(crc);
    5320         102 :     record->xl_crc = crc;
    5321             : 
    5322             :     /* Create first XLOG segment file */
    5323         102 :     openLogTLI = BootstrapTimeLineID;
    5324         102 :     openLogFile = XLogFileInit(1, BootstrapTimeLineID);
    5325             : 
    5326             :     /*
    5327             :      * We needn't bother with Reserve/ReleaseExternalFD here, since we'll
    5328             :      * close the file again in a moment.
    5329             :      */
    5330             : 
    5331             :     /* Write the first page with the initial record */
    5332         102 :     errno = 0;
    5333         102 :     pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
    5334         102 :     if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
    5335             :     {
    5336             :         /* if write didn't set errno, assume problem is no disk space */
    5337           0 :         if (errno == 0)
    5338           0 :             errno = ENOSPC;
    5339           0 :         ereport(PANIC,
    5340             :                 (errcode_for_file_access(),
    5341             :                  errmsg("could not write bootstrap write-ahead log file: %m")));
    5342             :     }
    5343         102 :     pgstat_report_wait_end();
    5344             : 
    5345         102 :     pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
    5346         102 :     if (pg_fsync(openLogFile) != 0)
    5347           0 :         ereport(PANIC,
    5348             :                 (errcode_for_file_access(),
    5349             :                  errmsg("could not fsync bootstrap write-ahead log file: %m")));
    5350         102 :     pgstat_report_wait_end();
    5351             : 
    5352         102 :     if (close(openLogFile) != 0)
    5353           0 :         ereport(PANIC,
    5354             :                 (errcode_for_file_access(),
    5355             :                  errmsg("could not close bootstrap write-ahead log file: %m")));
    5356             : 
    5357         102 :     openLogFile = -1;
    5358             : 
    5359             :     /* Now create pg_control */
    5360         102 :     InitControlFile(sysidentifier, data_checksum_version);
    5361         102 :     ControlFile->time = checkPoint.time;
    5362         102 :     ControlFile->checkPoint = checkPoint.redo;
    5363         102 :     ControlFile->checkPointCopy = checkPoint;
    5364             : 
    5365             :     /* some additional ControlFile fields are set in WriteControlFile() */
    5366         102 :     WriteControlFile();
    5367             : 
    5368             :     /* Bootstrap the commit log, too */
    5369         102 :     BootStrapCLOG();
    5370         102 :     BootStrapCommitTs();
    5371         102 :     BootStrapSUBTRANS();
    5372         102 :     BootStrapMultiXact();
    5373             : 
    5374         102 :     pfree(buffer);
    5375             : 
    5376             :     /*
    5377             :      * Force control file to be read - in contrast to normal processing we'd
    5378             :      * otherwise never run the checks and GUC related initializations therein.
    5379             :      */
    5380         102 :     ReadControlFile();
    5381         102 : }
    5382             : /*
                     :  * Format a timestamp for use in startup log messages.
                     :  *
                     :  * tnow is broken down with pg_localtime() using log_timezone and rendered
                     :  * with pg_strftime() as "%Y-%m-%d %H:%M:%S %Z".
                     :  *
                     :  * The result is written into a 128-byte buffer obtained from palloc() and
                     :  * handed back to the caller; str_time() itself never frees it.
                     :  */
    5383             : static char *
    5384        1642 : str_time(pg_time_t tnow)
    5385             : {
    5386        1642 :     char       *buf = palloc(128);
    5387             : 
    5388        1642 :     pg_strftime(buf, 128,
    5389             :                 "%Y-%m-%d %H:%M:%S %Z",
    5390        1642 :                 pg_localtime(&tnow, log_timezone));
    5391             : 
    5392        1642 :     return buf;
    5393             : }
    5394             : 
    5395             : /*
    5396             :  * Initialize the first WAL segment on the new timeline.
                     :  *
                     :  * endTLI is the timeline the old WAL ends on, endOfLog is the switch point,
                     :  * and newTLI is the timeline being started; endTLI and newTLI must differ.
                     :  *
                     :  * After a final UpdateMinRecoveryPoint() call, the starting segment of the
                     :  * new timeline is produced in one of two ways: if endOfLog lies in the
                     :  * middle of a segment, the old timeline's segment is copied up to the
                     :  * switch point with XLogFileCopy(); if endOfLog is exactly on a segment
                     :  * boundary, the next segment is created with XLogFileInit().  Finally, any
                     :  * archive status flags for the new segment's file name are removed.
    5397             :  */
    5398             : static void
    5399          98 : XLogInitNewTimeline(TimeLineID endTLI, XLogRecPtr endOfLog, TimeLineID newTLI)
    5400             : {
    5401             :     char        xlogfname[MAXFNAMELEN];
    5402             :     XLogSegNo   endLogSegNo;
    5403             :     XLogSegNo   startLogSegNo;
    5404             : 
    5405             :     /* we always switch to a new timeline after archive recovery */
    5406             :     Assert(endTLI != newTLI);
    5407             : 
    5408             :     /*
    5409             :      * Update min recovery point one last time.
    5410             :      */
    5411          98 :     UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
    5412             : 
    5413             :     /*
    5414             :      * Calculate the last segment on the old timeline, and the first segment
    5415             :      * on the new timeline. If the switch happens in the middle of a segment,
    5416             :      * they are the same, but if the switch happens exactly at a segment
    5417             :      * boundary, startLogSegNo will be endLogSegNo + 1.
    5418             :      */
    5419          98 :     XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size);
    5420          98 :     XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size);
    5421             : 
    5422             :     /*
    5423             :      * Initialize the starting WAL segment for the new timeline. If the switch
    5424             :      * happens in the middle of a segment, copy data from the last WAL segment
    5425             :      * of the old timeline up to the switch point, to the starting WAL segment
    5426             :      * on the new timeline.
    5427             :      */
    5428          98 :     if (endLogSegNo == startLogSegNo)
    5429             :     {
    5430             :         /*
    5431             :          * Make a copy of the file on the new timeline.
    5432             :          *
    5433             :          * Writing WAL isn't allowed yet, so there are no locking
    5434             :          * considerations. But we should be just as tense as XLogFileInit to
    5435             :          * avoid emplacing a bogus file.
    5436             :          */
    5437          80 :         XLogFileCopy(newTLI, endLogSegNo, endTLI, endLogSegNo,
    5438          80 :                      XLogSegmentOffset(endOfLog, wal_segment_size));
    5439             :     }
    5440             :     else
    5441             :     {
    5442             :         /*
    5443             :          * The switch happened at a segment boundary, so just create the next
    5444             :          * segment on the new timeline.
    5445             :          */
    5446             :         int         fd;
    5447             : 
    5448          18 :         fd = XLogFileInit(startLogSegNo, newTLI);
    5449             : 
    5450          18 :         if (close(fd) != 0)
    5451             :         {
    5452           0 :             int         save_errno = errno;
    5453             :             /* keep close()'s errno across the name formatting so %m reports it */
    5454           0 :             XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size);
    5455           0 :             errno = save_errno;
    5456           0 :             ereport(ERROR,
    5457             :                     (errcode_for_file_access(),
    5458             :                      errmsg("could not close file \"%s\": %m", xlogfname)));
    5459             :         }
    5460             :     }
    5461             : 
    5462             :     /*
    5463             :      * Let's just make real sure there are no .ready or .done flags posted
    5464             :      * for the new segment.
    5465             :      */
    5466          98 :     XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size);
    5467          98 :     XLogArchiveCleanup(xlogfname);
    5468          98 : }
    5469             : 
    5470             : /*
    5471             :  * Perform cleanup actions at the conclusion of archive recovery.
                     :  *
                     :  * Three steps are performed, in this order:
                     :  *
                     :  * 1. Run recovery_end_command, if it is set to a non-empty string.
                     :  * 2. Call RemoveNonParentXlogFiles() for EndOfLog and newTLI.
                     :  * 3. If EndOfLog falls in the middle of a segment and WAL archiving is
                     :  *    active, rename the last segment of the old timeline (named using
                     :  *    EndOfLogTLI) to "<name>.partial" and notify the archiver about it,
                     :  *    unless that segment already has a .ready or .done status file.
                     :  *
                     :  * EndOfLogTLI is the timeline of the last old-timeline segment, EndOfLog
                     :  * is the switch point, and newTLI is the timeline we switched to.
    5472             :  */
    5473             : static void
    5474          98 : CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog,
    5475             :                             TimeLineID newTLI)
    5476             : {
    5477             :     /*
    5478             :      * Execute the recovery_end_command, if any.
    5479             :      */
    5480          98 :     if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0)
    5481           4 :         ExecuteRecoveryCommand(recoveryEndCommand,
    5482             :                                "recovery_end_command",
    5483             :                                true,
    5484             :                                WAIT_EVENT_RECOVERY_END_COMMAND);
    5485             : 
    5486             :     /*
    5487             :      * We switched to a new timeline. Clean up segments on the old timeline.
    5488             :      *
    5489             :      * If there are any higher-numbered segments on the old timeline, remove
    5490             :      * them. They might contain valid WAL, but they might also be
    5491             :      * pre-allocated files containing garbage. In any case, they are not part
    5492             :      * of the new timeline's history so we don't need them.
    5493             :      */
    5494          98 :     RemoveNonParentXlogFiles(EndOfLog, newTLI);
    5495             : 
    5496             :     /*
    5497             :      * If the switch happened in the middle of a segment, what to do with the
    5498             :      * last, partial segment on the old timeline? If we don't archive it, and
    5499             :      * the server that created the WAL never archives it either (e.g. because
    5500             :      * it was hit by a meteor), it will never make it to the archive. That's
    5501             :      * OK from our point of view, because the new segment that we created with
    5502             :      * the new TLI contains all the WAL from the old timeline up to the switch
    5503             :      * point. But if you later try to do PITR to the "missing" WAL on the old
    5504             :      * timeline, recovery won't find it in the archive. It's physically
    5505             :      * present in the new file with new TLI, but recovery won't look there
    5506             :      * when it's recovering to the older timeline. On the other hand, if we
    5507             :      * archive the partial segment, and the original server on that timeline
    5508             :      * is still running and archives the completed version of the same segment
    5509             :      * later, it will fail. (We used to do that in 9.4 and below, and it
    5510             :      * caused such problems).
    5511             :      *
    5512             :      * As a compromise, we rename the last segment with the .partial suffix,
    5513             :      * and archive it. Archive recovery will never try to read .partial
    5514             :      * segments, so they will normally go unused. But in the odd PITR case,
    5515             :      * the administrator can copy them manually to the pg_wal directory
    5516             :      * (removing the suffix). They can be useful in debugging, too.
    5517             :      *
    5518             :      * If a .done or .ready file already exists for the old timeline, however,
    5519             :      * we had already determined that the segment is complete, so we can let
    5520             :      * it be archived normally. (In particular, if it was restored from the
    5521             :      * archive to begin with, it's expected to have a .done file).
    5522             :      */
    5523          98 :     if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
    5524             :         XLogArchivingActive())
    5525             :     {
    5526             :         char        origfname[MAXFNAMELEN];
    5527             :         XLogSegNo   endLogSegNo;
    5528             : 
    5529          20 :         XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
    5530          20 :         XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
    5531             : 
    5532          20 :         if (!XLogArchiveIsReadyOrDone(origfname))
    5533             :         {
    5534             :             char        origpath[MAXPGPATH];
    5535             :             char        partialfname[MAXFNAMELEN];
    5536             :             char        partialpath[MAXPGPATH];
    5537             : 
    5538             :             /*
    5539             :              * If we're summarizing WAL, we can't rename the partial file
    5540             :              * until the summarizer finishes with it, else it will fail.
    5541             :              */
    5542          12 :             if (summarize_wal)
    5543           2 :                 WaitForWalSummarization(EndOfLog);
    5544             : 
    5545          12 :             XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
    5546          12 :             snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
    5547          12 :             snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
    5548             : 
    5549             :             /*
    5550             :              * Make sure there's no .done or .ready file for the .partial
    5551             :              * file.
    5552             :              */
    5553          12 :             XLogArchiveCleanup(partialfname);
    5554             : 
    5555          12 :             durable_rename(origpath, partialpath, ERROR);
    5556          12 :             XLogArchiveNotify(partialfname);
    5557             :         }
    5558             :     }
    5559          98 : }
    5560             : 
    5561             : /*
    5562             :  * Check to see if required parameters are set high enough on this server
    5563             :  * for various aspects of recovery operation.
    5564             :  *
                     :  * Both checks apply only when archive recovery was requested.  The first
                     :  * raises FATAL if the control file records wal_level=minimal.  The second,
                     :  * performed only when hot standby is enabled, hands each local setting and
                     :  * the corresponding value stored in the control file to
                     :  * RecoveryRequiresIntParameter().
                     :  *
    5565             :  * Note that all the parameters which this function tests need to be
    5566             :  * listed in the Administrator's Overview section in high-availability.sgml.
    5567             :  * If you change them, don't forget to update the list.
    5568             :  */
    5569             : static void
    5570         490 : CheckRequiredParameterValues(void)
    5571             : {
    5572             :     /*
    5573             :      * For archive recovery, the WAL must be generated with at least 'replica'
    5574             :      * wal_level.
    5575             :      */
    5576         490 :     if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
    5577             :     {
    5578           4 :         ereport(FATAL,
    5579             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    5580             :                  errmsg("WAL was generated with \"wal_level=minimal\", cannot continue recovering"),
    5581             :                  errdetail("This happens if you temporarily set \"wal_level=minimal\" on the server."),
    5582             :                  errhint("Use a backup taken after setting \"wal_level\" to higher than \"minimal\".")));
    5583             :     }
    5584             : 
    5585             :     /*
    5586             :      * For Hot Standby, the WAL must be generated with 'replica' mode, and we
    5587             :      * must have at least as many backend slots as the primary.  Each call
                     :      * below passes the parameter name, this server's current setting, and
                     :      * the value recorded in the control file.
    5588             :      */
    5589         486 :     if (ArchiveRecoveryRequested && EnableHotStandby)
    5590             :     {
    5591             :         /* We ignore autovacuum_worker_slots when we make this test. */
    5592         246 :         RecoveryRequiresIntParameter("max_connections",
    5593             :                                      MaxConnections,
    5594         246 :                                      ControlFile->MaxConnections);
    5595         246 :         RecoveryRequiresIntParameter("max_worker_processes",
    5596             :                                      max_worker_processes,
    5597         246 :                                      ControlFile->max_worker_processes);
    5598         246 :         RecoveryRequiresIntParameter("max_wal_senders",
    5599             :                                      max_wal_senders,
    5600         246 :                                      ControlFile->max_wal_senders);
    5601         246 :         RecoveryRequiresIntParameter("max_prepared_transactions",
    5602             :                                      max_prepared_xacts,
    5603         246 :                                      ControlFile->max_prepared_xacts);
    5604         246 :         RecoveryRequiresIntParameter("max_locks_per_transaction",
    5605             :                                      max_locks_per_xact,
    5606         246 :                                      ControlFile->max_locks_per_xact);
    5607             :     }
    5608         486 : }
    5609             : 
    5610             : /*
    5611             :  * This must be called ONCE during postmaster or standalone-backend startup
    5612             :  */
    5613             : void
    5614        1846 : StartupXLOG(void)
    5615             : {
    5616             :     XLogCtlInsert *Insert;
    5617             :     CheckPoint  checkPoint;
    5618             :     bool        wasShutdown;
    5619             :     bool        didCrash;
    5620             :     bool        haveTblspcMap;
    5621             :     bool        haveBackupLabel;
    5622             :     XLogRecPtr  EndOfLog;
    5623             :     TimeLineID  EndOfLogTLI;
    5624             :     TimeLineID  newTLI;
    5625             :     bool        performedWalRecovery;
    5626             :     EndOfWalRecoveryInfo *endOfRecoveryInfo;
    5627             :     XLogRecPtr  abortedRecPtr;
    5628             :     XLogRecPtr  missingContrecPtr;
    5629             :     TransactionId oldestActiveXID;
    5630        1846 :     bool        promoted = false;
    5631             : 
    5632             :     /*
    5633             :      * We should have an aux process resource owner to use, and we should not
    5634             :      * be in a transaction that's installed some other resowner.
    5635             :      */
    5636             :     Assert(AuxProcessResourceOwner != NULL);
    5637             :     Assert(CurrentResourceOwner == NULL ||
    5638             :            CurrentResourceOwner == AuxProcessResourceOwner);
    5639        1846 :     CurrentResourceOwner = AuxProcessResourceOwner;
    5640             : 
    5641             :     /*
    5642             :      * Check that contents look valid.
    5643             :      */
    5644        1846 :     if (!XRecOffIsValid(ControlFile->checkPoint))
    5645           0 :         ereport(FATAL,
    5646             :                 (errcode(ERRCODE_DATA_CORRUPTED),
    5647             :                  errmsg("control file contains invalid checkpoint location")));
    5648             : 
    5649        1846 :     switch (ControlFile->state)
    5650             :     {
    5651        1434 :         case DB_SHUTDOWNED:
    5652             : 
    5653             :             /*
    5654             :              * This is the expected case, so don't be chatty in standalone
    5655             :              * mode
    5656             :              */
    5657        1434 :             ereport(IsPostmasterEnvironment ? LOG : NOTICE,
    5658             :                     (errmsg("database system was shut down at %s",
    5659             :                             str_time(ControlFile->time))));
    5660        1434 :             break;
    5661             : 
    5662          64 :         case DB_SHUTDOWNED_IN_RECOVERY:
    5663          64 :             ereport(LOG,
    5664             :                     (errmsg("database system was shut down in recovery at %s",
    5665             :                             str_time(ControlFile->time))));
    5666          64 :             break;
    5667             : 
    5668           0 :         case DB_SHUTDOWNING:
    5669           0 :             ereport(LOG,
    5670             :                     (errmsg("database system shutdown was interrupted; last known up at %s",
    5671             :                             str_time(ControlFile->time))));
    5672           0 :             break;
    5673             : 
    5674           0 :         case DB_IN_CRASH_RECOVERY:
    5675           0 :             ereport(LOG,
    5676             :                     (errmsg("database system was interrupted while in recovery at %s",
    5677             :                             str_time(ControlFile->time)),
    5678             :                      errhint("This probably means that some data is corrupted and"
    5679             :                              " you will have to use the last backup for recovery.")));
    5680           0 :             break;
    5681             : 
    5682          12 :         case DB_IN_ARCHIVE_RECOVERY:
    5683          12 :             ereport(LOG,
    5684             :                     (errmsg("database system was interrupted while in recovery at log time %s",
    5685             :                             str_time(ControlFile->checkPointCopy.time)),
    5686             :                      errhint("If this has occurred more than once some data might be corrupted"
    5687             :                              " and you might need to choose an earlier recovery target.")));
    5688          12 :             break;
    5689             : 
    5690         336 :         case DB_IN_PRODUCTION:
    5691         336 :             ereport(LOG,
    5692             :                     (errmsg("database system was interrupted; last known up at %s",
    5693             :                             str_time(ControlFile->time))));
    5694         336 :             break;
    5695             : 
    5696           0 :         default:
    5697           0 :             ereport(FATAL,
    5698             :                     (errcode(ERRCODE_DATA_CORRUPTED),
    5699             :                      errmsg("control file contains invalid database cluster state")));
    5700             :     }
    5701             : 
    5702             :     /* This is just to allow attaching to startup process with a debugger */
    5703             : #ifdef XLOG_REPLAY_DELAY
    5704             :     if (ControlFile->state != DB_SHUTDOWNED)
    5705             :         pg_usleep(60000000L);
    5706             : #endif
    5707             : 
    5708             :     /*
    5709             :      * Verify that pg_wal, pg_wal/archive_status, and pg_wal/summaries exist.
    5710             :      * In cases where someone has performed a copy for PITR, these directories
    5711             :      * may have been excluded and need to be re-created.
    5712             :      */
    5713        1846 :     ValidateXLOGDirectoryStructure();
    5714             : 
    5715             :     /* Set up timeout handler needed to report startup progress. */
    5716        1846 :     if (!IsBootstrapProcessingMode())
    5717        1744 :         RegisterTimeout(STARTUP_PROGRESS_TIMEOUT,
    5718             :                         startup_progress_timeout_handler);
    5719             : 
    5720             :     /*----------
    5721             :      * If we previously crashed, perform a couple of actions:
    5722             :      *
    5723             :      * - The pg_wal directory may still include some temporary WAL segments
    5724             :      *   used when creating a new segment, so perform some clean up to not
    5725             :      *   bloat this path.  This is done first as there is no point to sync
    5726             :      *   this temporary data.
    5727             :      *
    5728             :      * - There might be data which we had written, intending to fsync it, but
    5729             :      *   which we had not actually fsync'd yet.  Therefore, a power failure in
    5730             :      *   the near future might cause earlier unflushed writes to be lost, even
    5731             :      *   though more recent data written to disk from here on would be
    5732             :      *   persisted.  To avoid that, fsync the entire data directory.
    5733             :      */
    5734        1846 :     if (ControlFile->state != DB_SHUTDOWNED &&
    5735         412 :         ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
    5736             :     {
    5737         348 :         RemoveTempXlogFiles();
    5738         348 :         SyncDataDirectory();
    5739         348 :         didCrash = true;
    5740             :     }
    5741             :     else
    5742        1498 :         didCrash = false;
    5743             : 
    5744             :     /*
    5745             :      * Prepare for WAL recovery if needed.
    5746             :      *
    5747             :      * InitWalRecovery analyzes the control file and the backup label file, if
    5748             :      * any.  It updates the in-memory ControlFile buffer according to the
    5749             :      * starting checkpoint, and sets InRecovery and ArchiveRecoveryRequested.
    5750             :      * It also applies the tablespace map file, if any.
    5751             :      */
    5752        1846 :     InitWalRecovery(ControlFile, &wasShutdown,
    5753             :                     &haveBackupLabel, &haveTblspcMap);
    5754        1846 :     checkPoint = ControlFile->checkPointCopy;
    5755             : 
    5756             :     /* initialize shared memory variables from the checkpoint record */
    5757        1846 :     TransamVariables->nextXid = checkPoint.nextXid;
    5758        1846 :     TransamVariables->nextOid = checkPoint.nextOid;
    5759        1846 :     TransamVariables->oidCount = 0;
    5760        1846 :     MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
    5761        1846 :     AdvanceOldestClogXid(checkPoint.oldestXid);
    5762        1846 :     SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
    5763        1846 :     SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
    5764        1846 :     SetCommitTsLimit(checkPoint.oldestCommitTsXid,
    5765             :                      checkPoint.newestCommitTsXid);
    5766        1846 :     XLogCtl->ckptFullXid = checkPoint.nextXid;
    5767             : 
    5768             :     /*
    5769             :      * Clear out any old relcache cache files.  This is *necessary* if we do
    5770             :      * any WAL replay, since that would probably result in the cache files
    5771             :      * being out of sync with database reality.  In theory we could leave them
    5772             :      * in place if the database had been cleanly shut down, but it seems
    5773             :      * safest to just remove them always and let them be rebuilt during the
    5774             :      * first backend startup.  These files need to be removed from all
    5775             :      * directories including pg_tblspc; however, the symlinks are created only
    5776             :      * after reading the tablespace_map file in case of archive recovery from
    5777             :      * backup, so we need to clear old relcache files here, after creating the
    5778             :      * symlinks.
    5779             :      */
    5780        1846 :     RelationCacheInitFileRemove();
    5781             : 
    5782             :     /*
    5783             :      * Initialize replication slots, before there's a chance to remove
    5784             :      * required resources.
    5785             :      */
    5786        1846 :     StartupReplicationSlots();
    5787             : 
    5788             :     /*
    5789             :      * Startup logical state, needs to be setup now so we have proper data
    5790             :      * during crash recovery.
    5791             :      */
    5792        1844 :     StartupReorderBuffer();
    5793             : 
    5794             :     /*
    5795             :      * Startup CLOG. This must be done after TransamVariables->nextXid has
    5796             :      * been initialized and before we accept connections or begin WAL replay.
    5797             :      */
    5798        1844 :     StartupCLOG();
    5799             : 
    5800             :     /*
    5801             :      * Startup MultiXact. We need to do this early to be able to replay
    5802             :      * truncations.
    5803             :      */
    5804        1844 :     StartupMultiXact();
    5805             : 
    5806             :     /*
    5807             :      * Ditto for commit timestamps.  Activate the facility if the setting is
    5808             :      * enabled in the control file, as there should be no tracking of commit
    5809             :      * timestamps done when the setting was disabled.  This facility can be
    5810             :      * started or stopped when replaying a XLOG_PARAMETER_CHANGE record.
    5811             :      */
    5812        1844 :     if (ControlFile->track_commit_timestamp)
    5813          26 :         StartupCommitTs();
    5814             : 
    5815             :     /*
    5816             :      * Recover knowledge about replay progress of known replication partners.
    5817             :      */
    5818        1844 :     StartupReplicationOrigin();
    5819             : 
    5820             :     /*
    5821             :      * Initialize unlogged LSN. On a clean shutdown, it's restored from the
    5822             :      * control file. On recovery, all unlogged relations are blown away, so
    5823             :      * the unlogged LSN counter can be reset too.
    5824             :      */
    5825        1844 :     if (ControlFile->state == DB_SHUTDOWNED)
    5826        1420 :         pg_atomic_write_membarrier_u64(&XLogCtl->unloggedLSN,
    5827        1420 :                                        ControlFile->unloggedLSN);
    5828             :     else
    5829         424 :         pg_atomic_write_membarrier_u64(&XLogCtl->unloggedLSN,
    5830             :                                        FirstNormalUnloggedLSN);
    5831             : 
    5832             :     /*
    5833             :      * Copy any missing timeline history files between 'now' and the recovery
    5834             :      * target timeline from archive to pg_wal. While we don't need those files
    5835             :      * ourselves - the history file of the recovery target timeline covers all
    5836             :      * the previous timelines in the history too - a cascading standby server
    5837             :      * might be interested in them. Or, if you archive the WAL from this
    5838             :      * server to a different archive than the primary, it'd be good for all
    5839             :      * the history files to get archived there after failover, so that you can
    5840             :      * use one of the old timelines as a PITR target. Timeline history files
    5841             :      * are small, so it's better to copy them unnecessarily than not copy them
    5842             :      * and regret later.
    5843             :      */
    5844        1844 :     restoreTimeLineHistoryFiles(checkPoint.ThisTimeLineID, recoveryTargetTLI);
    5845             : 
    5846             :     /*
    5847             :      * Before running in recovery, scan pg_twophase and fill in its status to
    5848             :      * be able to work on entries generated by redo.  Doing a scan before
    5849             :      * taking any recovery action has the merit of discarding any 2PC files that
    5850             :      * are newer than the first record to replay, avoiding any conflicts at
    5851             :      * replay.  This also avoids any subsequent scans when doing recovery
    5852             :      * of the on-disk two-phase data.
    5853             :      */
    5854        1844 :     restoreTwoPhaseData();
    5855             : 
    5856             :     /*
    5857             :      * When starting with crash recovery, reset pgstat data - it might not be
    5858             :      * valid. Otherwise restore pgstat data. It's safe to do this here,
    5859             :      * because postmaster will not yet have started any other processes.
    5860             :      *
    5861             :      * NB: Restoring replication slot stats relies on slot state to have
    5862             :      * already been restored from disk.
    5863             :      *
    5864             :      * TODO: With a bit of extra work we could just start with a pgstat file
    5865             :      * associated with the checkpoint redo location we're starting from.
    5866             :      */
    5867        1844 :     if (didCrash)
    5868         348 :         pgstat_discard_stats();
    5869             :     else
    5870        1496 :         pgstat_restore_stats();
    5871             : 
    5872        1844 :     lastFullPageWrites = checkPoint.fullPageWrites;
    5873             : 
    5874        1844 :     RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
    5875        1844 :     doPageWrites = lastFullPageWrites;
    5876             : 
    5877             :     /* REDO */
    5878        1844 :     if (InRecovery)
    5879             :     {
    5880             :         /* Initialize state for RecoveryInProgress() */
    5881         424 :         SpinLockAcquire(&XLogCtl->info_lck);
    5882         424 :         if (InArchiveRecovery)
    5883         222 :             XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
    5884             :         else
    5885         202 :             XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
    5886         424 :         SpinLockRelease(&XLogCtl->info_lck);
    5887             : 
    5888             :         /*
    5889             :          * Update pg_control to show that we are recovering and to show the
    5890             :          * selected checkpoint as the place we are starting from. We also mark
    5891             :          * pg_control with any minimum recovery stop point obtained from a
    5892             :          * backup history file.
    5893             :          *
    5894             :          * No need to hold ControlFileLock yet, we aren't up far enough.
    5895             :          */
    5896         424 :         UpdateControlFile();
    5897             : 
    5898             :         /*
    5899             :          * If there was a backup label file, it's done its job and the info
    5900             :          * has now been propagated into pg_control.  We must get rid of the
    5901             :          * label file so that if we crash during recovery, we'll pick up at
    5902             :          * the latest recovery restartpoint instead of going all the way back
    5903             :          * to the backup start point.  It seems prudent though to just rename
    5904             :          * the file out of the way rather than delete it completely.
    5905             :          */
    5906         424 :         if (haveBackupLabel)
    5907             :         {
    5908         142 :             unlink(BACKUP_LABEL_OLD);
    5909         142 :             durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
    5910             :         }
    5911             : 
    5912             :         /*
    5913             :          * If there was a tablespace_map file, it's done its job and the
    5914             :          * symlinks have been created.  We must get rid of the map file so
    5915             :          * that if we crash during recovery, we don't create symlinks again.
    5916             :          * It seems prudent though to just rename the file out of the way
    5917             :          * rather than delete it completely.
    5918             :          */
    5919         424 :         if (haveTblspcMap)
    5920             :         {
    5921           4 :             unlink(TABLESPACE_MAP_OLD);
    5922           4 :             durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
    5923             :         }
    5924             : 
    5925             :         /*
    5926             :          * Initialize our local copy of minRecoveryPoint.  When doing crash
    5927             :          * recovery we want to replay up to the end of WAL.  Particularly, in
    5928             :          * the case of a promoted standby minRecoveryPoint value in the
    5929             :          * control file is only updated after the first checkpoint.  However,
    5930             :          * if the instance crashes before the first post-recovery checkpoint
    5931             :          * is completed then recovery will use a stale location causing the
    5932             :          * startup process to think that there are still invalid page
    5933             :          * references when checking for data consistency.
    5934             :          */
    5935         424 :         if (InArchiveRecovery)
    5936             :         {
    5937         222 :             LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    5938         222 :             LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    5939             :         }
    5940             :         else
    5941             :         {
    5942         202 :             LocalMinRecoveryPoint = InvalidXLogRecPtr;
    5943         202 :             LocalMinRecoveryPointTLI = 0;
    5944             :         }
    5945             : 
    5946             :         /* Check that the GUCs used to generate the WAL allow recovery */
    5947         424 :         CheckRequiredParameterValues();
    5948             : 
    5949             :         /*
    5950             :          * We're in recovery, so unlogged relations may be trashed and must be
    5951             :          * reset.  This should be done BEFORE allowing Hot Standby
    5952             :          * connections, so that read-only backends don't try to read whatever
    5953             :          * garbage is left over from before.
    5954             :          */
    5955         424 :         ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
    5956             : 
    5957             :         /*
    5958             :          * Likewise, delete any saved transaction snapshot files that got left
    5959             :          * behind by crashed backends.
    5960             :          */
    5961         424 :         DeleteAllExportedSnapshotFiles();
    5962             : 
    5963             :         /*
    5964             :          * Initialize for Hot Standby, if enabled. We won't let backends in
    5965             :          * yet, not until we've reached the min recovery point specified in
    5966             :          * control file and we've established a recovery snapshot from a
    5967             :          * running-xacts WAL record.
    5968             :          */
    5969         424 :         if (ArchiveRecoveryRequested && EnableHotStandby)
    5970             :         {
    5971             :             TransactionId *xids;
    5972             :             int         nxids;
    5973             : 
    5974         210 :             ereport(DEBUG1,
    5975             :                     (errmsg_internal("initializing for hot standby")));
    5976             : 
    5977         210 :             InitRecoveryTransactionEnvironment();
    5978             : 
    5979         210 :             if (wasShutdown)
    5980          52 :                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
    5981             :             else
    5982         158 :                 oldestActiveXID = checkPoint.oldestActiveXid;
    5983             :             Assert(TransactionIdIsValid(oldestActiveXID));
    5984             : 
    5985             :             /* Tell procarray about the range of xids it has to deal with */
    5986         210 :             ProcArrayInitRecovery(XidFromFullTransactionId(TransamVariables->nextXid));
    5987             : 
    5988             :             /*
    5989             :              * Startup subtrans only.  CLOG, MultiXact and commit timestamp
    5990             :              * have already been started up and other SLRUs are not maintained
    5991             :              * during recovery and need not be started yet.
    5992             :              */
    5993         210 :             StartupSUBTRANS(oldestActiveXID);
    5994             : 
    5995             :             /*
    5996             :              * If we're beginning at a shutdown checkpoint, we know that
    5997             :              * nothing was running on the primary at this point. So fake-up an
    5998             :              * empty running-xacts record and use that here and now. Recover
    5999             :              * additional standby state for prepared transactions.
    6000             :              */
    6001         210 :             if (wasShutdown)
    6002             :             {
    6003             :                 RunningTransactionsData running;
    6004             :                 TransactionId latestCompletedXid;
    6005             : 
    6006             :                 /* Update pg_subtrans entries for any prepared transactions */
    6007          52 :                 StandbyRecoverPreparedTransactions();
    6008             : 
    6009             :                 /*
    6010             :                  * Construct a RunningTransactions snapshot representing a
    6011             :                  * shut down server, with only prepared transactions still
    6012             :                  * alive. We're never overflowed at this point because all
    6013             :                  * subxids are listed with their parent prepared transactions.
    6014             :                  */
    6015          52 :                 running.xcnt = nxids;
    6016          52 :                 running.subxcnt = 0;
    6017          52 :                 running.subxid_status = SUBXIDS_IN_SUBTRANS;
    6018          52 :                 running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
    6019          52 :                 running.oldestRunningXid = oldestActiveXID;
    6020          52 :                 latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
    6021          52 :                 TransactionIdRetreat(latestCompletedXid);
    6022             :                 Assert(TransactionIdIsNormal(latestCompletedXid));
    6023          52 :                 running.latestCompletedXid = latestCompletedXid;
    6024          52 :                 running.xids = xids;
    6025             : 
    6026          52 :                 ProcArrayApplyRecoveryInfo(&running);
    6027             :             }
    6028             :         }
    6029             : 
    6030             :         /*
    6031             :          * We're all set for replaying the WAL now. Do it.
    6032             :          */
    6033         424 :         PerformWalRecovery();
    6034         308 :         performedWalRecovery = true;
    6035             :     }
    6036             :     else
    6037        1420 :         performedWalRecovery = false;
    6038             : 
    6039             :     /*
    6040             :      * Finish WAL recovery.
    6041             :      */
    6042        1728 :     endOfRecoveryInfo = FinishWalRecovery();
    6043        1728 :     EndOfLog = endOfRecoveryInfo->endOfLog;
    6044        1728 :     EndOfLogTLI = endOfRecoveryInfo->endOfLogTLI;
    6045        1728 :     abortedRecPtr = endOfRecoveryInfo->abortedRecPtr;
    6046        1728 :     missingContrecPtr = endOfRecoveryInfo->missingContrecPtr;
    6047             : 
    6048             :     /*
    6049             :      * Reset ps status display, so as no information related to recovery shows
    6050             :      * up.
    6051             :      */
    6052        1728 :     set_ps_display("");
    6053             : 
    6054             :     /*
    6055             :      * When recovering from a backup (we are in recovery, and archive recovery
    6056             :      * was requested), complain if we did not roll forward far enough to reach
    6057             :      * the point where the database is consistent.  For regular online
    6058             :      * backup-from-primary, that means reaching the end-of-backup WAL record
    6059             :      * (at which point we reset backupStartPoint to be Invalid), for
    6060             :      * backup-from-replica (which can't inject records into the WAL stream),
    6061             :      * that point is when we reach the minRecoveryPoint in pg_control (which
    6062             :      * we purposefully copy last when backing up from a replica).  For
    6063             :      * pg_rewind (which creates a backup_label with a method of "pg_rewind")
    6064             :      * or snapshot-style backups (which don't), backupEndRequired will be set
    6065             :      * to false.
    6066             :      *
    6067             :      * Note: it is indeed okay to look at the local variable
    6068             :      * LocalMinRecoveryPoint here, even though ControlFile->minRecoveryPoint
    6069             :      * might be further ahead --- ControlFile->minRecoveryPoint cannot have
    6070             :      * been advanced beyond the WAL we processed.
    6071             :      */
    6072        1728 :     if (InRecovery &&
    6073         308 :         (EndOfLog < LocalMinRecoveryPoint ||
    6074         308 :          !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
    6075             :     {
    6076             :         /*
    6077             :          * Ran off end of WAL before reaching end-of-backup WAL record, or
    6078             :          * minRecoveryPoint. That's a bad sign, indicating that you tried to
    6079             :          * recover from an online backup but never called pg_backup_stop(), or
    6080             :          * you didn't archive all the WAL needed.
    6081             :          */
    6082           0 :         if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
    6083             :         {
    6084           0 :             if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) || ControlFile->backupEndRequired)
    6085           0 :                 ereport(FATAL,
    6086             :                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    6087             :                          errmsg("WAL ends before end of online backup"),
    6088             :                          errhint("All WAL generated while online backup was taken must be available at recovery.")));
    6089             :             else
    6090           0 :                 ereport(FATAL,
    6091             :                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    6092             :                          errmsg("WAL ends before consistent recovery point")));
    6093             :         }
    6094             :     }
    6095             : 
    6096             :     /*
    6097             :      * Reset unlogged relations to the contents of their INIT fork. This is
    6098             :      * done AFTER recovery is complete so as to include any unlogged relations
    6099             :      * created during recovery, but BEFORE recovery is marked as having
    6100             :      * completed successfully. Otherwise we'd not retry if any of the post
    6101             :      * end-of-recovery steps fail.
    6102             :      */
    6103        1728 :     if (InRecovery)
    6104         308 :         ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
    6105             : 
    6106             :     /*
    6107             :      * Pre-scan prepared transactions to find out the range of XIDs present.
    6108             :      * This information is not quite needed yet, but it is positioned here so
    6109             :      * as potential problems are detected before any on-disk change is done.
    6110             :      */
    6111        1728 :     oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
    6112             : 
    6113             :     /*
    6114             :      * Allow ordinary WAL segment creation before possibly switching to a new
    6115             :      * timeline, which creates a new segment, and after the last ReadRecord().
    6116             :      */
    6117        1728 :     SetInstallXLogFileSegmentActive();
    6118             : 
    6119             :     /*
    6120             :      * Consider whether we need to assign a new timeline ID.
    6121             :      *
    6122             :      * If we did archive recovery, we always assign a new ID.  This handles a
    6123             :      * couple of issues.  If we stopped short of the end of WAL during
    6124             :      * recovery, then we are clearly generating a new timeline and must assign
    6125             :      * it a unique new ID.  Even if we ran to the end, modifying the current
    6126             :      * last segment is problematic because it may result in trying to
    6127             :      * overwrite an already-archived copy of that segment, and we encourage
    6128             :      * DBAs to make their archive_commands reject that.  We can dodge the
    6129             :      * problem by making the new active segment have a new timeline ID.
    6130             :      *
    6131             :      * In a normal crash recovery, we can just extend the timeline we were in.
    6132             :      */
    6133        1728 :     newTLI = endOfRecoveryInfo->lastRecTLI;
    6134        1728 :     if (ArchiveRecoveryRequested)
    6135             :     {
    6136          98 :         newTLI = findNewestTimeLine(recoveryTargetTLI) + 1;
    6137          98 :         ereport(LOG,
    6138             :                 (errmsg("selected new timeline ID: %u", newTLI)));
    6139             : 
    6140             :         /*
    6141             :          * Make a writable copy of the last WAL segment.  (Note that we also
    6142             :          * have a copy of the last block of the old WAL in
    6143             :          * endOfRecovery->lastPage; we will use that below.)
    6144             :          */
    6145          98 :         XLogInitNewTimeline(EndOfLogTLI, EndOfLog, newTLI);
    6146             : 
    6147             :         /*
    6148             :          * Remove the signal files out of the way, so that we don't
    6149             :          * accidentally re-enter archive recovery mode in a subsequent crash.
    6150             :          */
    6151          98 :         if (endOfRecoveryInfo->standby_signal_file_found)
    6152          92 :             durable_unlink(STANDBY_SIGNAL_FILE, FATAL);
    6153             : 
    6154          98 :         if (endOfRecoveryInfo->recovery_signal_file_found)
    6155           6 :             durable_unlink(RECOVERY_SIGNAL_FILE, FATAL);
    6156             : 
    6157             :         /*
    6158             :          * Write the timeline history file, and have it archived. After this
    6159             :          * point (or rather, as soon as the file is archived), the timeline
    6160             :          * will appear as "taken" in the WAL archive and to any standby
    6161             :          * servers.  If we crash before actually switching to the new
    6162             :          * timeline, standby servers will nevertheless think that we switched
    6163             :          * to the new timeline, and will try to connect to the new timeline.
    6164             :          * To minimize the window for that, try to do as little as possible
    6165             :          * between here and writing the end-of-recovery record.
    6166             :          */
    6167          98 :         writeTimeLineHistory(newTLI, recoveryTargetTLI,
    6168             :                              EndOfLog, endOfRecoveryInfo->recoveryStopReason);
    6169             : 
    6170          98 :         ereport(LOG,
    6171             :                 (errmsg("archive recovery complete")));
    6172             :     }
    6173             : 
    6174             :     /* Save the selected TimeLineID in shared memory, too */
    6175        1728 :     SpinLockAcquire(&XLogCtl->info_lck);
    6176        1728 :     XLogCtl->InsertTimeLineID = newTLI;
    6177        1728 :     XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI;
    6178        1728 :     SpinLockRelease(&XLogCtl->info_lck);
    6179             : 
    6180             :     /*
    6181             :      * Actually, if WAL ended in an incomplete record, skip the parts that
    6182             :      * made it through and start writing after the portion that persisted.
    6183             :      * (It's critical to first write an OVERWRITE_CONTRECORD message, which
    6184             :      * we'll do as soon as we're open for writing new WAL.)
    6185             :      */
    6186        1728 :     if (!XLogRecPtrIsInvalid(missingContrecPtr))
    6187             :     {
    6188             :         /*
    6189             :          * We should only have a missingContrecPtr if we're not switching to a
    6190             :          * new timeline. When a timeline switch occurs, WAL is copied from the
    6191             :          * old timeline to the new only up to the end of the last complete
    6192             :          * record, so there can't be an incomplete WAL record that we need to
    6193             :          * disregard.
    6194             :          */
    6195             :         Assert(newTLI == endOfRecoveryInfo->lastRecTLI);
    6196             :         Assert(!XLogRecPtrIsInvalid(abortedRecPtr));
    6197          20 :         EndOfLog = missingContrecPtr;
    6198             :     }
    6199             : 
    6200             :     /*
    6201             :      * Prepare to write WAL starting at EndOfLog location, and init xlog
    6202             :      * buffer cache using the block containing the last record from the
    6203             :      * previous incarnation.
    6204             :      */
    6205        1728 :     Insert = &XLogCtl->Insert;
    6206        1728 :     Insert->PrevBytePos = XLogRecPtrToBytePos(endOfRecoveryInfo->lastRec);
    6207        1728 :     Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
    6208             : 
    6209             :     /*
    6210             :      * Tricky point here: lastPage contains the *last* block that the LastRec
    6211             :      * record spans, not the one it starts in.  The last block is indeed the
    6212             :      * one we want to use.
    6213             :      */
    6214        1728 :     if (EndOfLog % XLOG_BLCKSZ != 0)
    6215             :     {
    6216             :         char       *page;
    6217             :         int         len;
    6218             :         int         firstIdx;
    6219             : 
    6220        1676 :         firstIdx = XLogRecPtrToBufIdx(EndOfLog);
    6221        1676 :         len = EndOfLog - endOfRecoveryInfo->lastPageBeginPtr;
    6222             :         Assert(len < XLOG_BLCKSZ);
    6223             : 
    6224             :         /* Copy the valid part of the last block, and zero the rest */
    6225        1676 :         page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
    6226        1676 :         memcpy(page, endOfRecoveryInfo->lastPage, len);
    6227        1676 :         memset(page + len, 0, XLOG_BLCKSZ - len);
    6228             : 
    6229        1676 :         pg_atomic_write_u64(&XLogCtl->xlblocks[firstIdx], endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ);
    6230        1676 :         pg_atomic_write_u64(&XLogCtl->InitializedUpTo, endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ);
    6231        1676 :         XLogCtl->InitializedFrom = endOfRecoveryInfo->lastPageBeginPtr;
    6232             :     }
    6233             :     else
    6234             :     {
    6235             :         /*
    6236             :          * There is no partial block to copy. Just set InitializedUpTo, and
    6237             :          * let the first attempt to insert a log record to initialize the next
    6238             :          * buffer.
    6239             :          */
    6240          52 :         pg_atomic_write_u64(&XLogCtl->InitializedUpTo, EndOfLog);
    6241          52 :         XLogCtl->InitializedFrom = EndOfLog;
    6242             :     }
    6243        1728 :     pg_atomic_write_u64(&XLogCtl->InitializeReserved, pg_atomic_read_u64(&XLogCtl->InitializedUpTo));
    6244             : 
    6245             :     /*
    6246             :      * Update local and shared status.  This is OK to do without any locks
    6247             :      * because no other process can be reading or writing WAL yet.
    6248             :      */
    6249        1728 :     LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
    6250        1728 :     pg_atomic_write_u64(&XLogCtl->logInsertResult, EndOfLog);
    6251        1728 :     pg_atomic_write_u64(&XLogCtl->logWriteResult, EndOfLog);
    6252        1728 :     pg_atomic_write_u64(&XLogCtl->logFlushResult, EndOfLog);
    6253        1728 :     XLogCtl->LogwrtRqst.Write = EndOfLog;
    6254        1728 :     XLogCtl->LogwrtRqst.Flush = EndOfLog;
    6255             : 
    6256             :     /*
    6257             :      * Preallocate additional log files, if wanted.
    6258             :      */
    6259        1728 :     PreallocXlogFiles(EndOfLog, newTLI);
    6260             : 
    6261             :     /*
    6262             :      * Okay, we're officially UP.
    6263             :      */
    6264        1728 :     InRecovery = false;
    6265             : 
    6266             :     /* start the archive_timeout timer and LSN running */
    6267        1728 :     XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
    6268        1728 :     XLogCtl->lastSegSwitchLSN = EndOfLog;
    6269             : 
    6270             :     /* also initialize latestCompletedXid, to nextXid - 1 */
    6271        1728 :     LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
    6272        1728 :     TransamVariables->latestCompletedXid = TransamVariables->nextXid;
    6273        1728 :     FullTransactionIdRetreat(&TransamVariables->latestCompletedXid);
    6274        1728 :     LWLockRelease(ProcArrayLock);
    6275             : 
    6276             :     /*
    6277             :      * Start up subtrans, if not already done for hot standby.  (commit
    6278             :      * timestamps are started below, if necessary.)
    6279             :      */
    6280        1728 :     if (standbyState == STANDBY_DISABLED)
    6281        1630 :         StartupSUBTRANS(oldestActiveXID);
    6282             : 
    6283             :     /*
    6284             :      * Perform end of recovery actions for any SLRUs that need it.
    6285             :      */
    6286        1728 :     TrimCLOG();
    6287        1728 :     TrimMultiXact();
    6288             : 
    6289             :     /*
    6290             :      * Reload shared-memory state for prepared transactions.  This needs to
    6291             :      * happen before renaming the last partial segment of the old timeline as
    6292             :      * it may be possible that we have to recover some transactions from it.
    6293             :      */
    6294        1728 :     RecoverPreparedTransactions();
    6295             : 
    6296             :     /* Shut down xlogreader */
    6297        1728 :     ShutdownWalRecovery();
    6298             : 
    6299             :     /* Enable WAL writes for this backend only. */
    6300        1728 :     LocalSetXLogInsertAllowed();
    6301             : 
    6302             :     /* If necessary, write overwrite-contrecord before doing anything else */
    6303        1728 :     if (!XLogRecPtrIsInvalid(abortedRecPtr))
    6304             :     {
    6305             :         Assert(!XLogRecPtrIsInvalid(missingContrecPtr));
    6306          20 :         CreateOverwriteContrecordRecord(abortedRecPtr, missingContrecPtr, newTLI);
    6307             :     }
    6308             : 
    6309             :     /*
    6310             :      * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
    6311             :      * record before resource manager writes cleanup WAL records or checkpoint
    6312             :      * record is written.
    6313             :      */
    6314        1728 :     Insert->fullPageWrites = lastFullPageWrites;
    6315        1728 :     UpdateFullPageWrites();
    6316             : 
    6317             :     /*
    6318             :      * Emit checkpoint or end-of-recovery record in XLOG, if required.
    6319             :      */
    6320        1728 :     if (performedWalRecovery)
    6321         308 :         promoted = PerformRecoveryXLogAction();
    6322             : 
    6323             :     /*
    6324             :      * If any of the critical GUCs have changed, log them before we allow
    6325             :      * backends to write WAL.
    6326             :      */
    6327        1728 :     XLogReportParameters();
    6328             : 
    6329             :     /* If this is archive recovery, perform post-recovery cleanup actions. */
    6330        1728 :     if (ArchiveRecoveryRequested)
    6331          98 :         CleanupAfterArchiveRecovery(EndOfLogTLI, EndOfLog, newTLI);
    6332             : 
    6333             :     /*
    6334             :      * Local WAL inserts enabled, so it's time to finish initialization of
    6335             :      * commit timestamp.
    6336             :      */
    6337        1728 :     CompleteCommitTsInitialization();
    6338             : 
    6339             :     /*
    6340             :      * All done with end-of-recovery actions.
    6341             :      *
    6342             :      * Now allow backends to write WAL and update the control file status in
    6343             :      * consequence.  SharedRecoveryState, that controls if backends can write
    6344             :      * WAL, is updated while holding ControlFileLock to prevent other backends
    6345             :      * from seeing an inconsistent state of the control file in shared memory.
    6346             :      * There is still a small window during which backends can write WAL and
    6347             :      * the control file is still referring to a system not in DB_IN_PRODUCTION
    6348             :      * state while looking at the on-disk control file.
    6349             :      *
    6350             :      * Also, we use info_lck to update SharedRecoveryState to ensure that
    6351             :      * there are no race conditions concerning visibility of other recent
    6352             :      * updates to shared memory.
    6353             :      */
    6354        1728 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    6355        1728 :     ControlFile->state = DB_IN_PRODUCTION;
    6356             : 
    6357        1728 :     SpinLockAcquire(&XLogCtl->info_lck);
    6358        1728 :     XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
    6359        1728 :     SpinLockRelease(&XLogCtl->info_lck);
    6360             : 
    6361        1728 :     UpdateControlFile();
    6362        1728 :     LWLockRelease(ControlFileLock);
    6363             : 
    6364             :     /*
    6365             :      * Shutdown the recovery environment.  This must occur after
    6366             :      * RecoverPreparedTransactions() (see notes in lock_twophase_recover())
    6367             :      * and after switching SharedRecoveryState to RECOVERY_STATE_DONE so as
    6368             :      * any session building a snapshot will not rely on KnownAssignedXids as
    6369             :      * RecoveryInProgress() would return false at this stage.  This is
    6370             :      * particularly critical for prepared 2PC transactions, that would still
    6371             :      * need to be included in snapshots once recovery has ended.
    6372             :      */
    6373        1728 :     if (standbyState != STANDBY_DISABLED)
    6374          98 :         ShutdownRecoveryTransactionEnvironment();
    6375             : 
    6376             :     /*
    6377             :      * If there were cascading standby servers connected to us, nudge any wal
    6378             :      * sender processes to notice that we've been promoted.
    6379             :      */
    6380        1728 :     WalSndWakeup(true, true);
    6381             : 
    6382             :     /*
    6383             :      * If this was a promotion, request an (online) checkpoint now. This isn't
    6384             :      * required for consistency, but the last restartpoint might be far back,
    6385             :      * and in case of a crash, recovering from it might take longer than is
    6386             :      * appropriate now that we're not in standby mode anymore.
    6387             :      */
    6388        1728 :     if (promoted)
    6389          84 :         RequestCheckpoint(CHECKPOINT_FORCE);
    6390        1728 : }
    6391             : 
    6392             : /*
    6393             :  * Callback from PerformWalRecovery(), called when crash recovery switches to
    6394             :  * archive recovery; records the new state and EndRecPtr in the control file.
    6395             :  */
    6396             : void
    6397           4 : SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI)
    6398             : {
    6399             :     /* set minRecoveryPoint to this record, unless it is already ahead */
    6400           4 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    6401           4 :     ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
    6402           4 :     if (ControlFile->minRecoveryPoint < EndRecPtr)
    6403             :     {
    6404           4 :         ControlFile->minRecoveryPoint = EndRecPtr;
    6405           4 :         ControlFile->minRecoveryPointTLI = replayTLI;
    6406             :     }
    6407             :     /* copy the resulting control file values into the local variables */
    6408           4 :     LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    6409           4 :     LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    6410             : 
    6411             :     /*
    6412             :      * Setting this flag lets the startup process update its local copy of
    6413             :      * minRecoveryPoint from this point on.
    6414             :      */
    6415           4 :     updateMinRecoveryPoint = true;
    6416             : 
    6417           4 :     UpdateControlFile();
    6418             : 
    6419             :     /*
    6420             :      * SharedRecoveryState is changed (under info_lck) before ControlFileLock is
    6421             :      * released, so the two states stay consistent in shared memory.
    6422             :      */
    6423           4 :     SpinLockAcquire(&XLogCtl->info_lck);
    6424           4 :     XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
    6425           4 :     SpinLockRelease(&XLogCtl->info_lck);
    6426             : 
    6427           4 :     LWLockRelease(ControlFileLock);
    6428           4 : }
    6429             : 
    6430             : /*
    6431             :  * Callback from PerformWalRecovery(), called when we reach the end of backup.
    6432             :  * Resets the backup fields in the control file and advances minRecoveryPoint if needed.
    6433             :  */
    6434             : void
    6435         142 : ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli)
    6436             : {
    6437             :     /*
    6438             :      * We have reached the end of base backup, as indicated by pg_control. The
    6439             :      * data on disk is now consistent (unless minRecoveryPoint is further
    6440             :      * ahead, which can happen if we crashed during previous recovery).  Reset
    6441             :      * backupStartPoint and backupEndPoint, and update minRecoveryPoint to
    6442             :      * make sure we don't allow starting up at an earlier point even if
    6443             :      * recovery is stopped and restarted soon after this.
    6444             :      */
    6445         142 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    6446             : 
    6447         142 :     if (ControlFile->minRecoveryPoint < EndRecPtr)   /* never move it backwards */
    6448             :     {
    6449         134 :         ControlFile->minRecoveryPoint = EndRecPtr;
    6450         134 :         ControlFile->minRecoveryPointTLI = tli;
    6451             :     }
    6452             : 
    6453         142 :     ControlFile->backupStartPoint = InvalidXLogRecPtr;
    6454         142 :     ControlFile->backupEndPoint = InvalidXLogRecPtr;
    6455         142 :     ControlFile->backupEndRequired = false;
    6456         142 :     UpdateControlFile();
    6457             : 
    6458         142 :     LWLockRelease(ControlFileLock);
    6459         142 : }
    6460             : 
    6461             : /*
    6462             :  * Perform whatever XLOG actions are necessary at end of REDO.
    6463             :  *
    6464             :  * The goal here is to make sure that we'll be able to recover properly if
    6465             :  * we crash again.  On promotion we only write an end-of-recovery record;
    6466             :  * otherwise we request a checkpoint with the END_OF_RECOVERY, IMMEDIATE and
    6467             :  * WAIT flags.  See the comments in the function body for the rationale.
    6468             :  *
    6469             :  * Returns true if the promotion path was taken, false otherwise.
    6470             :  */
    6471             : static bool
    6472         308 : PerformRecoveryXLogAction(void)
    6473             : {
    6474         308 :     bool        promoted = false;
    6475             : 
    6476             :     /*
    6477             :      * Perform a checkpoint to update all our recovery activity to disk.
    6478             :      *
    6479             :      * Note that we write a shutdown checkpoint rather than an on-line one.
    6480             :      * This is not particularly critical, but since we may be assigning a new
    6481             :      * TLI, using a shutdown checkpoint allows us to have the rule that TLI
    6482             :      * only changes in shutdown checkpoints, which allows some extra error
    6483             :      * checking in xlog_redo.
    6484             :      *
    6485             :      * In promotion, only create a lightweight end-of-recovery record instead
    6486             :      * of a full checkpoint. A checkpoint is requested later, after we're
    6487             :      * fully out of recovery mode and already accepting queries.
    6488             :      */
    6489         406 :     if (ArchiveRecoveryRequested && IsUnderPostmaster &&
    6490          98 :         PromoteIsTriggered())
    6491             :     {
    6492          84 :         promoted = true;
    6493             : 
    6494             :         /*
    6495             :          * Insert a special WAL record to mark the end of recovery, since we
    6496             :          * aren't doing a checkpoint. That means that the checkpointer process
    6497             :          * may well be in the middle of a time-smoothed restartpoint and
    6498             :          * could continue to be for minutes after this.  That sounds strange,
    6499             :          * but the effect is roughly the same and it would be stranger to try
    6500             :          * to come out of the restartpoint and then checkpoint. We request a
    6501             :          * checkpoint later anyway, just for safety.
    6502             :          */
    6503          84 :         CreateEndOfRecoveryRecord();
    6504             :     }
    6505             :     else
    6506             :     {
    6507         224 :         RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
    6508             :                           CHECKPOINT_IMMEDIATE |
    6509             :                           CHECKPOINT_WAIT);
    6510             :     }
    6511             : 
    6512         308 :     return promoted;
    6513             : }
    6514             : 
    6515             : /*
    6516             :  * Is the system still in recovery?
    6517             :  *
    6518             :  * Unlike testing InRecovery, this works in any process that's connected to shared
    6519             :  * memory.  Returns false once SharedRecoveryState has been seen as RECOVERY_STATE_DONE.
    6520             :  */
    6521             : bool
    6522   179263548 : RecoveryInProgress(void)
    6523             : {
    6524             :     /*
    6525             :      * We check shared state each time only until we leave recovery mode. We
    6526             :      * can't re-enter recovery, so there's no need to keep checking after the
    6527             :      * shared variable has once been seen false.
    6528             :      */
    6529   179263548 :     if (!LocalRecoveryInProgress)
    6530   174862286 :         return false;
    6531             :     else
    6532             :     {
    6533             :         /*
    6534             :          * Use a volatile pointer to make sure we make a fresh read of the
    6535             :          * shared variable.
    6536             :          */
    6537     4401262 :         volatile XLogCtlData *xlogctl = XLogCtl;
    6538             : 
    6539     4401262 :         LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);
    6540             : 
    6541             :         /*
    6542             :          * Note: We don't need a memory barrier when we're still in recovery.
    6543             :          * We might exit recovery immediately after return, so the caller
    6544             :          * can't rely on 'true' meaning that we're still in recovery anyway.
    6545             :          */
    6546             : 
    6547     4401262 :         return LocalRecoveryInProgress;
    6548             :     }
    6549             : }
    6550             : 
    6551             : /*
    6552             :  * Returns the current recovery state from shared memory, read under info_lck.
    6553             :  *
    6554             :  * This returned state is kept consistent with the contents of the control
    6555             :  * file.  See details about the possible values of RecoveryState in xlog.h.
    6556             :  */
    6557             : RecoveryState
    6558       27076 : GetRecoveryState(void)
    6559             : {
    6560             :     RecoveryState retval;
    6561             : 
    6562       27076 :     SpinLockAcquire(&XLogCtl->info_lck);
    6563       27076 :     retval = XLogCtl->SharedRecoveryState;
    6564       27076 :     SpinLockRelease(&XLogCtl->info_lck);
    6565             : 
    6566       27076 :     return retval;
    6567             : }
    6568             : 
    6569             : /*
    6570             :  * Is this process allowed to insert new WAL records?
    6571             :  *
    6572             :  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
    6573             :  * But we also have provisions for forcing the result "true" or "false"
    6574             :  * within specific processes regardless of the global state.
    6575             :  */
    6576             : bool
    6577    61131096 : XLogInsertAllowed(void)
    6578             : {
    6579             :     /*
    6580             :      * If LocalXLogInsertAllowed is non-negative it holds an "unconditionally
    6581             :      * true" or "unconditionally false" answer, so just return it.  This provides
    6582             :      * the normal fast path once recovery is known done.
    6583             :      */
    6584    61131096 :     if (LocalXLogInsertAllowed >= 0)
    6585    60903142 :         return (bool) LocalXLogInsertAllowed;
    6586             : 
    6587             :     /*
    6588             :      * Else, must check to see if we're still in recovery.
    6589             :      */
    6590      227954 :     if (RecoveryInProgress())
    6591      211972 :         return false;
    6592             : 
    6593             :     /*
    6594             :      * On exit from recovery, reset to "unconditionally true", since there is
    6595             :      * no need to keep checking.
    6596             :      */
    6597       15982 :     LocalXLogInsertAllowed = 1;
    6598       15982 :     return true;
    6599             : }
    6600             : 
    6601             : /*
    6602             :  * Make XLogInsertAllowed() return true in the current process only.
    6603             :  *
    6604             :  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
    6605             :  * and even call LocalSetXLogInsertAllowed() again after that.
    6606             :  *
    6607             :  * Returns the value LocalXLogInsertAllowed had before this call.
    6608             :  */
    6609             : static int
    6610        1786 : LocalSetXLogInsertAllowed(void)
    6611             : {
    6612        1786 :     int         oldXLogAllowed = LocalXLogInsertAllowed;
    6613             : 
    6614        1786 :     LocalXLogInsertAllowed = 1;
    6615             : 
    6616        1786 :     return oldXLogAllowed;
    6617             : }
    6618             : 
    6619             : /*
    6620             :  * Return the current Redo pointer from shared memory.
    6621             :  *
    6622             :  * As a side-effect, the local RedoRecPtr copy is advanced if it was behind.
    6623             :  */
    6624             : XLogRecPtr
    6625      593468 : GetRedoRecPtr(void)
    6626             : {
    6627             :     XLogRecPtr  ptr;
    6628             : 
    6629             :     /*
    6630             :      * The possibly not up-to-date copy in XLogCtl is enough. Even if we
    6631             :      * grabbed a WAL insertion lock to read the authoritative value in
    6632             :      * Insert->RedoRecPtr, someone might update it just after we've released
    6633             :      * the lock.
    6634             :      */
    6635      593468 :     SpinLockAcquire(&XLogCtl->info_lck);
    6636      593468 :     ptr = XLogCtl->RedoRecPtr;
    6637      593468 :     SpinLockRelease(&XLogCtl->info_lck);
    6638             : 
    6639      593468 :     if (RedoRecPtr < ptr)
    6640        2910 :         RedoRecPtr = ptr;
    6641             : 
    6642      593468 :     return RedoRecPtr;
    6643             : }
    6644             : 
    6645             : /*
    6646             :  * Return, via *RedoRecPtr_p and *doPageWrites_p, the information needed to decide
    6647             :  * whether a modified block needs a full-page image to be included in the WAL record.
    6648             :  *
    6649             :  * The returned values are cached copies from backend-private memory, and
    6650             :  * possibly out-of-date or, indeed, uninitialized, in which case they will
    6651             :  * be InvalidXLogRecPtr and false, respectively.  XLogInsertRecord will
    6652             :  * re-check them against up-to-date values, while holding the WAL insert lock.
    6653             :  */
    6654             : void
    6655    29536798 : GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
    6656             : {
    6657    29536798 :     *RedoRecPtr_p = RedoRecPtr;
    6658    29536798 :     *doPageWrites_p = doPageWrites;
    6659    29536798 : }
    6660             : 
    6661             : /*
    6662             :  * GetInsertRecPtr -- Returns the current insert position.
    6663             :  *
    6664             :  * NOTE: The value *actually* returned is XLogCtl->LogwrtRqst.Write, the position
    6665             :  * of the last full xlog page. It lags behind the real insert position by at most
    6666             :  * 1 page.  Getting that value doesn't require scanning through WAL insertion locks,
    6667             :  * and an approximation is enough for the current usage of this function.
    6668             :  */
    6669             : XLogRecPtr
    6670       14320 : GetInsertRecPtr(void)
    6671             : {
    6672             :     XLogRecPtr  recptr;
    6673             : 
    6674       14320 :     SpinLockAcquire(&XLogCtl->info_lck);
    6675       14320 :     recptr = XLogCtl->LogwrtRqst.Write;
    6676       14320 :     SpinLockRelease(&XLogCtl->info_lck);
    6677             : 
    6678       14320 :     return recptr;
    6679             : }
    6680             : 
    6681             : /*
    6682             :  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
    6683             :  * position known to be fsync'd to disk. This should only be used on a system that is
    6684             :  * known not to be in recovery.  If insertTLI is not NULL, *insertTLI is set to the insert TLI.
    6685             :  */
    6686             : XLogRecPtr
    6687      476990 : GetFlushRecPtr(TimeLineID *insertTLI)
    6688             : {
    6689             :     Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE);
    6690             : 
    6691      476990 :     RefreshXLogWriteResult(LogwrtResult);
    6692             : 
    6693             :     /*
    6694             :      * If we're writing and flushing WAL, the time line can't be changing, so
    6695             :      * no lock is required.
    6696             :      */
    6697      476990 :     if (insertTLI)
    6698       46530 :         *insertTLI = XLogCtl->InsertTimeLineID;
    6699             : 
    6700      476990 :     return LogwrtResult.Flush;
    6701             : }
    6702             : 
    6703             : /*
    6704             :  * GetWALInsertionTimeLine -- Returns the current timeline of a system that
    6705             :  * is not in recovery (asserted via SharedRecoveryState).
    6706             :  */
    6707             : TimeLineID
    6708      219232 : GetWALInsertionTimeLine(void)
    6709             : {
    6710             :     Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE);
    6711             : 
    6712             :     /* Since the value can't be changing, no lock is required. */
    6713      219232 :     return XLogCtl->InsertTimeLineID;
    6714             : }
    6715             : 
    6716             : /*
    6717             :  * GetWALInsertionTimeLineIfSet -- If the system is not in recovery, returns
    6718             :  * the WAL insertion timeline; else, returns 0. Wherever possible, use
    6719             :  * GetWALInsertionTimeLine() instead, since it's cheaper (no spinlock). Note that this
    6720             :  * function decides recovery has ended as soon as the insert TLI is set, which
    6721             :  * happens before we set XLogCtl->SharedRecoveryState to RECOVERY_STATE_DONE.
    6722             :  */
    6723             : TimeLineID
    6724           0 : GetWALInsertionTimeLineIfSet(void)
    6725             : {
    6726             :     TimeLineID  insertTLI;
    6727             : 
    6728           0 :     SpinLockAcquire(&XLogCtl->info_lck);
    6729           0 :     insertTLI = XLogCtl->InsertTimeLineID;
    6730           0 :     SpinLockRelease(&XLogCtl->info_lck);
    6731             : 
    6732           0 :     return insertTLI;
    6733             : }
    6734             : 
    6735             : /*
    6736             :  * GetLastImportantRecPtr -- Returns the LSN of the last important record
    6737             :  * inserted. All records not explicitly marked as unimportant are considered
    6738             :  * important.
    6739             :  *
    6740             :  * The LSN is determined by computing the maximum of
    6741             :  * WALInsertLocks[i].l.lastImportantAt over all NUM_XLOGINSERT_LOCKS insertion locks.
    6742             :  */
    6743             : XLogRecPtr
    6744        3080 : GetLastImportantRecPtr(void)
    6745             : {
    6746        3080 :     XLogRecPtr  res = InvalidXLogRecPtr;
    6747             :     int         i;
    6748             : 
    6749       27720 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    6750             :     {
    6751             :         XLogRecPtr  last_important;
    6752             : 
    6753             :         /*
    6754             :          * Need to take a lock to prevent torn reads of the LSN, which are
    6755             :          * possible on some of the supported platforms. WAL insert locks only
    6756             :          * support exclusive mode, so we have to use that.
    6757             :          */
    6758       24640 :         LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
    6759       24640 :         last_important = WALInsertLocks[i].l.lastImportantAt;
    6760       24640 :         LWLockRelease(&WALInsertLocks[i].l.lock);
    6761             : 
    6762       24640 :         if (res < last_important)
    6763        5388 :             res = last_important;
    6764             :     }
    6765             : 
    6766        3080 :     return res;
    6767             : }
    6768             : 
    6769             : /*
    6770             :  * Get the time (return value) and LSN (*lastSwitchLSN) of the last xlog segment switch.
    6771             :  */
    6772             : pg_time_t
    6773           0 : GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
    6774             : {
    6775             :     pg_time_t   result;
    6776             : 
    6777             :     /* Need WALWriteLock, but shared lock is sufficient */
    6778           0 :     LWLockAcquire(WALWriteLock, LW_SHARED);
    6779           0 :     result = XLogCtl->lastSegSwitchTime;
    6780           0 :     *lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
    6781           0 :     LWLockRelease(WALWriteLock);
    6782             : 
    6783           0 :     return result;
    6784             : }
    6785             : 
    6786             : /*
    6787             :  * This must be called ONCE during postmaster or standalone-backend shutdown.  The code and arg parameters are not used.
    6788             :  */
    6789             : void
    6790        1238 : ShutdownXLOG(int code, Datum arg)
    6791             : {
    6792             :     /*
    6793             :      * We should have an aux process resource owner to use, and we should not
    6794             :      * be in a transaction that's installed some other resowner.
    6795             :      */
    6796             :     Assert(AuxProcessResourceOwner != NULL);
    6797             :     Assert(CurrentResourceOwner == NULL ||
    6798             :            CurrentResourceOwner == AuxProcessResourceOwner);
    6799        1238 :     CurrentResourceOwner = AuxProcessResourceOwner;
    6800             : 
    6801             :     /* Don't be chatty in standalone mode: report at NOTICE rather than LOG there */
    6802        1238 :     ereport(IsPostmasterEnvironment ? LOG : NOTICE,
    6803             :             (errmsg("shutting down")));
    6804             : 
    6805             :     /*
    6806             :      * Signal walsenders to move to stopping state.
    6807             :      */
    6808        1238 :     WalSndInitStopping();
    6809             : 
    6810             :     /*
    6811             :      * Wait for WAL senders to be in stopping state.  This prevents commands
    6812             :      * from writing new WAL.
    6813             :      */
    6814        1238 :     WalSndWaitStopping();
    6815             : 
    6816        1238 :     if (RecoveryInProgress())
    6817         110 :         CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
    6818             :     else
    6819             :     {
    6820             :         /*
    6821             :          * If archiving is enabled, rotate the last XLOG file so that all the
    6822             :          * remaining records are archived (postmaster wakes up the archiver
    6823             :          * process one more time at the end of shutdown). The checkpoint
    6824             :          * record will go to the next XLOG file and won't be archived (yet).
    6825             :          */
    6826        1128 :         if (XLogArchivingActive())
    6827          28 :             RequestXLogSwitch(false);
    6828             : 
    6829        1128 :         CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
    6830             :     }
    6831        1238 : }
    6832             : 
    6833             : /*
    6834             :  * Log start of a checkpoint (or restartpoint), listing the request flags that are set.
    6835             :  */
    6836             : static void
    6837        2772 : LogCheckpointStart(int flags, bool restartpoint)
    6838             : {
    6839        2772 :     if (restartpoint)
    6840         382 :         ereport(LOG,
    6841             :         /* translator: the placeholders show checkpoint options */
    6842             :                 (errmsg("restartpoint starting:%s%s%s%s%s%s%s%s",
    6843             :                         (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
    6844             :                         (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
    6845             :                         (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
    6846             :                         (flags & CHECKPOINT_FORCE) ? " force" : "",
    6847             :                         (flags & CHECKPOINT_WAIT) ? " wait" : "",
    6848             :                         (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
    6849             :                         (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
    6850             :                         (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
    6851             :     else
    6852        2390 :         ereport(LOG,
    6853             :         /* translator: the placeholders show checkpoint options */
    6854             :                 (errmsg("checkpoint starting:%s%s%s%s%s%s%s%s",
    6855             :                         (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
    6856             :                         (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
    6857             :                         (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
    6858             :                         (flags & CHECKPOINT_FORCE) ? " force" : "",
    6859             :                         (flags & CHECKPOINT_WAIT) ? " wait" : "",
    6860             :                         (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
    6861             :                         (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
    6862             :                         (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
    6863        2772 : }
    6864             : 
    6865             : /*
    6866             :  * Log end of a checkpoint.  Also adds the write and sync times to PendingCheckpointerStats.
    6867             :  */
    6868             : static void
    6869        3368 : LogCheckpointEnd(bool restartpoint)
    6870             : {
    6871             :     long        write_msecs,
    6872             :                 sync_msecs,
    6873             :                 total_msecs,
    6874             :                 longest_msecs,
    6875             :                 average_msecs;
    6876             :     uint64      average_sync_time;
    6877             : 
    6878        3368 :     CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
    6879             : 
    6880        3368 :     write_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_write_t,
    6881             :                                                   CheckpointStats.ckpt_sync_t);
    6882             : 
    6883        3368 :     sync_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_sync_t,
    6884             :                                                  CheckpointStats.ckpt_sync_end_t);
    6885             : 
    6886             :     /* Accumulate checkpoint timing summary data, in milliseconds. */
    6887        3368 :     PendingCheckpointerStats.write_time += write_msecs;
    6888        3368 :     PendingCheckpointerStats.sync_time += sync_msecs;
    6889             : 
    6890             :     /*
    6891             :      * All of the published timing statistics are accounted for.  Only
    6892             :      * continue if a log message is to be written.
    6893             :      */
    6894        3368 :     if (!log_checkpoints)
    6895         596 :         return;
    6896             : 
    6897        2772 :     total_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_start_t,
    6898             :                                                   CheckpointStats.ckpt_end_t);
    6899             : 
    6900             :     /*
    6901             :      * Timing values returned from CheckpointStats are in microseconds.
    6902             :      * Convert to milliseconds (rounding up) for consistent printing.
    6903             :      */
    6904        2772 :     longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000);
    6905             : 
    6906        2772 :     average_sync_time = 0;
    6907        2772 :     if (CheckpointStats.ckpt_sync_rels > 0)
    6908           0 :         average_sync_time = CheckpointStats.ckpt_agg_sync_time /
    6909           0 :             CheckpointStats.ckpt_sync_rels;
    6910        2772 :     average_msecs = (long) ((average_sync_time + 999) / 1000);
    6911             : 
    6912             :     /*
    6913             :      * ControlFileLock is not required to see ControlFile->checkPoint and
    6914             :      * ->checkPointCopy here as we are the only updater of those variables at
    6915             :      * this moment.
    6916             :      */
    6917        2772 :     if (restartpoint)
    6918         382 :         ereport(LOG,
    6919             :                 (errmsg("restartpoint complete: wrote %d buffers (%.1f%%), "
    6920             :                         "wrote %d SLRU buffers; %d WAL file(s) added, "
    6921             :                         "%d removed, %d recycled; write=%ld.%03d s, "
    6922             :                         "sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, "
    6923             :                         "longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, "
    6924             :                         "estimate=%d kB; lsn=%X/%X, redo lsn=%X/%X",
    6925             :                         CheckpointStats.ckpt_bufs_written,
    6926             :                         (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
    6927             :                         CheckpointStats.ckpt_slru_written,
    6928             :                         CheckpointStats.ckpt_segs_added,
    6929             :                         CheckpointStats.ckpt_segs_removed,
    6930             :                         CheckpointStats.ckpt_segs_recycled,
    6931             :                         write_msecs / 1000, (int) (write_msecs % 1000),
    6932             :                         sync_msecs / 1000, (int) (sync_msecs % 1000),
    6933             :                         total_msecs / 1000, (int) (total_msecs % 1000),
    6934             :                         CheckpointStats.ckpt_sync_rels,
    6935             :                         longest_msecs / 1000, (int) (longest_msecs % 1000),
    6936             :                         average_msecs / 1000, (int) (average_msecs % 1000),
    6937             :                         (int) (PrevCheckPointDistance / 1024.0),
    6938             :                         (int) (CheckPointDistanceEstimate / 1024.0),
    6939             :                         LSN_FORMAT_ARGS(ControlFile->checkPoint),
    6940             :                         LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo))));
    6941             :     else
    6942        2390 :         ereport(LOG,
    6943             :                 (errmsg("checkpoint complete: wrote %d buffers (%.1f%%), "
    6944             :                         "wrote %d SLRU buffers; %d WAL file(s) added, "
    6945             :                         "%d removed, %d recycled; write=%ld.%03d s, "
    6946             :                         "sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, "
    6947             :                         "longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, "
    6948             :                         "estimate=%d kB; lsn=%X/%X, redo lsn=%X/%X",
    6949             :                         CheckpointStats.ckpt_bufs_written,
    6950             :                         (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
    6951             :                         CheckpointStats.ckpt_slru_written,
    6952             :                         CheckpointStats.ckpt_segs_added,
    6953             :                         CheckpointStats.ckpt_segs_removed,
    6954             :                         CheckpointStats.ckpt_segs_recycled,
    6955             :                         write_msecs / 1000, (int) (write_msecs % 1000),
    6956             :                         sync_msecs / 1000, (int) (sync_msecs % 1000),
    6957             :                         total_msecs / 1000, (int) (total_msecs % 1000),
    6958             :                         CheckpointStats.ckpt_sync_rels,
    6959             :                         longest_msecs / 1000, (int) (longest_msecs % 1000),
    6960             :                         average_msecs / 1000, (int) (average_msecs % 1000),
    6961             :                         (int) (PrevCheckPointDistance / 1024.0),
    6962             :                         (int) (CheckPointDistanceEstimate / 1024.0),
    6963             :                         LSN_FORMAT_ARGS(ControlFile->checkPoint),
    6964             :                         LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo))));
    6965             : }
    6966             : 
    6967             : /*
    6968             :  * Update the estimate of distance between checkpoints.  'nbytes' is the
    6969             :  * distance just covered; CreateCheckPoint() passes new minus prior redo ptr.
    6970             :  * The estimate is used to calculate the number of WAL segments to keep
    6971             :  * preallocated, see XLOGfileslop().
    6972             :  */
    6973             : static void
    6974        3368 : UpdateCheckPointDistanceEstimate(uint64 nbytes)
    6975             : {
    6976             :     /*
    6977             :      * To estimate the number of segments consumed between checkpoints, keep a
    6978             :      * moving average of the amount of WAL generated in previous checkpoint
    6979             :      * cycles. However, if the load is bursty, with quiet periods and busy
    6980             :      * periods, we want to cater for the peak load. So instead of a plain
    6981             :      * moving average, let the average decline slowly if the previous cycle
    6982             :      * used less WAL than estimated, but bump it up immediately if it used
    6983             :      * more.
    6984             :      *
    6985             :      * When checkpoints are triggered by max_wal_size, this should converge to
    6986             :      * CheckpointSegments * wal_segment_size.
    6987             :      *
    6988             :      * Note: This doesn't pay any attention to what caused the checkpoint.
    6989             :      * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
    6990             :      * starting a base backup, are counted the same as those created
    6991             :      * automatically. The slow-decline will largely mask them out, if they are
    6992             :      * not frequent. If they are frequent, it seems reasonable to count them
    6993             :      * in as any others; if you issue a manual checkpoint every 5 minutes and
    6994             :      * never let a timed checkpoint happen, it makes sense to base the
    6995             :      * preallocation on that 5 minute interval rather than whatever
    6996             :      * checkpoint_timeout is set to.
    6997             :      */
    6998        3368 :     PrevCheckPointDistance = nbytes;    /* raw value, logged as "distance" */
    6999        3368 :     if (CheckPointDistanceEstimate < nbytes)
    7000        1412 :         CheckPointDistanceEstimate = nbytes;    /* busier cycle: jump up at once */
    7001             :     else
    7002        1956 :         CheckPointDistanceEstimate =
    7003        1956 :             (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);   /* otherwise: 90/10 blend, slow decay */
    7004        3368 : }
    7005             : 
    7006             : /*
    7007             :  * Update the ps display for a process running a checkpoint ('restartpoint'
    7008             :  * picks the wording; 'reset' blanks the display).  This routine should not do
    7009             :  * any allocations so that it can be called from a critical section.
    7010             :  */
    7011             : static void
    7012        6736 : update_checkpoint_display(int flags, bool restartpoint, bool reset)
    7013             : {
    7014             :     /*
    7015             :      * The status is reported only for end-of-recovery and shutdown
    7016             :      * checkpoints or shutdown restartpoints.  Updating the ps display is
    7017             :      * useful in those situations as it may not be possible to rely on
    7018             :      * pg_stat_activity to see the status of the checkpointer or the startup
    7019             :      * process.
    7020             :      */
    7021        6736 :     if ((flags & (CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IS_SHUTDOWN)) == 0)
    7022        4280 :         return;     /* neither flag set: leave the ps display untouched */
    7023             : 
    7024        2456 :     if (reset)
    7025        1228 :         set_ps_display("");
    7026             :     else
    7027             :     {
    7028             :         char        activitymsg[128];   /* on-stack buffer, so no allocation */
    7029             : 
    7030        3684 :         snprintf(activitymsg, sizeof(activitymsg), "performing %s%s%s",
    7031        1228 :                  (flags & CHECKPOINT_END_OF_RECOVERY) ? "end-of-recovery " : "",
    7032        1228 :                  (flags & CHECKPOINT_IS_SHUTDOWN) ? "shutdown " : "",
    7033             :                  restartpoint ? "restartpoint" : "checkpoint");
    7034        1228 :         set_ps_display(activitymsg);
    7035             :     }
    7036             : }
    7037             : 
    7038             : 
    7039             : /*
    7040             :  * Perform a checkpoint --- either during shutdown, or on-the-fly
    7041             :  *
    7042             :  * flags is a bitwise OR of the following:
    7043             :  *  CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
    7044             :  *  CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
    7045             :  *  CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
    7046             :  *      ignoring checkpoint_completion_target parameter.
    7047             :  *  CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
    7048             :  *      since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
    7049             :  *      CHECKPOINT_END_OF_RECOVERY).
    7050             :  *  CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
    7051             :  *
    7052             :  * Note: flags contains other bits, of interest here only for logging purposes.
    7053             :  * In particular note that this routine is synchronous and does not pay
    7054             :  * attention to CHECKPOINT_WAIT.
    7055             :  *
    7056             :  * If !shutdown then we are writing an online checkpoint. An XLOG_CHECKPOINT_REDO
    7057             :  * record is inserted into WAL at the logical location of the checkpoint, before
    7058             :  * flushing anything to disk, and when the checkpoint is eventually completed,
    7059             :  * it is from this point that WAL replay will begin in the case of a recovery
    7060             :  * from this checkpoint. Once everything is written to disk, an
    7061             :  * XLOG_CHECKPOINT_ONLINE record is written to complete the checkpoint, and
    7062             :  * points back to the earlier XLOG_CHECKPOINT_REDO record. This mechanism allows
    7063             :  * other write-ahead log records to be written while the checkpoint is in
    7064             :  * progress, but we must be very careful about order of operations. This function
    7065             :  * may take many minutes to execute on a busy system.
    7066             :  *
    7067             :  * On the other hand, when shutdown is true, concurrent insertion into the
    7068             :  * write-ahead log is impossible, so there is no need for two separate records.
    7069             :  * In this case, we only insert an XLOG_CHECKPOINT_SHUTDOWN record, and it's
    7070             :  * both the record marking the completion of the checkpoint and the location
    7071             :  * from which WAL replay would begin if needed.
    7072             :  *
    7073             :  * Returns true if a new checkpoint was performed, or false if it was skipped
    7074             :  * because the system was idle.
    7075             :  */
    7076             : bool
    7077        2996 : CreateCheckPoint(int flags)
    7078             : {
    7079             :     bool        shutdown;
    7080             :     CheckPoint  checkPoint;
    7081             :     XLogRecPtr  recptr;
    7082             :     XLogSegNo   _logSegNo;
    7083        2996 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    7084             :     uint32      freespace;
    7085             :     XLogRecPtr  PriorRedoPtr;
    7086             :     XLogRecPtr  last_important_lsn;
    7087             :     VirtualTransactionId *vxids;
    7088             :     int         nvxids;
    7089        2996 :     int         oldXLogAllowed = 0;
    7090             : 
    7091             :     /*
    7092             :      * An end-of-recovery checkpoint is really a shutdown checkpoint, just
    7093             :      * issued at a different time.
    7094             :      */
    7095        2996 :     if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
    7096        1186 :         shutdown = true;
    7097             :     else
    7098        1810 :         shutdown = false;
    7099             : 
    7100             :     /* sanity check */
    7101        2996 :     if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
    7102           0 :         elog(ERROR, "can't create a checkpoint during recovery");
    7103             : 
    7104             :     /*
    7105             :      * Prepare to accumulate statistics.
    7106             :      *
    7107             :      * Note: because it is possible for log_checkpoints to change while a
    7108             :      * checkpoint proceeds, we always accumulate stats, even if
    7109             :      * log_checkpoints is currently off.
    7110             :      */
    7111       32956 :     MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
    7112        2996 :     CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
    7113             : 
    7114             :     /*
    7115             :      * Let smgr prepare for checkpoint; this has to happen outside the
    7116             :      * critical section and before we determine the REDO pointer.  Note that
    7117             :      * smgr must not do anything that'd have to be undone if we decide no
    7118             :      * checkpoint is needed.
    7119             :      */
    7120        2996 :     SyncPreCheckpoint();
    7121             : 
    7122             :     /*
    7123             :      * Use a critical section to force system panic if we have trouble.
    7124             :      */
    7125        2996 :     START_CRIT_SECTION();
    7126             : 
    7127        2996 :     if (shutdown)
    7128             :     {
    7129        1186 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    7130        1186 :         ControlFile->state = DB_SHUTDOWNING;
    7131        1186 :         UpdateControlFile();
    7132        1186 :         LWLockRelease(ControlFileLock);
    7133             :     }
    7134             : 
    7135             :     /* Begin filling in the checkpoint WAL record */
    7136       35952 :     MemSet(&checkPoint, 0, sizeof(checkPoint));
    7137        2996 :     checkPoint.time = (pg_time_t) time(NULL);
    7138             : 
    7139             :     /*
    7140             :      * For Hot Standby, derive the oldestActiveXid before we fix the redo
    7141             :      * pointer. This allows us to begin accumulating changes to assemble our
    7142             :      * starting snapshot of locks and transactions.
    7143             :      */
    7144        2996 :     if (!shutdown && XLogStandbyInfoActive())
    7145        1712 :         checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
    7146             :     else
    7147        1284 :         checkPoint.oldestActiveXid = InvalidTransactionId;
    7148             : 
    7149             :     /*
    7150             :      * Get location of last important record before acquiring insert locks (as
    7151             :      * GetLastImportantRecPtr() also locks WAL locks).
    7152             :      */
    7153        2996 :     last_important_lsn = GetLastImportantRecPtr();
    7154             : 
    7155             :     /*
    7156             :      * If this isn't a shutdown or forced checkpoint, and if there has been no
    7157             :      * WAL activity requiring a checkpoint, skip it.  The idea here is to
    7158             :      * avoid inserting duplicate checkpoints when the system is idle.
    7159             :      */
    7160        2996 :     if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
    7161             :                   CHECKPOINT_FORCE)) == 0)
    7162             :     {
    7163         384 :         if (last_important_lsn == ControlFile->checkPoint)
    7164             :         {
    7165          10 :             END_CRIT_SECTION();
    7166          10 :             ereport(DEBUG1,
    7167             :                     (errmsg_internal("checkpoint skipped because system is idle")));
    7168          10 :             return false;
    7169             :         }
    7170             :     }
    7171             : 
    7172             :     /*
    7173             :      * An end-of-recovery checkpoint is created before anyone is allowed to
    7174             :      * write WAL. To allow us to write the checkpoint record, temporarily
    7175             :      * enable XLogInsertAllowed.
    7176             :      */
    7177        2986 :     if (flags & CHECKPOINT_END_OF_RECOVERY)
    7178          58 :         oldXLogAllowed = LocalSetXLogInsertAllowed();
    7179             : 
    7180        2986 :     checkPoint.ThisTimeLineID = XLogCtl->InsertTimeLineID;
    7181        2986 :     if (flags & CHECKPOINT_END_OF_RECOVERY)
    7182          58 :         checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
    7183             :     else
    7184        2928 :         checkPoint.PrevTimeLineID = checkPoint.ThisTimeLineID;
    7185             : 
    7186             :     /*
    7187             :      * We must block concurrent insertions while examining insert state.
    7188             :      */
    7189        2986 :     WALInsertLockAcquireExclusive();
    7190             : 
    7191        2986 :     checkPoint.fullPageWrites = Insert->fullPageWrites;
    7192        2986 :     checkPoint.wal_level = wal_level;
    7193             : 
    7194        2986 :     if (shutdown)
    7195             :     {
    7196        1186 :         XLogRecPtr  curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
    7197             : 
    7198             :         /*
    7199             :          * Compute new REDO record ptr = location of next XLOG record.
    7200             :          *
    7201             :          * Since this is a shutdown checkpoint, there can't be any concurrent
    7202             :          * WAL insertion.
    7203             :          */
    7204        1186 :         freespace = INSERT_FREESPACE(curInsert);
    7205        1186 :         if (freespace == 0)
    7206             :         {
    7207           0 :             if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
    7208           0 :                 curInsert += SizeOfXLogLongPHD;
    7209             :             else
    7210           0 :                 curInsert += SizeOfXLogShortPHD;
    7211             :         }
    7212        1186 :         checkPoint.redo = curInsert;
    7213             : 
    7214             :         /*
    7215             :          * Here we update the shared RedoRecPtr for future XLogInsert calls;
    7216             :          * this must be done while holding all the insertion locks.
    7217             :          *
    7218             :          * Note: if we fail to complete the checkpoint, RedoRecPtr will be
    7219             :          * left pointing past where it really needs to point.  This is okay;
    7220             :          * the only consequence is that XLogInsert might back up whole buffers
    7221             :          * that it didn't really need to.  We can't postpone advancing
    7222             :          * RedoRecPtr because XLogInserts that happen while we are dumping
    7223             :          * buffers must assume that their buffer changes are not included in
    7224             :          * the checkpoint.
    7225             :          */
    7226        1186 :         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
    7227             :     }
    7228             : 
    7229             :     /*
    7230             :      * Now we can release the WAL insertion locks, allowing other xacts to
    7231             :      * proceed while we are flushing disk buffers.
    7232             :      */
    7233        2986 :     WALInsertLockRelease();
    7234             : 
    7235             :     /*
    7236             :      * If this is an online checkpoint, we have not yet determined the redo
    7237             :      * point. We do so now by inserting the special XLOG_CHECKPOINT_REDO
    7238             :      * record; the LSN at which it starts becomes the new redo pointer. We
    7239             :      * don't do this for a shutdown checkpoint, because in that case no WAL
    7240             :      * can be written between the redo point and the insertion of the
    7241             :      * checkpoint record itself, so the checkpoint record itself serves to
    7242             :      * mark the redo point.
    7243             :      */
    7244        2986 :     if (!shutdown)
    7245             :     {
    7246             :         /* Include WAL level in record for WAL summarizer's benefit. */
    7247        1800 :         XLogBeginInsert();
    7248        1800 :         XLogRegisterData(&wal_level, sizeof(wal_level));
    7249        1800 :         (void) XLogInsert(RM_XLOG_ID, XLOG_CHECKPOINT_REDO);
    7250             : 
    7251             :         /*
    7252             :          * XLogInsertRecord will have updated XLogCtl->Insert.RedoRecPtr in
    7253             :          * shared memory and RedoRecPtr in backend-local memory, but we need
    7254             :          * to copy that into the record that will be inserted when the
    7255             :          * checkpoint is complete.
    7256             :          */
    7257        1800 :         checkPoint.redo = RedoRecPtr;
    7258             :     }
    7259             : 
    7260             :     /* Update the info_lck-protected copy of RedoRecPtr as well */
    7261        2986 :     SpinLockAcquire(&XLogCtl->info_lck);
    7262        2986 :     XLogCtl->RedoRecPtr = checkPoint.redo;
    7263        2986 :     SpinLockRelease(&XLogCtl->info_lck);
    7264             : 
    7265             :     /*
    7266             :      * If enabled, log checkpoint start.  We postpone this until now so as not
    7267             :      * to log anything if we decided to skip the checkpoint.
    7268             :      */
    7269        2986 :     if (log_checkpoints)
    7270        2390 :         LogCheckpointStart(flags, false);
    7271             : 
    7272             :     /* Update the process title */
    7273        2986 :     update_checkpoint_display(flags, false, false);
    7274             : 
    7275             :     TRACE_POSTGRESQL_CHECKPOINT_START(flags);
    7276             : 
    7277             :     /*
    7278             :      * Get the other info we need for the checkpoint record.
    7279             :      *
    7280             :      * We don't need to save oldestClogXid in the checkpoint, it only matters
    7281             :      * for the short period in which clog is being truncated, and if we crash
    7282             :      * during that we'll redo the clog truncation and fix up oldestClogXid
    7283             :      * there.
    7284             :      */
    7285        2986 :     LWLockAcquire(XidGenLock, LW_SHARED);
    7286        2986 :     checkPoint.nextXid = TransamVariables->nextXid;
    7287        2986 :     checkPoint.oldestXid = TransamVariables->oldestXid;
    7288        2986 :     checkPoint.oldestXidDB = TransamVariables->oldestXidDB;
    7289        2986 :     LWLockRelease(XidGenLock);
    7290             : 
    7291        2986 :     LWLockAcquire(CommitTsLock, LW_SHARED);
    7292        2986 :     checkPoint.oldestCommitTsXid = TransamVariables->oldestCommitTsXid;
    7293        2986 :     checkPoint.newestCommitTsXid = TransamVariables->newestCommitTsXid;
    7294        2986 :     LWLockRelease(CommitTsLock);
    7295             : 
    7296        2986 :     LWLockAcquire(OidGenLock, LW_SHARED);
    7297        2986 :     checkPoint.nextOid = TransamVariables->nextOid;
    7298        2986 :     if (!shutdown)
    7299        1800 :         checkPoint.nextOid += TransamVariables->oidCount;
    7300        2986 :     LWLockRelease(OidGenLock);
    7301             : 
    7302        2986 :     MultiXactGetCheckptMulti(shutdown,
    7303             :                              &checkPoint.nextMulti,
    7304             :                              &checkPoint.nextMultiOffset,
    7305             :                              &checkPoint.oldestMulti,
    7306             :                              &checkPoint.oldestMultiDB);
    7307             : 
    7308             :     /*
    7309             :      * Having constructed the checkpoint record, ensure all shmem disk buffers
    7310             :      * and commit-log buffers are flushed to disk.
    7311             :      *
    7312             :      * This I/O could fail for various reasons.  If so, we will fail to
    7313             :      * complete the checkpoint, but there is no reason to force a system
    7314             :      * panic. Accordingly, exit critical section while doing it.
    7315             :      */
    7316        2986 :     END_CRIT_SECTION();
    7317             : 
    7318             :     /*
    7319             :      * In some cases there are groups of actions that must all occur on one
    7320             :      * side or the other of a checkpoint record. Before flushing the
    7321             :      * checkpoint record we must explicitly wait for any backend currently
    7322             :      * performing those groups of actions.
    7323             :      *
    7324             :      * One example is end of transaction, so we must wait for any transactions
    7325             :      * that are currently in commit critical sections.  If an xact inserted
    7326             :      * its commit record into XLOG just before the REDO point, then a crash
    7327             :      * restart from the REDO point would not replay that record, which means
    7328             :      * that our flushing had better include the xact's update of pg_xact.  So
    7329             :      * we wait till he's out of his commit critical section before proceeding.
    7330             :      * See notes in RecordTransactionCommit().
    7331             :      *
    7332             :      * Because we've already released the insertion locks, this test is a bit
    7333             :      * fuzzy: it is possible that we will wait for xacts we didn't really need
    7334             :      * to wait for.  But the delay should be short and it seems better to make
    7335             :      * checkpoint take a bit longer than to hold off insertions longer than
    7336             :      * necessary. (In fact, the whole reason we have this issue is that xact.c
    7337             :      * does commit record XLOG insertion and clog update as two separate steps
    7338             :      * protected by different locks, but again that seems best on grounds of
    7339             :      * minimizing lock contention.)
    7340             :      *
    7341             :      * A transaction that has not yet set delayChkptFlags when we look cannot
    7342             :      * be at risk, since it has not inserted its commit record yet; and one
    7343             :      * that's already cleared it is not at risk either, since it's done fixing
    7344             :      * clog and we will correctly flush the update below.  So we cannot miss
    7345             :      * any xacts we need to wait for.
    7346             :      */
    7347        2986 :     vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);
    7348        2986 :     if (nvxids > 0)
    7349             :     {
    7350             :         do
    7351             :         {
    7352             :             /*
    7353             :              * Keep absorbing fsync requests while we wait. There could even
    7354             :              * be a deadlock if we don't, if the process that prevents the
    7355             :              * checkpoint is trying to add a request to the queue.
    7356             :              */
    7357          98 :             AbsorbSyncRequests();
    7358             : 
    7359          98 :             pgstat_report_wait_start(WAIT_EVENT_CHECKPOINT_DELAY_START);
    7360          98 :             pg_usleep(10000L);  /* wait for 10 msec */
    7361          98 :             pgstat_report_wait_end();
    7362          98 :         } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
    7363             :                                               DELAY_CHKPT_START));
    7364             :     }
    7365        2986 :     pfree(vxids);
    7366             : 
    7367        2986 :     CheckPointGuts(checkPoint.redo, flags);
    7368             : 
    7369        2986 :     vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE);
    7370        2986 :     if (nvxids > 0)
    7371             :     {
    7372             :         do
    7373             :         {
    7374           0 :             AbsorbSyncRequests();
    7375             : 
    7376           0 :             pgstat_report_wait_start(WAIT_EVENT_CHECKPOINT_DELAY_COMPLETE);
    7377           0 :             pg_usleep(10000L);  /* wait for 10 msec */
    7378           0 :             pgstat_report_wait_end();
    7379           0 :         } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
    7380             :                                               DELAY_CHKPT_COMPLETE));
    7381             :     }
    7382        2986 :     pfree(vxids);
    7383             : 
    7384             :     /*
    7385             :      * Take a snapshot of running transactions and write this to WAL. This
    7386             :      * allows us to reconstruct the state of running transactions during
    7387             :      * archive recovery, if required. Skip, if this info disabled.
    7388             :      *
    7389             :      * If we are shutting down, or Startup process is completing crash
    7390             :      * recovery we don't need to write running xact data.
    7391             :      */
    7392        2986 :     if (!shutdown && XLogStandbyInfoActive())
    7393        1702 :         LogStandbySnapshot();
    7394             : 
    7395        2986 :     START_CRIT_SECTION();
    7396             : 
    7397             :     /*
    7398             :      * Now insert the checkpoint record into XLOG.
    7399             :      */
    7400        2986 :     XLogBeginInsert();
    7401        2986 :     XLogRegisterData(&checkPoint, sizeof(checkPoint));
    7402        2986 :     recptr = XLogInsert(RM_XLOG_ID,
    7403             :                         shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
    7404             :                         XLOG_CHECKPOINT_ONLINE);
    7405             : 
    7406        2986 :     XLogFlush(recptr);
    7407             : 
    7408             :     /*
    7409             :      * We mustn't write any new WAL after a shutdown checkpoint, or it will be
    7410             :      * overwritten at next startup.  No-one should even try, this just allows
    7411             :      * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
    7412             :      * to just temporarily disable writing until the system has exited
    7413             :      * recovery.
    7414             :      */
    7415        2986 :     if (shutdown)
    7416             :     {
    7417        1186 :         if (flags & CHECKPOINT_END_OF_RECOVERY)
    7418          58 :             LocalXLogInsertAllowed = oldXLogAllowed;
    7419             :         else
    7420        1128 :             LocalXLogInsertAllowed = 0; /* never again write WAL */
    7421             :     }
    7422             : 
    7423             :     /*
    7424             :      * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
    7425             :      * = end of actual checkpoint record.
    7426             :      */
    7427        2986 :     if (shutdown && checkPoint.redo != ProcLastRecPtr)
    7428           0 :         ereport(PANIC,
    7429             :                 (errmsg("concurrent write-ahead log activity while database system is shutting down")));
    7430             : 
    7431             :     /*
    7432             :      * Remember the prior checkpoint's redo ptr for
    7433             :      * UpdateCheckPointDistanceEstimate()
    7434             :      */
    7435        2986 :     PriorRedoPtr = ControlFile->checkPointCopy.redo;
    7436             : 
    7437             :     /*
    7438             :      * Update the control file.
    7439             :      */
    7440        2986 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    7441        2986 :     if (shutdown)
    7442        1186 :         ControlFile->state = DB_SHUTDOWNED;
    7443        2986 :     ControlFile->checkPoint = ProcLastRecPtr;
    7444        2986 :     ControlFile->checkPointCopy = checkPoint;
    7445             :     /* crash recovery should always recover to the end of WAL */
    7446        2986 :     ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
    7447        2986 :     ControlFile->minRecoveryPointTLI = 0;
    7448             : 
    7449             :     /*
    7450             :      * Persist unloggedLSN value. It's reset on crash recovery, so this goes
    7451             :      * unused on non-shutdown checkpoints, but seems useful to store it always
    7452             :      * for debugging purposes.
    7453             :      */
    7454        2986 :     ControlFile->unloggedLSN = pg_atomic_read_membarrier_u64(&XLogCtl->unloggedLSN);
    7455             : 
    7456        2986 :     UpdateControlFile();
    7457        2986 :     LWLockRelease(ControlFileLock);
    7458             : 
    7459             :     /* Update shared-memory copy of checkpoint XID/epoch */
    7460        2986 :     SpinLockAcquire(&XLogCtl->info_lck);
    7461        2986 :     XLogCtl->ckptFullXid = checkPoint.nextXid;
    7462        2986 :     SpinLockRelease(&XLogCtl->info_lck);
    7463             : 
    7464             :     /*
    7465             :      * We are now done with critical updates; no need for system panic if we
    7466             :      * have trouble while fooling with old log segments.
    7467             :      */
    7468        2986 :     END_CRIT_SECTION();
    7469             : 
    7470             :     /*
    7471             :      * WAL summaries end when the next XLOG_CHECKPOINT_REDO or
    7472             :      * XLOG_CHECKPOINT_SHUTDOWN record is reached. This is the first point
    7473             :      * where (a) we're not inside of a critical section and (b) we can be
    7474             :      * certain that the relevant record has been flushed to disk, which must
    7475             :      * happen before it can be summarized.
    7476             :      *
    7477             :      * If this is a shutdown checkpoint, then this happens reasonably
    7478             :      * promptly: we've only just inserted and flushed the
    7479             :      * XLOG_CHECKPOINT_SHUTDOWN record. If this is not a shutdown checkpoint,
    7480             :      * then this might not be very prompt at all: the XLOG_CHECKPOINT_REDO
    7481             :      * record was written before we began flushing data to disk, and that
    7482             :      * could be many minutes ago at this point. However, we don't XLogFlush()
    7483             :      * after inserting that record, so we're not guaranteed that it's on disk
    7484             :      * until after the above call that flushes the XLOG_CHECKPOINT_ONLINE
    7485             :      * record.
    7486             :      */
    7487        2986 :     WakeupWalSummarizer();
    7488             : 
    7489             :     /*
    7490             :      * Let smgr do post-checkpoint cleanup (eg, deleting old files).
    7491             :      */
    7492        2986 :     SyncPostCheckpoint();
    7493             : 
    7494             :     /*
    7495             :      * Update the average distance between checkpoints if the prior checkpoint
    7496             :      * exists.
    7497             :      */
    7498        2986 :     if (PriorRedoPtr != InvalidXLogRecPtr)
    7499        2986 :         UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
    7500             : 
    7501             : #ifdef USE_INJECTION_POINTS
    7502        2986 :     INJECTION_POINT("checkpoint-before-old-wal-removal", NULL);
    7503             : #endif
    7504             : 
    7505             :     /*
    7506             :      * Delete old log files, those no longer needed for last checkpoint to
    7507             :      * prevent the disk holding the xlog from growing full.
    7508             :      */
    7509        2986 :     XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
    7510        2986 :     KeepLogSeg(recptr, &_logSegNo);
    7511        2986 :     if (InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_REMOVED | RS_INVAL_IDLE_TIMEOUT,
    7512             :                                            _logSegNo, InvalidOid,
    7513             :                                            InvalidTransactionId))
    7514             :     {
    7515             :         /*
    7516             :          * Some slots have been invalidated; recalculate the old-segment
    7517             :          * horizon, starting again from RedoRecPtr.
    7518             :          */
    7519           6 :         XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
    7520           6 :         KeepLogSeg(recptr, &_logSegNo);
    7521             :     }
    7522        2986 :     _logSegNo--;
    7523        2986 :     RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr,
    7524             :                        checkPoint.ThisTimeLineID);
    7525             : 
    7526             :     /*
    7527             :      * Make more log segments if needed.  (Do this after recycling old log
    7528             :      * segments, since that may supply some of the needed files.)
    7529             :      */
    7530        2986 :     if (!shutdown)
    7531        1800 :         PreallocXlogFiles(recptr, checkPoint.ThisTimeLineID);
    7532             : 
    7533             :     /*
    7534             :      * Truncate pg_subtrans if possible.  We can throw away all data before
    7535             :      * the oldest XMIN of any running transaction.  No future transaction will
    7536             :      * attempt to reference any pg_subtrans entry older than that (see Asserts
    7537             :      * in subtrans.c).  During recovery, though, we mustn't do this because
    7538             :      * StartupSUBTRANS hasn't been called yet.
    7539             :      */
    7540        2986 :     if (!RecoveryInProgress())
    7541        2928 :         TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
    7542             : 
    7543             :     /* Real work is done; log and update stats. */
    7544        2986 :     LogCheckpointEnd(false);
    7545             : 
    7546             :     /* Reset the process title */
    7547        2986 :     update_checkpoint_display(flags, false, true);
    7548             : 
    7549             :     TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
    7550             :                                      NBuffers,
    7551             :                                      CheckpointStats.ckpt_segs_added,
    7552             :                                      CheckpointStats.ckpt_segs_removed,
    7553             :                                      CheckpointStats.ckpt_segs_recycled);
    7554             : 
    7555        2986 :     return true;
    7556             : }
    7557             : 
    7558             : /*
    7559             :  * Mark the end of recovery in WAL, though without running a full checkpoint.
    7560             :  * We can expect that a restartpoint is likely to be in progress as we
    7561             :  * do this, though we are unwilling to wait for it to complete.
    7562             :  *
    7563             :  * CreateRestartPoint() allows for the case where recovery may end before
    7564             :  * the restartpoint completes so there is no concern of concurrent behaviour.
                     :  *
                     :  * The XLOG_END_OF_RECOVERY record carries the current timestamp, wal_level,
                     :  * and the insert and previous timeline IDs, which are read from XLogCtl
                     :  * between WALInsertLockAcquireExclusive() and WALInsertLockRelease().
                     :  * Inside one critical section the record is inserted and flushed, and the
                     :  * control file's minRecoveryPoint and minRecoveryPointTLI are then set to
                     :  * the LSN returned by XLogInsert() and the record's ThisTimeLineID.
                     :  *
                     :  * Raises an ERROR if recovery is not in progress.
    7565             :  */
    7566             : static void
    7567          84 : CreateEndOfRecoveryRecord(void)
    7568             : {
    7569             :     xl_end_of_recovery xlrec;
    7570             :     XLogRecPtr  recptr;
    7571             : 
    7572             :     /* sanity check */
    7573          84 :     if (!RecoveryInProgress())
    7574           0 :         elog(ERROR, "can only be used to end recovery");
    7575             : 
    7576          84 :     xlrec.end_time = GetCurrentTimestamp();
    7577          84 :     xlrec.wal_level = wal_level;
    7578             : 
    7579          84 :     WALInsertLockAcquireExclusive();
    7580          84 :     xlrec.ThisTimeLineID = XLogCtl->InsertTimeLineID;
    7581          84 :     xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
    7582          84 :     WALInsertLockRelease();
    7583             : 
    7584          84 :     START_CRIT_SECTION();
    7585             : 
    7586          84 :     XLogBeginInsert();
    7587          84 :     XLogRegisterData(&xlrec, sizeof(xl_end_of_recovery));
    7588          84 :     recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);
    7589             : 
    7590          84 :     XLogFlush(recptr);
    7591             : 
    7592             :     /*
    7593             :      * Update the control file so that crash recovery can follow the timeline
    7594             :      * changes to this point.
    7595             :      */
    7596          84 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    7597          84 :     ControlFile->minRecoveryPoint = recptr;
    7598          84 :     ControlFile->minRecoveryPointTLI = xlrec.ThisTimeLineID;
    7599          84 :     UpdateControlFile();
    7600          84 :     LWLockRelease(ControlFileLock);
    7601             : 
    7602          84 :     END_CRIT_SECTION();
    7603          84 : }
    7604             : 
    7605             : /*
    7606             :  * Write an OVERWRITE_CONTRECORD message.
    7607             :  *
    7608             :  * When on WAL replay we expect a continuation record at the start of a page
    7609             :  * that is not there, recovery ends and WAL writing resumes at that point.
    7610             :  * But it's wrong to resume writing new WAL back at the start of the record
    7611             :  * that was broken, because downstream consumers of that WAL (physical
    7612             :  * replicas) are not prepared to "rewind".  So the first action after
    7613             :  * finishing replay of all valid WAL must be to write a record of this type
    7614             :  * at the point where the contrecord was missing; to support xlogreader
    7615             :  * detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added
    7616             :  * to the page header where the record occurs.  xlogreader has an ad-hoc
    7617             :  * mechanism to report metadata about the broken record, which is what we
    7618             :  * use here.
    7619             :  *
    7620             :  * At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to
    7621             :  * skip the record it was reading, and pass back the LSN of the skipped
    7622             :  * record, so that its caller can verify (on "replay" of that record) that the
    7623             :  * XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten.
    7624             :  *
    7625             :  * 'aborted_lsn' is the beginning position of the record that was incomplete.
    7626             :  * It is included in the WAL record.  'pagePtr' and 'newTLI' point to the
    7627             :  * beginning of the XLOG page where the record is to be inserted.  They must
    7628             :  * match the current WAL insert position, they're passed here just so that we
    7629             :  * can verify that.
                     :  *
                     :  * Returns the LSN returned by XLogInsert() for the new record, after WAL has
                     :  * been flushed up to it.  Raises an ERROR if recovery is not in progress, if
                     :  * 'pagePtr' is not a multiple of XLOG_BLCKSZ, or if the current insert
                     :  * position is not directly after the page header at 'pagePtr'.
    7630             :  */
    7631             : static XLogRecPtr
    7632          20 : CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr,
    7633             :                                 TimeLineID newTLI)
    7634             : {
    7635             :     xl_overwrite_contrecord xlrec;
    7636             :     XLogRecPtr  recptr;
    7637             :     XLogPageHeader pagehdr;
    7638             :     XLogRecPtr  startPos;
    7639             : 
    7640             :     /* sanity checks */
    7641          20 :     if (!RecoveryInProgress())
    7642           0 :         elog(ERROR, "can only be used at end of recovery");
    7643          20 :     if (pagePtr % XLOG_BLCKSZ != 0)
    7644           0 :         elog(ERROR, "invalid position for missing continuation record %X/%X",
    7645             :              LSN_FORMAT_ARGS(pagePtr));
    7646             : 
    7647             :     /* The current WAL insert position should be right after the page header */
    7648          20 :     startPos = pagePtr;
    7649          20 :     if (XLogSegmentOffset(startPos, wal_segment_size) == 0)
    7650           2 :         startPos += SizeOfXLogLongPHD;
    7651             :     else
    7652          18 :         startPos += SizeOfXLogShortPHD;
    7653          20 :     recptr = GetXLogInsertRecPtr();
    7654          20 :     if (recptr != startPos)
    7655           0 :         elog(ERROR, "invalid WAL insert position %X/%X for OVERWRITE_CONTRECORD",
    7656             :              LSN_FORMAT_ARGS(recptr));
    7657             : 
    7658          20 :     START_CRIT_SECTION();
    7659             : 
    7660             :     /*
    7661             :      * Initialize the XLOG page header (by GetXLogBuffer), and set the
    7662             :      * XLP_FIRST_IS_OVERWRITE_CONTRECORD flag.
    7663             :      *
    7664             :      * No other backend is allowed to write WAL yet, so acquiring the WAL
    7665             :      * insertion lock is just pro forma.
    7666             :      */
    7667          20 :     WALInsertLockAcquire();
    7668          20 :     pagehdr = (XLogPageHeader) GetXLogBuffer(pagePtr, newTLI);
    7669          20 :     pagehdr->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD;
    7670          20 :     WALInsertLockRelease();
    7671             : 
    7672             :     /*
    7673             :      * Insert the XLOG_OVERWRITE_CONTRECORD record as the first record on the
    7674             :      * page.  We know it becomes the first record, because no other backend is
    7675             :      * allowed to write WAL yet.
    7676             :      */
    7677          20 :     XLogBeginInsert();
    7678          20 :     xlrec.overwritten_lsn = aborted_lsn;
    7679          20 :     xlrec.overwrite_time = GetCurrentTimestamp();
    7680          20 :     XLogRegisterData(&xlrec, sizeof(xl_overwrite_contrecord));
    7681          20 :     recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD);
    7682             : 
    7683             :     /*
                     :      * Check that the record was inserted to the right place.
                     :      *
                     :      * NOTE(review): unlike the sanity checks at the top, this one runs
                     :      * inside the critical section started above, so a failure here is
                     :      * presumably escalated beyond a plain ERROR -- confirm that is intended.
                     :      */
    7684          20 :     if (ProcLastRecPtr != startPos)
    7685           0 :         elog(ERROR, "OVERWRITE_CONTRECORD was inserted to unexpected position %X/%X",
    7686             :              LSN_FORMAT_ARGS(ProcLastRecPtr));
    7687             : 
    7688          20 :     XLogFlush(recptr);
    7689             : 
    7690          20 :     END_CRIT_SECTION();
    7691             : 
    7692          20 :     return recptr;
    7693             : }
    7694             : 
    7695             : /*
    7696             :  * Flush all data in shared memory to disk, and fsync.
    7697             :  *
    7698             :  * This is the common code shared between regular checkpoints and
    7699             :  * recovery restartpoints.
                     :  *
                     :  * 'checkPointRedo' is only passed on to CheckPointTwoPhase().  'flags' is
                     :  * passed to CheckPointBuffers() and to the start trace probe, and its
                     :  * CHECKPOINT_IS_SHUTDOWN bit is passed to CheckPointReplicationSlots().
                     :  *
                     :  * The start of the write phase and the start and end of the sync phase are
                     :  * timestamped in CheckpointStats (ckpt_write_t, ckpt_sync_t and
                     :  * ckpt_sync_end_t).
    7700             :  */
    7701             : static void
    7702        3368 : CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
    7703             : {
    7704        3368 :     CheckPointRelationMap();
    7705        3368 :     CheckPointReplicationSlots(flags & CHECKPOINT_IS_SHUTDOWN);
    7706        3368 :     CheckPointSnapBuild();
    7707        3368 :     CheckPointLogicalRewriteHeap();
    7708        3368 :     CheckPointReplicationOrigin();
    7709             : 
    7710             :     /* Write out all dirty data in SLRUs and the main buffer pool */
    7711             :     TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
    7712        3368 :     CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
    7713        3368 :     CheckPointCLOG();
    7714        3368 :     CheckPointCommitTs();
    7715        3368 :     CheckPointSUBTRANS();
    7716        3368 :     CheckPointMultiXact();
    7717        3368 :     CheckPointPredicate();
    7718        3368 :     CheckPointBuffers(flags);
    7719             : 
    7720             :     /* Perform all queued up fsyncs */
    7721             :     TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
    7722        3368 :     CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
    7723        3368 :     ProcessSyncRequests();
    7724        3368 :     CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
    7725             :     TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
    7726             : 
    7727             :     /* We deliberately delay 2PC checkpointing as long as possible */
    7728        3368 :     CheckPointTwoPhase(checkPointRedo);
    7729        3368 : }
    7730             : 
    7731             : /*
    7732             :  * Save a checkpoint for recovery restart if appropriate
    7733             :  *
    7734             :  * This function is called each time a checkpoint record is read from XLOG.
    7735             :  * It must determine whether the checkpoint represents a safe restartpoint or
    7736             :  * not.  If so, the checkpoint record is stashed in shared memory so that
    7737             :  * CreateRestartPoint can consult it.  (Note that the latter function is
    7738             :  * executed by the checkpointer, while this one will be executed by the
    7739             :  * startup process.)
                     :  *
                     :  * 'checkPoint' is copied into XLogCtl->lastCheckPoint, and 'record' supplies
                     :  * ReadRecPtr and EndRecPtr, which are saved as lastCheckPointRecPtr and
                     :  * lastCheckPointEndPtr; all three are stored while holding info_lck.
                     :  * Nothing is stored if XLogHaveInvalidPages() reports true.
    7740             :  */
    7741             : static void
    7742        1414 : RecoveryRestartPoint(const CheckPoint *checkPoint, XLogReaderState *record)
    7743             : {
    7744             :     /*
    7745             :      * Refrain from creating a restartpoint if we have seen any
    7746             :      * references to non-existent pages. Restarting recovery from the
    7747             :      * restartpoint would not see the references, so we would lose the
    7748             :      * cross-check that the pages belonged to a relation that was dropped
    7749             :      * later.
    7750             :      */
    7751        1414 :     if (XLogHaveInvalidPages())
    7752             :     {
    7753           0 :         elog(DEBUG2,
    7754             :              "could not record restart point at %X/%X because there "
    7755             :              "are unresolved references to invalid pages",
    7756             :              LSN_FORMAT_ARGS(checkPoint->redo));
    7757           0 :         return;
    7758             :     }
    7759             : 
    7760             :     /*
    7761             :      * Copy the checkpoint record to shared memory, so that checkpointer can
    7762             :      * work out the next time it wants to perform a restartpoint.
    7763             :      */
    7764        1414 :     SpinLockAcquire(&XLogCtl->info_lck);
    7765        1414 :     XLogCtl->lastCheckPointRecPtr = record->ReadRecPtr;
    7766        1414 :     XLogCtl->lastCheckPointEndPtr = record->EndRecPtr;
    7767        1414 :     XLogCtl->lastCheckPoint = *checkPoint;
    7768        1414 :     SpinLockRelease(&XLogCtl->info_lck);
    7769             : }
    7770             : 
    7771             : /*
    7772             :  * Establish a restartpoint if possible.
    7773             :  *
    7774             :  * This is similar to CreateCheckPoint, but is used during WAL recovery
    7775             :  * to establish a point from which recovery can roll forward without
    7776             :  * replaying the entire recovery log.
    7777             :  *
    7778             :  * Returns true if a new restartpoint was established. We can only establish
    7779             :  * a restartpoint if we have replayed a safe checkpoint record since last
    7780             :  * restartpoint.
                     :  *
                     :  * 'flags' is passed on to LogCheckpointStart(), update_checkpoint_display()
                     :  * and CheckPointGuts().  If CHECKPOINT_IS_SHUTDOWN is set, the control
                     :  * file's state is changed to DB_SHUTDOWNED_IN_RECOVERY: unconditionally when
                     :  * the restartpoint is skipped as already performed, otherwise only as part
                     :  * of the control file update and only if the file still shows
                     :  * DB_IN_ARCHIVE_RECOVERY.
                     :  *
                     :  * Returns false if recovery has already ended, or if no checkpoint record
                     :  * newer than the control file's current one has been saved by
                     :  * RecoveryRestartPoint().
    7781             :  */
    7782             : bool
    7783        1206 : CreateRestartPoint(int flags)
    7784             : {
    7785             :     XLogRecPtr  lastCheckPointRecPtr;
    7786             :     XLogRecPtr  lastCheckPointEndPtr;
    7787             :     CheckPoint  lastCheckPoint;
    7788             :     XLogRecPtr  PriorRedoPtr;
    7789             :     XLogRecPtr  receivePtr;
    7790             :     XLogRecPtr  replayPtr;
    7791             :     TimeLineID  replayTLI;
    7792             :     XLogRecPtr  endptr;
    7793             :     XLogSegNo   _logSegNo;
    7794             :     TimestampTz xtime;
    7795             : 
    7796             :     /* Concurrent checkpoint/restartpoint cannot happen */
    7797             :     Assert(!IsUnderPostmaster || MyBackendType == B_CHECKPOINTER);
    7798             : 
    7799             :     /* Get a local copy of the last safe checkpoint record. */
    7800        1206 :     SpinLockAcquire(&XLogCtl->info_lck);
    7801        1206 :     lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
    7802        1206 :     lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
    7803        1206 :     lastCheckPoint = XLogCtl->lastCheckPoint;
    7804        1206 :     SpinLockRelease(&XLogCtl->info_lck);
    7805             : 
    7806             :     /*
    7807             :      * Check that we're still in recovery mode. It's ok if we exit recovery
    7808             :      * mode after this check, the restart point is valid anyway.
    7809             :      */
    7810        1206 :     if (!RecoveryInProgress())
    7811             :     {
    7812           0 :         ereport(DEBUG2,
    7813             :                 (errmsg_internal("skipping restartpoint, recovery has already ended")));
    7814           0 :         return false;
    7815             :     }
    7816             : 
    7817             :     /*
    7818             :      * If the last checkpoint record we've replayed is already our last
    7819             :      * restartpoint, we can't perform a new restart point. We still update
    7820             :      * minRecoveryPoint in that case, so that if this is a shutdown restart
    7821             :      * point, we won't start up earlier than before. That's not strictly
    7822             :      * necessary, but when hot standby is enabled, it would be rather weird if
    7823             :      * the database opened up for read-only connections at a point-in-time
    7824             :      * before the last shutdown. Such time travel is still possible in case of
    7825             :      * immediate shutdown, though.
    7826             :      *
    7827             :      * We don't explicitly advance minRecoveryPoint when we do create a
    7828             :      * restartpoint. It's assumed that flushing the buffers will do that as a
    7829             :      * side-effect.
    7830             :      */
    7831        1206 :     if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
    7832         542 :         lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
    7833             :     {
    7834         824 :         ereport(DEBUG2,
    7835             :                 (errmsg_internal("skipping restartpoint, already performed at %X/%X",
    7836             :                                  LSN_FORMAT_ARGS(lastCheckPoint.redo))));
    7837             : 
    7838         824 :         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
    7839         824 :         if (flags & CHECKPOINT_IS_SHUTDOWN)
    7840             :         {
    7841          68 :             LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    7842          68 :             ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
    7843          68 :             UpdateControlFile();
    7844          68 :             LWLockRelease(ControlFileLock);
    7845             :         }
    7846         824 :         return false;
    7847             :     }
    7848             : 
    7849             :     /*
    7850             :      * Update the shared RedoRecPtr so that the startup process can calculate
    7851             :      * the number of segments replayed since last restartpoint, and request a
    7852             :      * restartpoint if it exceeds CheckPointSegments.
    7853             :      *
    7854             :      * Like in CreateCheckPoint(), hold off insertions to update it, although
    7855             :      * during recovery this is just pro forma, because no WAL insertions are
    7856             :      * happening.
    7857             :      */
    7858         382 :     WALInsertLockAcquireExclusive();
    7859         382 :     RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
    7860         382 :     WALInsertLockRelease();
    7861             : 
    7862             :     /* Also update the info_lck-protected copy */
    7863         382 :     SpinLockAcquire(&XLogCtl->info_lck);
    7864         382 :     XLogCtl->RedoRecPtr = lastCheckPoint.redo;
    7865         382 :     SpinLockRelease(&XLogCtl->info_lck);
    7866             : 
    7867             :     /*
    7868             :      * Prepare to accumulate statistics.
    7869             :      *
    7870             :      * Note: because it is possible for log_checkpoints to change while a
    7871             :      * checkpoint proceeds, we always accumulate stats, even if
    7872             :      * log_checkpoints is currently off.
    7873             :      */
    7874        4202 :     MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
    7875         382 :     CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
    7876             : 
    7877         382 :     if (log_checkpoints)
    7878         382 :         LogCheckpointStart(flags, true);
    7879             : 
    7880             :     /* Update the process title */
    7881         382 :     update_checkpoint_display(flags, true, false);
    7882             : 
    7883         382 :     CheckPointGuts(lastCheckPoint.redo, flags);
    7884             : 
    7885             :     /*
    7886             :      * This location needs to be after CheckPointGuts() to ensure that some
    7887             :      * work has already happened during this checkpoint.
    7888             :      */
    7889         382 :     INJECTION_POINT("create-restart-point", NULL);
    7890             : 
    7891             :     /*
    7892             :      * Remember the prior checkpoint's redo ptr for
    7893             :      * UpdateCheckPointDistanceEstimate()
                     :      *
                     :      * NOTE(review): this reads ControlFile without holding ControlFileLock,
                     :      * as does the "already performed" test above.  Presumably that is fine
                     :      * because only this process (see the Assert at the top) advances
                     :      * checkPointCopy while a restartpoint is running -- confirm.
    7894             :      */
    7895         382 :     PriorRedoPtr = ControlFile->checkPointCopy.redo;
    7896             : 
    7897             :     /*
    7898             :      * Update pg_control, using current time.  Check that it still shows an
    7899             :      * older checkpoint, else do nothing; this is a quick hack to make sure
    7900             :      * nothing really bad happens if somehow we get here after the
    7901             :      * end-of-recovery checkpoint.
    7902             :      */
    7903         382 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    7904         382 :     if (ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
    7905             :     {
    7906             :         /*
    7907             :          * Update the checkpoint information.  We do this even if the cluster
    7908             :          * does not show DB_IN_ARCHIVE_RECOVERY to match with the set of WAL
    7909             :          * segments recycled below.
    7910             :          */
    7911         382 :         ControlFile->checkPoint = lastCheckPointRecPtr;
    7912         382 :         ControlFile->checkPointCopy = lastCheckPoint;
    7913             : 
    7914             :         /*
    7915             :          * Ensure minRecoveryPoint is past the checkpoint record and update it
    7916             :          * if the control file still shows DB_IN_ARCHIVE_RECOVERY.  Normally,
    7917             :          * this will have happened already while writing out dirty buffers,
    7918             :          * but not necessarily - e.g. because no buffers were dirtied.  We do
    7919             :          * this because a backup performed in recovery uses minRecoveryPoint
    7920             :          * to determine which WAL files must be included in the backup, and
    7921             :          * the file (or files) containing the checkpoint record must be
    7922             :          * included, at a minimum.  Note that for an ordinary restart of
    7923             :          * recovery there's no value in having the minimum recovery point any
    7924             :          * earlier than this anyway, because redo will begin just after the
    7925             :          * checkpoint record.
    7926             :          */
    7927         382 :         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
    7928             :         {
    7929         382 :             if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
    7930             :             {
    7931          40 :                 ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
    7932          40 :                 ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
    7933             : 
    7934             :                 /* update local copy */
    7935          40 :                 LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    7936          40 :                 LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    7937             :             }
    7938         382 :             if (flags & CHECKPOINT_IS_SHUTDOWN)
    7939          42 :                 ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
    7940             :         }
    7941         382 :         UpdateControlFile();
    7942             :     }
    7943         382 :     LWLockRelease(ControlFileLock);
    7944             : 
    7945             :     /*
    7946             :      * Update the average distance between checkpoints/restartpoints if the
    7947             :      * prior checkpoint exists.
    7948             :      */
    7949         382 :     if (PriorRedoPtr != InvalidXLogRecPtr)
    7950         382 :         UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
    7951             : 
    7952             :     /*
    7953             :      * Delete old log files, those no longer needed for last restartpoint to
    7954             :      * prevent the disk holding the xlog from growing full.
    7955             :      */
    7956         382 :     XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
    7957             : 
    7958             :     /*
    7959             :      * Retreat _logSegNo using the current end of xlog replayed or received,
    7960             :      * whichever is later.
    7961             :      */
    7962         382 :     receivePtr = GetWalRcvFlushRecPtr(NULL, NULL);
    7963         382 :     replayPtr = GetXLogReplayRecPtr(&replayTLI);
    7964         382 :     endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
    7965         382 :     KeepLogSeg(endptr, &_logSegNo);
    7966         382 :     if (InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_REMOVED | RS_INVAL_IDLE_TIMEOUT,
    7967             :                                            _logSegNo, InvalidOid,
    7968             :                                            InvalidTransactionId))
    7969             :     {
    7970             :         /*
    7971             :          * Some slots have been invalidated; recalculate the old-segment
    7972             :          * horizon, starting again from RedoRecPtr.
    7973             :          */
    7974           2 :         XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
    7975           2 :         KeepLogSeg(endptr, &_logSegNo);
    7976             :     }
    7977         382 :     _logSegNo--;
    7978             : 
    7979             :     /*
    7980             :      * Try to recycle segments on a useful timeline. If we've been promoted
    7981             :      * since the beginning of this restartpoint, use the new timeline chosen
    7982             :      * at end of recovery.  If we're still in recovery, use the timeline we're
    7983             :      * currently replaying.
    7984             :      *
    7985             :      * There is no guarantee that the WAL segments will be useful on the
    7986             :      * current timeline; if recovery proceeds to a new timeline right after
    7987             :      * this, the pre-allocated WAL segments on this timeline will not be used,
    7988             :      * and will go wasted until recycled on the next restartpoint. We'll live
    7989             :      * with that.
    7990             :      */
    7991         382 :     if (!RecoveryInProgress())
    7992           0 :         replayTLI = XLogCtl->InsertTimeLineID;
    7993             : 
    7994         382 :     RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr, replayTLI);
    7995             : 
    7996             :     /*
    7997             :      * Make more log segments if needed.  (Do this after recycling old log
    7998             :      * segments, since that may supply some of the needed files.)
    7999             :      */
    8000         382 :     PreallocXlogFiles(endptr, replayTLI);
    8001             : 
    8002             :     /*
    8003             :      * Truncate pg_subtrans if possible.  We can throw away all data before
    8004             :      * the oldest XMIN of any running transaction.  No future transaction will
    8005             :      * attempt to reference any pg_subtrans entry older than that (see Asserts
    8006             :      * in subtrans.c).  When hot standby is disabled, though, we mustn't do
    8007             :      * this because StartupSUBTRANS hasn't been called yet.
    8008             :      */
    8009         382 :     if (EnableHotStandby)
    8010         382 :         TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
    8011             : 
    8012             :     /* Real work is done; log and update stats. */
    8013         382 :     LogCheckpointEnd(true);
    8014             : 
    8015             :     /* Reset the process title */
    8016         382 :     update_checkpoint_display(flags, true, true);
    8017             : 
    8018         382 :     xtime = GetLatestXTime();
    8019         382 :     ereport((log_checkpoints ? LOG : DEBUG2),
    8020             :             (errmsg("recovery restart point at %X/%X",
    8021             :                     LSN_FORMAT_ARGS(lastCheckPoint.redo)),
    8022             :              xtime ? errdetail("Last completed transaction was at log time %s.",
    8023             :                                timestamptz_to_str(xtime)) : 0));
    8024             : 
    8025             :     /*
    8026             :      * Finally, execute archive_cleanup_command, if any.
    8027             :      */
    8028         382 :     if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != 0)
    8029           0 :         ExecuteRecoveryCommand(archiveCleanupCommand,
    8030             :                                "archive_cleanup_command",
    8031             :                                false,
    8032             :                                WAIT_EVENT_ARCHIVE_CLEANUP_COMMAND);
    8033             : 
    8034         382 :     return true;
    8035             : }
    8036             : 
    8037             : /*
    8038             :  * Report availability of WAL for the given target LSN
    8039             :  *      (typically a slot's restart_lsn)
    8040             :  *
    8041             :  * Returns one of the following enum values:
    8042             :  *
    8043             :  * * WALAVAIL_RESERVED means targetLSN is available and it is in the range of
    8044             :  *   max_wal_size.
    8045             :  *
    8046             :  * * WALAVAIL_EXTENDED means it is still available by preserving extra
    8047             :  *   segments beyond max_wal_size. If max_slot_wal_keep_size is smaller
    8048             :  *   than max_wal_size, this state is not returned.
    8049             :  *
    8050             :  * * WALAVAIL_UNRESERVED means it is no longer retained but not yet removed;
    8051             :  *   the next checkpoint will remove such segments. The walsender using this
    8052             :  *   slot may still bring it back to one of the states above.
    8053             :  *
    8054             :  * * WALAVAIL_REMOVED means it has been removed. A replication stream on
    8055             :  *   a slot with this LSN cannot continue.  (Any associated walsender
    8056             :  *   processes should have been terminated already.)
    8057             :  *
    8058             :  * * WALAVAIL_INVALID_LSN means the slot hasn't been set to reserve WAL.
    8059             :  */
    8060             : WALAvailability
    8061         958 : GetWALAvailability(XLogRecPtr targetLSN)
    8062             : {
    8063             :     XLogRecPtr  currpos;        /* current write LSN */
    8064             :     XLogSegNo   currSeg;        /* segid of currpos */
    8065             :     XLogSegNo   targetSeg;      /* segid of targetLSN */
    8066             :     XLogSegNo   oldestSeg;      /* actual oldest segid */
    8067             :     XLogSegNo   oldestSegMaxWalSize;    /* oldest segid kept by max_wal_size */
    8068             :     XLogSegNo   oldestSlotSeg;  /* oldest segid kept by slot */
    8069             :     uint64      keepSegs;
    8070             : 
    8071             :     /*
    8072             :      * The slot does not reserve WAL: either deactivated or never active.
    8073             :      */
    8074         958 :     if (XLogRecPtrIsInvalid(targetLSN))
    8075          34 :         return WALAVAIL_INVALID_LSN;
    8076             : 
    8077             :     /*
    8078             :      * Calculate the oldest segment currently reserved by all slots,
    8079             :      * considering wal_keep_size and max_slot_wal_keep_size.  Initialize
    8080             :      * oldestSlotSeg to the current segment.
    8081             :      */
    8082         924 :     currpos = GetXLogWriteRecPtr();
    8083         924 :     XLByteToSeg(currpos, oldestSlotSeg, wal_segment_size);
    8084         924 :     KeepLogSeg(currpos, &oldestSlotSeg);
    8085             : 
    8086             :     /*
    8087             :      * Find the oldest extant segment file.  Until a checkpoint removes the
    8088             :      * first WAL segment file after startup we get 1 here, which can make the
    8089             :      * status wrong under certain abnormal conditions, but that is harmless.
    8090             :      */
    8091         924 :     oldestSeg = XLogGetLastRemovedSegno() + 1;
    8092             : 
    8093             :     /* calculate the oldest segment that max_wal_size alone would keep */
    8094         924 :     XLByteToSeg(currpos, currSeg, wal_segment_size);
    8095         924 :     keepSegs = ConvertToXSegs(max_wal_size_mb, wal_segment_size) + 1;
    8096             : 
    8097         924 :     if (currSeg > keepSegs)
    8098          16 :         oldestSegMaxWalSize = currSeg - keepSegs;
    8099             :     else
    8100         908 :         oldestSegMaxWalSize = 1;
    8101             : 
    8102             :     /* the segment we care about */
    8103         924 :     XLByteToSeg(targetLSN, targetSeg, wal_segment_size);
    8104             : 
    8105             :     /*
    8106             :      * No point in returning reserved or extended status values if the
    8107             :      * targetSeg is known to be lost.
    8108             :      */
    8109         924 :     if (targetSeg >= oldestSlotSeg)
    8110             :     {
    8111             :         /* show "reserved" when targetSeg is within max_wal_size */
    8112         922 :         if (targetSeg >= oldestSegMaxWalSize)
    8113         918 :             return WALAVAIL_RESERVED;
    8114             : 
    8115             :         /* being retained by slots exceeding max_wal_size */
    8116           4 :         return WALAVAIL_EXTENDED;
    8117             :     }
    8118             : 
    8119             :     /* WAL segments are no longer retained but haven't been removed yet */
    8120           2 :     if (targetSeg >= oldestSeg)
    8121           2 :         return WALAVAIL_UNRESERVED;
    8122             : 
    8123             :     /* Definitely lost */
    8124           0 :     return WALAVAIL_REMOVED;
    8125             : }
    8126             : 
    8127             : 
    8128             : /*
    8129             :  * Retreat *logSegNo to the last segment that we need to retain because of
    8130             :  * wal_keep_size, replication slots, or WAL that has yet to be summarized.
    8131             :  *
    8132             :  * This is calculated by subtracting wal_keep_size from the given xlog
    8133             :  * location, recptr, and by making sure that the result does not exceed the
    8134             :  * requirement of replication slots.  For the latter criterion we do consider
    8135             :  * the effects of max_slot_wal_keep_size: reserve at most that much space back
    8136             :  * from recptr.
    8137             :  *
    8138             :  * Note about replication slots: if this function calculates a value
    8139             :  * that's further ahead than what slots need reserved, then affected
    8140             :  * slots need to be invalidated and this function invoked again.
    8141             :  * XXX it might be a good idea to rewrite this function so that
    8142             :  * invalidation is optionally done here, instead.
    8143             :  */
    8144             : static void
    8145        4300 : KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
    8146             : {
    8147             :     XLogSegNo   currSegNo;
    8148             :     XLogSegNo   segno;
    8149             :     XLogRecPtr  keep;
    8150             : 
    8151        4300 :     XLByteToSeg(recptr, currSegNo, wal_segment_size);
    8152        4300 :     segno = currSegNo;
    8153             : 
    8154             :     /*
    8155             :      * Calculate how many segments are kept by slots first, adjusting for
    8156             :      * max_slot_wal_keep_size.
    8157             :      */
    8158        4300 :     keep = XLogGetReplicationSlotMinimumLSN();
    8159        4300 :     if (keep != InvalidXLogRecPtr && keep < recptr)
    8160             :     {
    8161        1184 :         XLByteToSeg(keep, segno, wal_segment_size);
    8162             : 
    8163             :         /* Cap by max_slot_wal_keep_size ... */
    8164        1184 :         if (max_slot_wal_keep_size_mb >= 0)
    8165             :         {
    8166             :             uint64      slot_keep_segs;
    8167             : 
    8168          42 :             slot_keep_segs =
    8169          42 :                 ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size);
    8170             : 
    8171          42 :             if (currSegNo - segno > slot_keep_segs)
    8172          10 :                 segno = currSegNo - slot_keep_segs;
    8173             :         }
    8174             :     }
    8175             : 
    8176             :     /*
    8177             :      * If WAL summarization is in use, don't remove WAL that has yet to be
    8178             :      * summarized.
    8179             :      */
    8180        4300 :     keep = GetOldestUnsummarizedLSN(NULL, NULL);
    8181        4300 :     if (keep != InvalidXLogRecPtr)
    8182             :     {
    8183             :         XLogSegNo   unsummarized_segno;
    8184             : 
    8185           4 :         XLByteToSeg(keep, unsummarized_segno, wal_segment_size);
    8186           4 :         if (unsummarized_segno < segno)
    8187           4 :             segno = unsummarized_segno;
    8188             :     }
    8189             : 
    8190             :     /* but, keep at least wal_keep_size if that's set */
    8191        4300 :     if (wal_keep_size_mb > 0)
    8192             :     {
    8193             :         uint64      keep_segs;
    8194             : 
    8195         138 :         keep_segs = ConvertToXSegs(wal_keep_size_mb, wal_segment_size);
    8196         138 :         if (currSegNo - segno < keep_segs)
    8197             :         {
    8198             :             /* avoid underflow, don't go below 1 */
    8199         138 :             if (currSegNo <= keep_segs)
    8200         130 :                 segno = 1;
    8201             :             else
    8202           8 :                 segno = currSegNo - keep_segs;
    8203             :         }
    8204             :     }
    8205             : 
    8206             :     /* only ever retreat *logSegNo; never advance it past the caller's value */
    8207        4300 :     if (segno < *logSegNo)
    8208         666 :         *logSegNo = segno;
    8209        4300 : }
    8210             : 
    8211             : /*
    8212             :  * Write a NEXTOID log record carrying nextOid (inserted but not flushed)
    8213             :  */
    8214             : void
    8215        1190 : XLogPutNextOid(Oid nextOid)
    8216             : {
    8217        1190 :     XLogBeginInsert();
    8218        1190 :     XLogRegisterData(&nextOid, sizeof(Oid));
    8219        1190 :     (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
    8220             : 
    8221             :     /*
    8222             :      * We need not flush the NEXTOID record immediately, because any of the
    8223             :      * just-allocated OIDs could only reach disk as part of a tuple insert or
    8224             :      * update that would have its own XLOG record that must follow the NEXTOID
    8225             :      * record.  Therefore, the standard buffer LSN interlock applied to those
    8226             :      * records will ensure no such OID reaches disk before the NEXTOID record
    8227             :      * does.
    8228             :      *
    8229             :      * Note, however, that the above statement only covers state "within" the
    8230             :      * database.  When we use a generated OID as a file or directory name, we
    8231             :      * are in a sense violating the basic WAL rule, because that filesystem
    8232             :      * change may reach disk before the NEXTOID WAL record does.  The impact
    8233             :      * of this is that if a database crash occurs immediately afterward, we
    8234             :      * might after restart re-generate the same OID and find that it conflicts
    8235             :      * with the leftover file or directory.  But since for safety's sake we
    8236             :      * always loop until finding a nonconflicting filename, this poses no real
    8237             :      * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
    8238             :      */
    8239        1190 : }
    8240             : 
    8241             : /*
    8242             :  * Write an XLOG SWITCH record.
    8243             :  *
    8244             :  * We just blindly issue an XLogInsert request, flagging the record with
    8245             :  * XLOG_MARK_UNIMPORTANT if asked; all the magic happens inside XLogInsert.
    8246             :  *
    8247             :  * The return value is either the end+1 address of the switch record,
    8248             :  * or the end+1 address of the prior segment if we did not need to
    8249             :  * write a switch record because we are already at segment start.
    8250             :  */
    8251             : XLogRecPtr
    8252        1470 : RequestXLogSwitch(bool mark_unimportant)
    8253             : {
    8254             :     XLogRecPtr  RecPtr;
    8255             : 
    8256             :     /* XLOG SWITCH has no data */
    8257        1470 :     XLogBeginInsert();
    8258             : 
    8259        1470 :     if (mark_unimportant)
    8260           0 :         XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
    8261        1470 :     RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
    8262             : 
    8263        1470 :     return RecPtr;
    8264             : }
    8265             : 
    8266             : /*
    8267             :  * Write a RESTORE POINT record for rpName; returns what XLogInsert returned
    8268             :  */
    8269             : XLogRecPtr
    8270           6 : XLogRestorePoint(const char *rpName)
    8271             : {
    8272             :     XLogRecPtr  RecPtr;
    8273             :     xl_restore_point xlrec;
    8274             : 
    8275           6 :     xlrec.rp_time = GetCurrentTimestamp();
    8276           6 :     strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
    8277             : 
    8278           6 :     XLogBeginInsert();
    8279           6 :     XLogRegisterData(&xlrec, sizeof(xl_restore_point));
    8280             : 
    8281           6 :     RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
    8282             : 
    8283           6 :     ereport(LOG,
    8284             :             (errmsg("restore point \"%s\" created at %X/%X",
    8285             :                     rpName, LSN_FORMAT_ARGS(RecPtr))));
    8286             : 
    8287           6 :     return RecPtr;
    8288             : }
    8289             : 
    8290             : /*
    8291             :  * Check if any of the GUC parameters that are critical for hot standby
    8292             :  * have changed; if so, WAL-log them when needed and update pg_control.
    8293             :  */
    8294             : static void
    8295        1728 : XLogReportParameters(void)
    8296             : {
    8297        1728 :     if (wal_level != ControlFile->wal_level ||
    8298        1252 :         wal_log_hints != ControlFile->wal_log_hints ||
    8299        1090 :         MaxConnections != ControlFile->MaxConnections ||
    8300        1088 :         max_worker_processes != ControlFile->max_worker_processes ||
    8301        1086 :         max_wal_senders != ControlFile->max_wal_senders ||
    8302        1040 :         max_prepared_xacts != ControlFile->max_prepared_xacts ||
    8303         860 :         max_locks_per_xact != ControlFile->max_locks_per_xact ||
    8304         860 :         track_commit_timestamp != ControlFile->track_commit_timestamp)
    8305             :     {
    8306             :         /*
    8307             :          * The change in number of backend slots doesn't need to be WAL-logged
    8308             :          * if archiving is not enabled, as you can't start archive recovery
    8309             :          * with wal_level=minimal anyway. We don't really care about the
    8310             :          * values in pg_control either if wal_level=minimal, but seems better
    8311             :          * to keep them up-to-date to avoid confusion.
    8312             :          */
    8313         886 :         if (wal_level != ControlFile->wal_level || XLogIsNeeded())
    8314             :         {
    8315             :             xl_parameter_change xlrec;
    8316             :             XLogRecPtr  recptr;
    8317             : 
    8318         842 :             xlrec.MaxConnections = MaxConnections;
    8319         842 :             xlrec.max_worker_processes = max_worker_processes;
    8320         842 :             xlrec.max_wal_senders = max_wal_senders;
    8321         842 :             xlrec.max_prepared_xacts = max_prepared_xacts;
    8322         842 :             xlrec.max_locks_per_xact = max_locks_per_xact;
    8323         842 :             xlrec.wal_level = wal_level;
    8324         842 :             xlrec.wal_log_hints = wal_log_hints;
    8325         842 :             xlrec.track_commit_timestamp = track_commit_timestamp;
    8326             : 
    8327         842 :             XLogBeginInsert();
    8328         842 :             XLogRegisterData(&xlrec, sizeof(xlrec));
    8329             : 
    8330         842 :             recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
    8331         842 :             XLogFlush(recptr);
    8332             :         }
    8333             : 
    8334         886 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    8335             : 
    8336         886 :         ControlFile->MaxConnections = MaxConnections;
    8337         886 :         ControlFile->max_worker_processes = max_worker_processes;
    8338         886 :         ControlFile->max_wal_senders = max_wal_senders;
    8339         886 :         ControlFile->max_prepared_xacts = max_prepared_xacts;
    8340         886 :         ControlFile->max_locks_per_xact = max_locks_per_xact;
    8341         886 :         ControlFile->wal_level = wal_level;
    8342         886 :         ControlFile->wal_log_hints = wal_log_hints;
    8343         886 :         ControlFile->track_commit_timestamp = track_commit_timestamp;
    8344         886 :         UpdateControlFile();
    8345             : 
    8346         886 :         LWLockRelease(ControlFileLock);
    8347             :     }
    8348        1728 : }
    8349             : 
    8350             : /*
    8351             :  * Update full_page_writes in shared memory, and write an
    8352             :  * XLOG_FPW_CHANGE record if necessary.
    8353             :  *
    8354             :  * Note: this function assumes there is no other process running
    8355             :  * concurrently that could update it.
    8356             :  */
    8357             : void
    8358        2870 : UpdateFullPageWrites(void)
    8359             : {
    8360        2870 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    8361             :     bool        recoveryInProgress;
    8362             : 
    8363             :     /*
    8364             :      * Do nothing if full_page_writes has not been changed.
    8365             :      *
    8366             :      * It's safe to check the shared full_page_writes without the lock,
    8367             :      * because we assume that there is no concurrently running process which
    8368             :      * can update it.
    8369             :      */
    8370        2870 :     if (fullPageWrites == Insert->fullPageWrites)
    8371        2172 :         return;
    8372             : 
    8373             :     /*
    8374             :      * Perform this outside critical section so that the WAL insert
    8375             :      * initialization done by RecoveryInProgress() doesn't trigger an
    8376             :      * assertion failure.
    8377             :      */
    8378         698 :     recoveryInProgress = RecoveryInProgress();
    8379             : 
    8380         698 :     START_CRIT_SECTION();
    8381             : 
    8382             :     /*
    8383             :      * It's always safe to take full page images, even when not strictly
    8384             :      * required, but not the other way round. So if we're setting
    8385             :      * full_page_writes to true, first set it true and then write the WAL
    8386             :      * record. If we're setting it to false, first write the WAL record and
    8387             :      * then set the global flag.
    8388             :      */
    8389         698 :     if (fullPageWrites)
    8390             :     {
    8391         678 :         WALInsertLockAcquireExclusive();
    8392         678 :         Insert->fullPageWrites = true;
    8393         678 :         WALInsertLockRelease();
    8394             :     }
    8395             : 
    8396             :     /*
    8397             :      * Write an XLOG_FPW_CHANGE record (only if XLogStandbyInfoActive() and not
    8398             :      * in recovery), so full_page_writes can be tracked in archive recovery.
    8399             :      */
    8400         698 :     if (XLogStandbyInfoActive() && !recoveryInProgress)
    8401             :     {
    8402           0 :         XLogBeginInsert();
    8403           0 :         XLogRegisterData(&fullPageWrites, sizeof(bool));
    8404             : 
    8405           0 :         XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
    8406             :     }
    8407             : 
    8408         698 :     if (!fullPageWrites)
    8409             :     {
    8410          20 :         WALInsertLockAcquireExclusive();
    8411          20 :         Insert->fullPageWrites = false;
    8412          20 :         WALInsertLockRelease();
    8413             :     }
    8414         698 :     END_CRIT_SECTION();
    8415             : }
    8416             : 
    8417             : /*
    8418             :  * XLOG resource manager's routines
    8419             :  *
    8420             :  * Definitions of info values are in include/catalog/pg_control.h, though
    8421             :  * not all record types are related to control file updates.
    8422             :  *
    8423             :  * NOTE: Some XLOG record types that are directly related to WAL recovery
    8424             :  * are handled in xlogrecovery_redo().
    8425             :  */
    8426             : void
    8427       86586 : xlog_redo(XLogReaderState *record)
    8428             : {
    8429       86586 :     uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    8430       86586 :     XLogRecPtr  lsn = record->EndRecPtr;
    8431             : 
    8432             :     /*
    8433             :      * In XLOG rmgr, backup blocks are only used by XLOG_FPI and
    8434             :      * XLOG_FPI_FOR_HINT records.
    8435             :      */
    8436             :     Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
    8437             :            !XLogRecHasAnyBlockRefs(record));
    8438             : 
    8439       86586 :     if (info == XLOG_NEXTOID)
    8440             :     {
    8441             :         Oid         nextOid;
    8442             : 
    8443             :         /*
    8444             :          * We used to try to take the maximum of TransamVariables->nextOid and
    8445             :          * the recorded nextOid, but that fails if the OID counter wraps
    8446             :          * around.  Since no OID allocation should be happening during replay
    8447             :          * anyway, better to just believe the record exactly.  We still take
    8448             :          * OidGenLock while setting the variable, just in case.
    8449             :          */
    8450         186 :         memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
    8451         186 :         LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
    8452         186 :         TransamVariables->nextOid = nextOid;
    8453         186 :         TransamVariables->oidCount = 0;
    8454         186 :         LWLockRelease(OidGenLock);
    8455             :     }
    8456       86400 :     else if (info == XLOG_CHECKPOINT_SHUTDOWN)
    8457             :     {
    8458             :         CheckPoint  checkPoint;
    8459             :         TimeLineID  replayTLI;
    8460             : 
    8461          70 :         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
    8462             :         /* In a SHUTDOWN checkpoint, believe the counters exactly */
    8463          70 :         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
    8464          70 :         TransamVariables->nextXid = checkPoint.nextXid;
    8465          70 :         LWLockRelease(XidGenLock);
    8466          70 :         LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
    8467          70 :         TransamVariables->nextOid = checkPoint.nextOid;
    8468          70 :         TransamVariables->oidCount = 0;
    8469          70 :         LWLockRelease(OidGenLock);
    8470          70 :         MultiXactSetNextMXact(checkPoint.nextMulti,
    8471             :                               checkPoint.nextMultiOffset);
    8472             : 
    8473          70 :         MultiXactAdvanceOldest(checkPoint.oldestMulti,
    8474             :                                checkPoint.oldestMultiDB);
    8475             : 
    8476             :         /*
    8477             :          * No need to set oldestClogXid here as well; it'll be set when we
    8478             :          * redo an xl_clog_truncate if it changed since initialization.
    8479             :          */
    8480          70 :         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
    8481             : 
    8482             :         /*
    8483             :          * If we see a shutdown checkpoint while waiting for an end-of-backup
    8484             :          * record, the backup was canceled and the end-of-backup record will
    8485             :          * never arrive.
    8486             :          */
    8487          70 :         if (ArchiveRecoveryRequested &&
    8488          68 :             !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
    8489           0 :             XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
    8490           0 :             ereport(PANIC,
    8491             :                     (errmsg("online backup was canceled, recovery cannot continue")));
    8492             : 
    8493             :         /*
    8494             :          * If we see a shutdown checkpoint, we know that nothing was running
    8495             :          * on the primary at this point. So fake-up an empty running-xacts
    8496             :          * record and use that here and now. Recover additional standby state
    8497             :          * for prepared transactions.
    8498             :          */
    8499          70 :         if (standbyState >= STANDBY_INITIALIZED)
    8500             :         {
    8501             :             TransactionId *xids;
    8502             :             int         nxids;
    8503             :             TransactionId oldestActiveXID;
    8504             :             TransactionId latestCompletedXid;
    8505             :             RunningTransactionsData running;
    8506             : 
    8507          64 :             oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
    8508             : 
    8509             :             /* Update pg_subtrans entries for any prepared transactions */
    8510          64 :             StandbyRecoverPreparedTransactions();
    8511             : 
    8512             :             /*
    8513             :              * Construct a RunningTransactions snapshot representing a shut
    8514             :              * down server, with only prepared transactions still alive. We're
    8515             :              * never overflowed at this point because all subxids are listed
    8516             :              * with their parent prepared transactions.
    8517             :              */
    8518          64 :             running.xcnt = nxids;
    8519          64 :             running.subxcnt = 0;
    8520          64 :             running.subxid_status = SUBXIDS_IN_SUBTRANS;
    8521          64 :             running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
    8522          64 :             running.oldestRunningXid = oldestActiveXID;
    8523          64 :             latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
    8524          64 :             TransactionIdRetreat(latestCompletedXid);
    8525             :             Assert(TransactionIdIsNormal(latestCompletedXid));
    8526          64 :             running.latestCompletedXid = latestCompletedXid;
    8527          64 :             running.xids = xids;
    8528             : 
    8529          64 :             ProcArrayApplyRecoveryInfo(&running);
    8530             :         }
    8531             : 
    8532             :         /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
    8533          70 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    8534          70 :         ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
    8535          70 :         LWLockRelease(ControlFileLock);
    8536             : 
    8537             :         /* Update shared-memory copy of checkpoint XID/epoch */
    8538          70 :         SpinLockAcquire(&XLogCtl->info_lck);
    8539          70 :         XLogCtl->ckptFullXid = checkPoint.nextXid;
    8540          70 :         SpinLockRelease(&XLogCtl->info_lck);
    8541             : 
    8542             :         /*
    8543             :          * We should've already switched to the new TLI before replaying this
    8544             :          * record.
    8545             :          */
    8546          70 :         (void) GetCurrentReplayRecPtr(&replayTLI);
    8547          70 :         if (checkPoint.ThisTimeLineID != replayTLI)
    8548           0 :             ereport(PANIC,
    8549             :                     (errmsg("unexpected timeline ID %u (should be %u) in shutdown checkpoint record",
    8550             :                             checkPoint.ThisTimeLineID, replayTLI)));
    8551             : 
    8552          70 :         RecoveryRestartPoint(&checkPoint, record);
    8553             :     }
    8554       86330 :     else if (info == XLOG_CHECKPOINT_ONLINE)
    8555             :     {
    8556             :         CheckPoint  checkPoint;
    8557             :         TimeLineID  replayTLI;
    8558             : 
    8559        1344 :         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
    8560             :         /* In an ONLINE checkpoint, treat the XID counter as a minimum */
    8561        1344 :         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
    8562        1344 :         if (FullTransactionIdPrecedes(TransamVariables->nextXid,
    8563             :                                       checkPoint.nextXid))
    8564           0 :             TransamVariables->nextXid = checkPoint.nextXid;
    8565        1344 :         LWLockRelease(XidGenLock);
    8566             : 
    8567             :         /*
    8568             :          * We ignore the nextOid counter in an ONLINE checkpoint, preferring
    8569             :          * to track OID assignment through XLOG_NEXTOID records.  The nextOid
    8570             :          * counter is from the start of the checkpoint and might well be stale
    8571             :          * compared to later XLOG_NEXTOID records.  We could try to take the
    8572             :          * maximum of the nextOid counter and our latest value, but since
    8573             :          * there's no particular guarantee about the speed with which the OID
    8574             :          * counter wraps around, that's a risky thing to do.  In any case,
    8575             :          * users of the nextOid counter are required to avoid assignment of
    8576             :          * duplicates, so that a somewhat out-of-date value should be safe.
    8577             :          */
    8578             : 
    8579             :         /* Handle multixact */
    8580        1344 :         MultiXactAdvanceNextMXact(checkPoint.nextMulti,
    8581             :                                   checkPoint.nextMultiOffset);
    8582             : 
    8583             :         /*
    8584             :          * NB: This may perform multixact truncation when replaying WAL
    8585             :          * generated by an older primary.
    8586             :          */
    8587        1344 :         MultiXactAdvanceOldest(checkPoint.oldestMulti,
    8588             :                                checkPoint.oldestMultiDB);
    8589        1344 :         if (TransactionIdPrecedes(TransamVariables->oldestXid,
    8590             :                                   checkPoint.oldestXid))
    8591           0 :             SetTransactionIdLimit(checkPoint.oldestXid,
    8592             :                                   checkPoint.oldestXidDB);
    8593             :         /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
    8594        1344 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    8595        1344 :         ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
    8596        1344 :         LWLockRelease(ControlFileLock);
    8597             : 
    8598             :         /* Update shared-memory copy of checkpoint XID/epoch */
    8599        1344 :         SpinLockAcquire(&XLogCtl->info_lck);
    8600        1344 :         XLogCtl->ckptFullXid = checkPoint.nextXid;
    8601        1344 :         SpinLockRelease(&XLogCtl->info_lck);
    8602             : 
    8603             :         /* TLI should not change in an on-line checkpoint */
    8604        1344 :         (void) GetCurrentReplayRecPtr(&replayTLI);
    8605        1344 :         if (checkPoint.ThisTimeLineID != replayTLI)
    8606           0 :             ereport(PANIC,
    8607             :                     (errmsg("unexpected timeline ID %u (should be %u) in online checkpoint record",
    8608             :                             checkPoint.ThisTimeLineID, replayTLI)));
    8609             : 
    8610        1344 :         RecoveryRestartPoint(&checkPoint, record);
    8611             :     }
    8612       84986 :     else if (info == XLOG_OVERWRITE_CONTRECORD)
    8613             :     {
    8614             :         /* nothing to do here, handled in xlogrecovery_redo() */
    8615             :     }
    8616       84984 :     else if (info == XLOG_END_OF_RECOVERY)
    8617             :     {
    8618             :         xl_end_of_recovery xlrec;
    8619             :         TimeLineID  replayTLI;
    8620             : 
    8621          20 :         memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
    8622             : 
    8623             :         /*
    8624             :          * For Hot Standby, we could treat this like a Shutdown Checkpoint,
    8625             :          * but this case is rarer and harder to test, so the benefit doesn't
    8626             :          * outweigh the potential extra cost of maintenance.
    8627             :          */
    8628             : 
    8629             :         /*
    8630             :          * We should've already switched to the new TLI before replaying this
    8631             :          * record.
    8632             :          */
    8633          20 :         (void) GetCurrentReplayRecPtr(&replayTLI);
    8634          20 :         if (xlrec.ThisTimeLineID != replayTLI)
    8635           0 :             ereport(PANIC,
    8636             :                     (errmsg("unexpected timeline ID %u (should be %u) in end-of-recovery record",
    8637             :                             xlrec.ThisTimeLineID, replayTLI)));
    8638             :     }
    8639       84964 :     else if (info == XLOG_NOOP)
    8640             :     {
    8641             :         /* nothing to do here */
    8642             :     }
    8643       84964 :     else if (info == XLOG_SWITCH)
    8644             :     {
    8645             :         /* nothing to do here */
    8646             :     }
    8647       84080 :     else if (info == XLOG_RESTORE_POINT)
    8648             :     {
    8649             :         /* nothing to do here, handled in xlogrecovery.c */
    8650             :     }
    8651       84070 :     else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
    8652             :     {
    8653             :         /*
    8654             :          * XLOG_FPI records contain nothing else but one or more block
    8655             :          * references. Every block reference must include a full-page image
    8656             :          * even if full_page_writes was disabled when the record was generated
    8657             :          * - otherwise there would be no point in this record.
    8658             :          *
    8659             :          * XLOG_FPI_FOR_HINT records are generated when a page needs to be
    8660             :          * WAL-logged because of a hint bit update. They are only generated
    8661             :          * when checksums and/or wal_log_hints are enabled. They may include
    8662             :          * no full-page images if full_page_writes was disabled when they were
    8663             :          * generated. In this case there is nothing to do here.
    8664             :          *
    8665             :          * No recovery conflicts are generated by these generic records - if a
    8666             :          * resource manager needs to generate conflicts, it has to define a
    8667             :          * separate WAL record type and redo routine.
    8668             :          */
    8669      174590 :         for (uint8 block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
    8670             :         {
    8671             :             Buffer      buffer;
    8672             : 
    8673       92102 :             if (!XLogRecHasBlockImage(record, block_id))
    8674             :             {
    8675         136 :                 if (info == XLOG_FPI)
    8676           0 :                     elog(ERROR, "XLOG_FPI record did not contain a full-page image");
    8677         136 :                 continue;
    8678             :             }
    8679             : 
    8680       91966 :             if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
    8681           0 :                 elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
    8682       91966 :             UnlockReleaseBuffer(buffer);
    8683             :         }
    8684             :     }
    8685        1582 :     else if (info == XLOG_BACKUP_END)
    8686             :     {
    8687             :         /* nothing to do here, handled in xlogrecovery_redo() */
    8688             :     }
    8689        1412 :     else if (info == XLOG_PARAMETER_CHANGE)
    8690             :     {
    8691             :         xl_parameter_change xlrec;
    8692             : 
    8693             :         /* Update our copy of the parameters in pg_control */
    8694          66 :         memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
    8695             : 
    8696             :         /*
    8697             :          * Invalidate logical slots if we are in hot standby and the primary
    8698             :          * does not have a WAL level sufficient for logical decoding. No need
    8699             :          * to search for potentially conflicting logical slots if standby is
    8700             :          * running with wal_level lower than logical, because in that case, we
    8701             :          * would have either disallowed creation of logical slots or
    8702             :          * invalidated existing ones.
    8703             :          */
    8704          66 :         if (InRecovery && InHotStandby &&
    8705          36 :             xlrec.wal_level < WAL_LEVEL_LOGICAL &&
    8706          14 :             wal_level >= WAL_LEVEL_LOGICAL)
    8707           8 :             InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_LEVEL,
    8708             :                                                0, InvalidOid,
    8709             :                                                InvalidTransactionId);
    8710             : 
    8711          66 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    8712          66 :         ControlFile->MaxConnections = xlrec.MaxConnections;
    8713          66 :         ControlFile->max_worker_processes = xlrec.max_worker_processes;
    8714          66 :         ControlFile->max_wal_senders = xlrec.max_wal_senders;
    8715          66 :         ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
    8716          66 :         ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
    8717          66 :         ControlFile->wal_level = xlrec.wal_level;
    8718          66 :         ControlFile->wal_log_hints = xlrec.wal_log_hints;
    8719             : 
    8720             :         /*
    8721             :          * Update minRecoveryPoint to ensure that if recovery is aborted, we
    8722             :          * recover back up to this point before allowing hot standby again.
    8723             :          * This is important if the max_* settings are decreased, to ensure
    8724             :          * you don't run queries against the WAL preceding the change. The
    8725             :          * local copies cannot be updated as long as crash recovery is
    8726             :          * happening and we expect all the WAL to be replayed.
    8727             :          */
    8728          66 :         if (InArchiveRecovery)
    8729             :         {
    8730          38 :             LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    8731          38 :             LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    8732             :         }
    8733          66 :         if (LocalMinRecoveryPoint != InvalidXLogRecPtr && LocalMinRecoveryPoint < lsn)
    8734             :         {
    8735             :             TimeLineID  replayTLI;
    8736             : 
    8737          16 :             (void) GetCurrentReplayRecPtr(&replayTLI);
    8738          16 :             ControlFile->minRecoveryPoint = lsn;
    8739          16 :             ControlFile->minRecoveryPointTLI = replayTLI;
    8740             :         }
    8741             : 
    8742          66 :         CommitTsParameterChange(xlrec.track_commit_timestamp,
    8743          66 :                                 ControlFile->track_commit_timestamp);
    8744          66 :         ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
    8745             : 
    8746          66 :         UpdateControlFile();
    8747          66 :         LWLockRelease(ControlFileLock);
    8748             : 
    8749             :         /* Check to see if any parameter change gives a problem on recovery */
    8750          66 :         CheckRequiredParameterValues();
    8751             :     }
    8752        1346 :     else if (info == XLOG_FPW_CHANGE)
    8753             :     {
    8754             :         bool        fpw;
    8755             : 
    8756           0 :         memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
    8757             : 
    8758             :         /*
    8759             :          * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
    8760             :          * do_pg_backup_start() and do_pg_backup_stop() can check whether
    8761             :          * full_page_writes has been disabled during online backup.
    8762             :          */
    8763           0 :         if (!fpw)
    8764             :         {
    8765           0 :             SpinLockAcquire(&XLogCtl->info_lck);
    8766           0 :             if (XLogCtl->lastFpwDisableRecPtr < record->ReadRecPtr)
    8767           0 :                 XLogCtl->lastFpwDisableRecPtr = record->ReadRecPtr;
    8768           0 :             SpinLockRelease(&XLogCtl->info_lck);
    8769             :         }
    8770             : 
    8771             :         /* Keep track of full_page_writes */
    8772           0 :         lastFullPageWrites = fpw;
    8773             :     }
    8774             :     else if (info == XLOG_CHECKPOINT_REDO)
    8775             :     {
    8776             :         /* nothing to do here, just for informational purposes */
    8777             :     }
    8778       86582 : }
    8779             : 
    8780             : /*
    8781             :  * Return the extra open flags used for opening a file under sync method
    8782             :  * 'method': PG_O_DIRECT and/or O_SYNC/O_DSYNC, or 0 if none apply.  The result also depends on the fsync and debug_io_direct GUCs.
    8783             :  */
    8784             : static int
    8785       31328 : get_sync_bit(int method)
    8786             : {
    8787       31328 :     int         o_direct_flag = 0;
    8788             : 
    8789             :     /*
    8790             :      * Use O_DIRECT if requested, except in walreceiver process.  The WAL
    8791             :      * written by walreceiver is normally read by the startup process soon
    8792             :      * after it's written.  Also, walreceiver performs unaligned writes, which
    8793             :      * don't work with O_DIRECT, so it is required for correctness too.
    8794             :      */
    8795       31328 :     if ((io_direct_flags & IO_DIRECT_WAL) && !AmWalReceiverProcess())
    8796          18 :         o_direct_flag = PG_O_DIRECT;
    8797             : 
    8798             :     /* If fsync is disabled, never open in sync mode; only the direct-I/O flag (if any) is returned */
    8799       31328 :     if (!enableFsync)
    8800       31328 :         return o_direct_flag;
    8801             : 
    8802           0 :     switch (method)
    8803             :     {
    8804             :             /*
    8805             :              * enum values for all sync options are defined even if they are
    8806             :              * not supported on the current platform.  But if not, they are
    8807             :              * not included in the enum option array, and therefore will never
    8808             :              * be seen here.
    8809             :              */
    8810           0 :         case WAL_SYNC_METHOD_FSYNC:
    8811             :         case WAL_SYNC_METHOD_FSYNC_WRITETHROUGH:
    8812             :         case WAL_SYNC_METHOD_FDATASYNC:
    8813           0 :             return o_direct_flag;	/* no sync-on-write open flag for these methods */
    8814             : #ifdef O_SYNC
    8815           0 :         case WAL_SYNC_METHOD_OPEN:
    8816           0 :             return O_SYNC | o_direct_flag;
    8817             : #endif
    8818             : #ifdef O_DSYNC
    8819           0 :         case WAL_SYNC_METHOD_OPEN_DSYNC:
    8820           0 :             return O_DSYNC | o_direct_flag;
    8821             : #endif
    8822           0 :         default:
    8823             :             /* can't happen (unless we are out of sync with option array) */
    8824           0 :             elog(ERROR, "unrecognized \"wal_sync_method\": %d", method);
    8825             :             return 0;           /* silence warning */
    8826             :     }
    8827             : }
    8828             : 
    8829             : /*
    8830             :  * GUC support: assign function for wal_sync_method.  When the value changes and a log file is open, fsync that file (PANIC on failure) and close it if its open flags would differ.  'extra' is unused.
    8831             :  */
    8832             : void
    8833        2208 : assign_wal_sync_method(int new_wal_sync_method, void *extra)
    8834             : {
    8835        2208 :     if (wal_sync_method != new_wal_sync_method)
    8836             :     {
    8837             :         /*
    8838             :          * To ensure that no blocks escape unsynced, force an fsync on the
    8839             :          * currently open log segment (if any).  Also, if the open flag is
    8840             :          * changing, close the log file so it will be reopened (with new flag
    8841             :          * bit) at next use.
    8842             :          */
    8843           0 :         if (openLogFile >= 0)
    8844             :         {
    8845           0 :             pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
    8846           0 :             if (pg_fsync(openLogFile) != 0)
    8847             :             {
    8848             :                 char        xlogfname[MAXFNAMELEN];
    8849             :                 int         save_errno;
    8850             : 
    8851           0 :                 save_errno = errno;	/* keep pg_fsync's errno for %m across XLogFileName() */
    8852           0 :                 XLogFileName(xlogfname, openLogTLI, openLogSegNo,
    8853             :                              wal_segment_size);
    8854           0 :                 errno = save_errno;
    8855           0 :                 ereport(PANIC,
    8856             :                         (errcode_for_file_access(),
    8857             :                          errmsg("could not fsync file \"%s\": %m", xlogfname)));
    8858             :             }
    8859             : 
    8860           0 :             pgstat_report_wait_end();
    8861           0 :             if (get_sync_bit(wal_sync_method) != get_sync_bit(new_wal_sync_method))
    8862           0 :                 XLogFileClose();
    8863             :         }
    8864             :     }
    8865        2208 : }
    8866             : 
    8867             : 
    8868             : /*
    8869             :  * Issue appropriate kind of fsync (if any) for an XLOG output file, as selected by wal_sync_method.  PANICs if the sync call fails.
    8870             :  *
    8871             :  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
    8872             :  * 'segno' and 'tli' are for error reporting purposes (they name the file); 'tli' must be nonzero.
    8873             :  */
    8874             : void
    8875      313790 : issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli)
    8876             : {
    8877      313790 :     char       *msg = NULL;
    8878             :     instr_time  start;
    8879             : 
    8880             :     Assert(tli != 0);
    8881             : 
    8882             :     /*
    8883             :      * Quick exit if fsync is disabled or write() has already synced the WAL
    8884             :      * file.
    8885             :      */
    8886      313790 :     if (!enableFsync ||
    8887           0 :         wal_sync_method == WAL_SYNC_METHOD_OPEN ||
    8888           0 :         wal_sync_method == WAL_SYNC_METHOD_OPEN_DSYNC)
    8889      313790 :         return;
    8890             : 
    8891             :     /*
    8892             :      * Measure I/O timing to sync the WAL file for pg_stat_io.
    8893             :      */
    8894           0 :     start = pgstat_prepare_io_time(track_wal_io_timing);
    8895             : 
    8896           0 :     pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
    8897           0 :     switch (wal_sync_method)
    8898             :     {
    8899           0 :         case WAL_SYNC_METHOD_FSYNC:
    8900           0 :             if (pg_fsync_no_writethrough(fd) != 0)
    8901           0 :                 msg = _("could not fsync file \"%s\": %m");
    8902           0 :             break;
    8903             : #ifdef HAVE_FSYNC_WRITETHROUGH
    8904             :         case WAL_SYNC_METHOD_FSYNC_WRITETHROUGH:
    8905             :             if (pg_fsync_writethrough(fd) != 0)
    8906             :                 msg = _("could not fsync write-through file \"%s\": %m");
    8907             :             break;
    8908             : #endif
    8909           0 :         case WAL_SYNC_METHOD_FDATASYNC:
    8910           0 :             if (pg_fdatasync(fd) != 0)
    8911           0 :                 msg = _("could not fdatasync file \"%s\": %m");
    8912           0 :             break;
    8913           0 :         case WAL_SYNC_METHOD_OPEN:
    8914             :         case WAL_SYNC_METHOD_OPEN_DSYNC:
    8915             :             /* not reachable: these methods took the quick exit above */
    8916             :             Assert(false);
    8917           0 :             break;
    8918           0 :         default:
    8919           0 :             ereport(PANIC,
    8920             :                     errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    8921             :                     errmsg_internal("unrecognized \"wal_sync_method\": %d", wal_sync_method));
    8922             :             break;
    8923             :     }
    8924             : 
    8925             :     /* PANIC if failed to fsync; 'msg' is the format string chosen by the failing case above */
    8926           0 :     if (msg)
    8927             :     {
    8928             :         char        xlogfname[MAXFNAMELEN];
    8929           0 :         int         save_errno = errno;	/* keep the sync call's errno for %m across XLogFileName() */
    8930             : 
    8931           0 :         XLogFileName(xlogfname, tli, segno, wal_segment_size);
    8932           0 :         errno = save_errno;
    8933           0 :         ereport(PANIC,
    8934             :                 (errcode_for_file_access(),
    8935             :                  errmsg(msg, xlogfname)));
    8936             :     }
    8937             : 
    8938           0 :     pgstat_report_wait_end();
    8939             : 
    8940           0 :     pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_FSYNC,
    8941             :                             start, 1, 0);
    8942             : }
    8943             : 
    8944             : /*
    8945             :  * do_pg_backup_start is the workhorse of the user-visible pg_backup_start()
    8946             :  * function. It creates the necessary starting checkpoint and constructs the
    8947             :  * backup state and tablespace map.
    8948             :  *
    8949             :  * Input parameters are "state" (the backup state), "fast" (if true, we do
    8950             :  * the checkpoint in immediate mode to make it faster), and "tablespaces"
    8951             :  * (if non-NULL, indicates a list of tablespaceinfo structs describing the
    8952             :  * cluster's tablespaces).
    8953             :  *
    8954             :  * The tablespace map contents are appended to passed-in parameter
    8955             :  * tblspcmapfile and the caller is responsible for including it in the backup
    8956             :  * archive as 'tablespace_map'. The tablespace_map file is required mainly for
    8957             :  * tar format in windows as native windows utilities are not able to create
    8958             :  * symlinks while extracting files from tar. However for consistency and
    8959             :  * platform-independence, we do it the same way everywhere.
    8960             :  *
    8961             :  * It fills in "state" with the information required for the backup, such
    8962             :  * as the minimum WAL location that must be present to restore from this
    8963             :  * backup (startpoint) and the corresponding timeline ID (starttli).
    8964             :  *
    8965             :  * Every successfully started backup must be stopped by calling
    8966             :  * do_pg_backup_stop() or do_pg_abort_backup(). There can be many
    8967             :  * backups active at the same time.
    8968             :  *
    8969             :  * It is the responsibility of the caller of this function to verify the
    8970             :  * permissions of the calling user!
    8971             :  */
    8972             : void
    8973         322 : do_pg_backup_start(const char *backupidstr, bool fast, List **tablespaces,
    8974             :                    BackupState *state, StringInfo tblspcmapfile)
    8975             : {
    8976             :     bool        backup_started_in_recovery;
    8977             : 
    8978             :     Assert(state != NULL);
    8979         322 :     backup_started_in_recovery = RecoveryInProgress();
    8980             : 
    8981             :     /*
    8982             :      * During recovery, we don't need to check WAL level. Because, if WAL
    8983             :      * level is not sufficient, it's impossible to get here during recovery.
    8984             :      */
    8985         322 :     if (!backup_started_in_recovery && !XLogIsNeeded())
    8986           0 :         ereport(ERROR,
    8987             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    8988             :                  errmsg("WAL level not sufficient for making an online backup"),
    8989             :                  errhint("\"wal_level\" must be set to \"replica\" or \"logical\" at server start.")));
    8990             : 
    8991         322 :     if (strlen(backupidstr) > MAXPGPATH)
    8992           2 :         ereport(ERROR,
    8993             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    8994             :                  errmsg("backup label too long (max %d bytes)",
    8995             :                         MAXPGPATH)));
    8996             : 
    8997         320 :     strlcpy(state->name, backupidstr, sizeof(state->name));
    8998             : 
    8999             :     /*
    9000             :      * Mark backup active in shared memory.  We must do full-page WAL writes
    9001             :      * during an on-line backup even if not doing so at other times, because
    9002             :      * it's quite possible for the backup dump to obtain a "torn" (partially
    9003             :      * written) copy of a database page if it reads the page concurrently with
    9004             :      * our write to the same page.  This can be fixed as long as the first
    9005             :      * write to the page in the WAL sequence is a full-page write. Hence, we
    9006             :      * increment runningBackups then force a CHECKPOINT, to ensure there are
    9007             :      * no dirty pages in shared memory that might get dumped while the backup
    9008             :      * is in progress without having a corresponding WAL record.  (Once the
    9009             :      * backup is complete, we need not force full-page writes anymore, since
    9010             :      * we expect that any pages not modified during the backup interval must
    9011             :      * have been correctly captured by the backup.)
    9012             :      *
    9013             :      * Note that forcing full-page writes has no effect during an online
    9014             :      * backup from the standby.
    9015             :      *
    9016             :      * We must hold all the insertion locks to change the value of
    9017             :      * runningBackups, to ensure adequate interlocking against
    9018             :      * XLogInsertRecord().
    9019             :      */
    9020         320 :     WALInsertLockAcquireExclusive();
    9021         320 :     XLogCtl->Insert.runningBackups++;
    9022         320 :     WALInsertLockRelease();
    9023             : 
    9024             :     /*
    9025             :      * Ensure we decrement runningBackups if we fail below. NB -- for this to
    9026             :      * work correctly, it is critical that sessionBackupState is only updated
    9027             :      * after this block is over.
    9028             :      */
    9029         320 :     PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, DatumGetBool(true));
    9030             :     {
    9031         320 :         bool        gotUniqueStartpoint = false;
    9032             :         DIR        *tblspcdir;
    9033             :         struct dirent *de;
    9034             :         tablespaceinfo *ti;
    9035             :         int         datadirpathlen;
    9036             : 
    9037             :         /*
    9038             :          * Force an XLOG file switch before the checkpoint, to ensure that the
    9039             :          * WAL segment the checkpoint is written to doesn't contain pages with
    9040             :          * old timeline IDs.  That would otherwise happen if you called
    9041             :          * pg_backup_start() right after restoring from a PITR archive: the
    9042             :          * first WAL segment containing the startup checkpoint has pages in
    9043             :          * the beginning with the old timeline ID.  That can cause trouble at
    9044             :          * recovery: we won't have a history file covering the old timeline if
    9045             :          * pg_wal directory was not included in the base backup and the WAL
    9046             :          * archive was cleared too before starting the backup.
    9047             :          *
    9048             :          * This also ensures that we have emitted a WAL page header that has
    9049             :          * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
    9050             :          * Therefore, if a WAL archiver (such as pglesslog) is trying to
    9051             :          * compress out removable backup blocks, it won't remove any that
    9052             :          * occur after this point.
    9053             :          *
    9054             :          * During recovery, we skip forcing XLOG file switch, which means that
    9055             :          * the backup taken during recovery is not available for the special
    9056             :          * recovery case described above.
    9057             :          */
    9058         320 :         if (!backup_started_in_recovery)
    9059         306 :             RequestXLogSwitch(false);
    9060             : 
    9061             :         do
    9062             :         {
    9063             :             bool        checkpointfpw;
    9064             : 
    9065             :             /*
    9066             :              * Force a CHECKPOINT.  Aside from being necessary to prevent torn
    9067             :              * page problems, this guarantees that two successive backup runs
    9068             :              * will have different checkpoint positions and hence different
    9069             :              * history file names, even if nothing happened in between.
    9070             :              *
    9071             :              * During recovery, establish a restartpoint if possible. We use
    9072             :              * the last restartpoint as the backup starting checkpoint. This
    9073             :              * means that two successive backup runs can have same checkpoint
    9074             :              * positions.
    9075             :              *
    9076             :              * Since the fact that we are executing do_pg_backup_start()
    9077             :              * during recovery means that checkpointer is running, we can use
    9078             :              * RequestCheckpoint() to establish a restartpoint.
    9079             :              *
    9080             :              * We use CHECKPOINT_IMMEDIATE only if requested by user (via
    9081             :              * passing fast = true).  Otherwise this can take awhile.
    9082             :              */
    9083         320 :             RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
    9084             :                               (fast ? CHECKPOINT_IMMEDIATE : 0));
    9085             : 
    9086             :             /*
    9087             :              * Now we need to fetch the checkpoint record location, and also
    9088             :              * its REDO pointer.  The oldest point in WAL that would be needed
    9089             :              * to restore starting from the checkpoint is precisely the REDO
    9090             :              * pointer.
    9091             :              */
    9092         320 :             LWLockAcquire(ControlFileLock, LW_SHARED);
    9093         320 :             state->checkpointloc = ControlFile->checkPoint;
    9094         320 :             state->startpoint = ControlFile->checkPointCopy.redo;
    9095         320 :             state->starttli = ControlFile->checkPointCopy.ThisTimeLineID;
    9096         320 :             checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
    9097         320 :             LWLockRelease(ControlFileLock);
    9098             : 
    9099         320 :             if (backup_started_in_recovery)
    9100             :             {
    9101             :                 XLogRecPtr  recptr;
    9102             : 
    9103             :                 /*
    9104             :                  * Check to see if all WAL replayed during online backup
    9105             :                  * (i.e., since last restartpoint used as backup starting
    9106             :                  * checkpoint) contain full-page writes.
    9107             :                  */
    9108          14 :                 SpinLockAcquire(&XLogCtl->info_lck);
    9109          14 :                 recptr = XLogCtl->lastFpwDisableRecPtr;
    9110          14 :                 SpinLockRelease(&XLogCtl->info_lck);
    9111             : 
    9112          14 :                 if (!checkpointfpw || state->startpoint <= recptr)
    9113           0 :                     ereport(ERROR,
    9114             :                             (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    9115             :                              errmsg("WAL generated with \"full_page_writes=off\" was replayed "
    9116             :                                     "since last restartpoint"),
    9117             :                              errhint("This means that the backup being taken on the standby "
    9118             :                                      "is corrupt and should not be used. "
    9119             :                                      "Enable \"full_page_writes\" and run CHECKPOINT on the primary, "
    9120             :                                      "and then try an online backup again.")));
    9121             : 
    9122             :                 /*
    9123             :                  * During recovery, since we don't use the end-of-backup WAL
    9124             :                  * record and don't write the backup history file, the
    9125             :                  * starting WAL location doesn't need to be unique. This means
    9126             :                  * that two base backups started at the same time might use
    9127             :                  * the same checkpoint as starting locations.
    9128             :                  */
    9129          14 :                 gotUniqueStartpoint = true;
    9130             :             }
    9131             : 
    9132             :             /*
    9133             :              * If two base backups are started at the same time (in WAL sender
    9134             :              * processes), we need to make sure that they use different
    9135             :              * checkpoints as starting locations, because we use the starting
    9136             :              * WAL location as a unique identifier for the base backup in the
    9137             :              * end-of-backup WAL record and when we write the backup history
    9138             :              * file. Perhaps it would be better generate a separate unique ID
    9139             :              * for each backup instead of forcing another checkpoint, but
    9140             :              * taking a checkpoint right after another is not that expensive
    9141             :              * either because only few buffers have been dirtied yet.
    9142             :              */
    9143         320 :             WALInsertLockAcquireExclusive();
    9144         320 :             if (XLogCtl->Insert.lastBackupStart < state->startpoint)
    9145             :             {
    9146         320 :                 XLogCtl->Insert.lastBackupStart = state->startpoint;
    9147         320 :                 gotUniqueStartpoint = true;
    9148             :             }
    9149         320 :             WALInsertLockRelease();
    9150         320 :         } while (!gotUniqueStartpoint);
    9151             : 
    9152             :         /*
    9153             :          * Construct tablespace_map file.
    9154             :          */
    9155         320 :         datadirpathlen = strlen(DataDir);
    9156             : 
    9157             :         /* Collect information about all tablespaces */
    9158         320 :         tblspcdir = AllocateDir(PG_TBLSPC_DIR);
    9159        1032 :         while ((de = ReadDir(tblspcdir, PG_TBLSPC_DIR)) != NULL)
    9160             :         {
    9161             :             char        fullpath[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
    9162             :             char        linkpath[MAXPGPATH];
    9163         712 :             char       *relpath = NULL;
    9164             :             char       *s;
    9165             :             PGFileType  de_type;
    9166             :             char       *badp;
    9167             :             Oid         tsoid;
    9168             : 
    9169             :             /*
    9170             :              * Try to parse the directory name as an unsigned integer.
    9171             :              *
    9172             :              * Tablespace directories should be positive integers that can be
    9173             :              * represented in 32 bits, with no leading zeroes or trailing
    9174             :              * garbage. If we come across a name that doesn't meet those
    9175             :              * criteria, skip it.
    9176             :              */
    9177         712 :             if (de->d_name[0] < '1' || de->d_name[1] > '9')
    9178         640 :                 continue;
    9179          72 :             errno = 0;
    9180          72 :             tsoid = strtoul(de->d_name, &badp, 10);
    9181          72 :             if (*badp != '\0' || errno == EINVAL || errno == ERANGE)
    9182           0 :                 continue;
    9183             : 
    9184          72 :             snprintf(fullpath, sizeof(fullpath), "%s/%s", PG_TBLSPC_DIR, de->d_name);
    9185             : 
    9186          72 :             de_type = get_dirent_type(fullpath, de, false, ERROR);
    9187             : 
    9188          72 :             if (de_type == PGFILETYPE_LNK)
    9189             :             {
    9190             :                 StringInfoData escapedpath;
    9191             :                 int         rllen;
    9192             : 
    9193          44 :                 rllen = readlink(fullpath, linkpath, sizeof(linkpath));
    9194          44 :                 if (rllen < 0)
    9195             :                 {
    9196           0 :                     ereport(WARNING,
    9197             :                             (errmsg("could not read symbolic link \"%s\": %m",
    9198             :                                     fullpath)));
    9199           0 :                     continue;
    9200             :                 }
    9201          44 :                 else if (rllen >= sizeof(linkpath))
    9202             :                 {
    9203           0 :                     ereport(WARNING,
    9204             :                             (errmsg("symbolic link \"%s\" target is too long",
    9205             :                                     fullpath)));
    9206           0 :                     continue;
    9207             :                 }
    9208          44 :                 linkpath[rllen] = '\0';
    9209             : 
    9210             :                 /*
    9211             :                  * Relpath holds the relative path of the tablespace directory
    9212             :                  * when it's located within PGDATA, or NULL if it's located
    9213             :                  * elsewhere.
    9214             :                  */
    9215          44 :                 if (rllen > datadirpathlen &&
    9216           2 :                     strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
    9217           0 :                     IS_DIR_SEP(linkpath[datadirpathlen]))
    9218           0 :                     relpath = pstrdup(linkpath + datadirpathlen + 1);
    9219             : 
    9220             :                 /*
    9221             :                  * Add a backslash-escaped version of the link path to the
    9222             :                  * tablespace map file.
    9223             :                  */
    9224          44 :                 initStringInfo(&escapedpath);
    9225        1092 :                 for (s = linkpath; *s; s++)
    9226             :                 {
    9227        1048 :                     if (*s == '\n' || *s == '\r' || *s == '\\')
    9228           0 :                         appendStringInfoChar(&escapedpath, '\\');
    9229        1048 :                     appendStringInfoChar(&escapedpath, *s);
    9230             :                 }
    9231          44 :                 appendStringInfo(tblspcmapfile, "%s %s\n",
    9232          44 :                                  de->d_name, escapedpath.data);
    9233          44 :                 pfree(escapedpath.data);
    9234             :             }
    9235          28 :             else if (de_type == PGFILETYPE_DIR)
    9236             :             {
    9237             :                 /*
    9238             :                  * It's possible to use allow_in_place_tablespaces to create
    9239             :                  * directories directly under pg_tblspc, for testing purposes
    9240             :                  * only.
    9241             :                  *
    9242             :                  * In this case, we store a relative path rather than an
    9243             :                  * absolute path into the tablespaceinfo.
    9244             :                  */
    9245          28 :                 snprintf(linkpath, sizeof(linkpath), "%s/%s",
    9246          28 :                          PG_TBLSPC_DIR, de->d_name);
    9247          28 :                 relpath = pstrdup(linkpath);
    9248             :             }
    9249             :             else
    9250             :             {
    9251             :                 /* Skip any other file type that appears here. */
    9252           0 :                 continue;
    9253             :             }
    9254             : 
    9255          72 :             ti = palloc(sizeof(tablespaceinfo));
    9256          72 :             ti->oid = tsoid;
    9257          72 :             ti->path = pstrdup(linkpath);
    9258          72 :             ti->rpath = relpath;
    9259          72 :             ti->size = -1;
    9260             : 
    9261          72 :             if (tablespaces)
    9262          72 :                 *tablespaces = lappend(*tablespaces, ti);
    9263             :         }
    9264         320 :         FreeDir(tblspcdir);
    9265             : 
    9266         320 :         state->starttime = (pg_time_t) time(NULL);
    9267             :     }
    9268         320 :     PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, DatumGetBool(true));
    9269             : 
    9270         320 :     state->started_in_recovery = backup_started_in_recovery;
    9271             : 
    9272             :     /*
    9273             :      * Mark that the start phase has correctly finished for the backup.
    9274             :      */
    9275         320 :     sessionBackupState = SESSION_BACKUP_RUNNING;
    9276         320 : }
    9277             : 
    9278             : /*
    9279             :  * Utility routine to fetch the session-level status of a backup running.
    9280             :  */
    9281             : SessionBackupState
    9282         360 : get_backup_status(void)
    9283             : {
    9284         360 :     return sessionBackupState;
    9285             : }
    9286             : 
    9287             : /*
    9288             :  * do_pg_backup_stop
    9289             :  *
    9290             :  * Utility function called at the end of an online backup.  It creates history
    9291             :  * file (if required), resets sessionBackupState and so on.  It can optionally
    9292             :  * wait for WAL segments to be archived.
    9293             :  *
    9294             :  * "state" is filled with the information necessary to restore from this
    9295             :  * backup with its stop LSN (stoppoint), its timeline ID (stoptli), etc.
    9296             :  *
    9297             :  * It is the responsibility of the caller of this function to verify the
    9298             :  * permissions of the calling user!
    9299             :  */
    9300             : void
    9301         310 : do_pg_backup_stop(BackupState *state, bool waitforarchive)
    9302             : {
    9303         310 :     bool        backup_stopped_in_recovery = false;
    9304             :     char        histfilepath[MAXPGPATH];
    9305             :     char        lastxlogfilename[MAXFNAMELEN];
    9306             :     char        histfilename[MAXFNAMELEN];
    9307             :     XLogSegNo   _logSegNo;
    9308             :     FILE       *fp;
    9309             :     int         seconds_before_warning;
    9310         310 :     int         waits = 0;
    9311         310 :     bool        reported_waiting = false;
    9312             : 
    9313             :     Assert(state != NULL);
    9314             : 
    9315         310 :     backup_stopped_in_recovery = RecoveryInProgress();
    9316             : 
    9317             :     /*
    9318             :      * During recovery, we don't need to check WAL level. Because, if WAL
    9319             :      * level is not sufficient, it's impossible to get here during recovery.
    9320             :      */
    9321         310 :     if (!backup_stopped_in_recovery && !XLogIsNeeded())
    9322           0 :         ereport(ERROR,
    9323             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    9324             :                  errmsg("WAL level not sufficient for making an online backup"),
    9325             :                  errhint("\"wal_level\" must be set to \"replica\" or \"logical\" at server start.")));
    9326             : 
    9327             :     /*
    9328             :      * OK to update backup counter and session-level lock.
    9329             :      *
    9330             :      * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them,
    9331             :      * otherwise they can be updated inconsistently, which might cause
    9332             :      * do_pg_abort_backup() to fail.
    9333             :      */
    9334         310 :     WALInsertLockAcquireExclusive();
    9335             : 
    9336             :     /*
    9337             :      * It is expected that each do_pg_backup_start() call is matched by
    9338             :      * exactly one do_pg_backup_stop() call.
    9339             :      */
    9340             :     Assert(XLogCtl->Insert.runningBackups > 0);
    9341         310 :     XLogCtl->Insert.runningBackups--;
    9342             : 
    9343             :     /*
    9344             :      * Clean up session-level lock.
    9345             :      *
    9346             :      * You might think that WALInsertLockRelease() can be called before
    9347             :      * cleaning up session-level lock because session-level lock doesn't need
    9348             :      * to be protected with WAL insertion lock. But since
    9349             :      * CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
    9350             :      * cleaned up before it.
    9351             :      */
    9352         310 :     sessionBackupState = SESSION_BACKUP_NONE;
    9353             : 
    9354         310 :     WALInsertLockRelease();
    9355             : 
    9356             :     /*
    9357             :      * If we are taking an online backup from the standby, we confirm that the
    9358             :      * standby has not been promoted during the backup.
    9359             :      */
    9360         310 :     if (state->started_in_recovery && !backup_stopped_in_recovery)
    9361           0 :         ereport(ERROR,
    9362             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    9363             :                  errmsg("the standby was promoted during online backup"),
    9364             :                  errhint("This means that the backup being taken is corrupt "
    9365             :                          "and should not be used. "
    9366             :                          "Try taking another online backup.")));
    9367             : 
    9368             :     /*
    9369             :      * During recovery, we don't write an end-of-backup record. We assume that
    9370             :      * pg_control was backed up last and its minimum recovery point can be
    9371             :      * available as the backup end location. Since we don't have an
    9372             :      * end-of-backup record, we use the pg_control value to check whether
    9373             :      * we've reached the end of backup when starting recovery from this
    9374             :      * backup. We have no way of checking if pg_control wasn't backed up last
    9375             :      * however.
    9376             :      *
    9377             :      * We don't force a switch to new WAL file but it is still possible to
    9378             :      * wait for all the required files to be archived if waitforarchive is
    9379             :      * true. This is okay if we use the backup to start a standby and fetch
    9380             :      * the missing WAL using streaming replication. But in the case of an
    9381             :      * archive recovery, a user should set waitforarchive to true and wait for
    9382             :      * them to be archived to ensure that all the required files are
    9383             :      * available.
    9384             :      *
    9385             :      * We return the current minimum recovery point as the backup end
    9386             :      * location. Note that it can be greater than the exact backup end
    9387             :      * location if the minimum recovery point is updated after the backup of
    9388             :      * pg_control. This is harmless for current uses.
    9389             :      *
    9390             :      * XXX currently a backup history file is for informational and debug
    9391             :      * purposes only. It's not essential for an online backup. Furthermore,
    9392             :      * even if it's created, it will not be archived during recovery because
    9393             :      * an archiver is not invoked. So it doesn't seem worthwhile to write a
    9394             :      * backup history file during recovery.
    9395             :      */
    9396         310 :     if (backup_stopped_in_recovery)
    9397             :     {
    9398             :         XLogRecPtr  recptr;
    9399             : 
    9400             :         /*
    9401             :          * Check to see if all WAL replayed during online backup contain
    9402             :          * full-page writes.
    9403             :          */
    9404          14 :         SpinLockAcquire(&XLogCtl->info_lck);
    9405          14 :         recptr = XLogCtl->lastFpwDisableRecPtr;
    9406          14 :         SpinLockRelease(&XLogCtl->info_lck);
    9407             : 
    9408          14 :         if (state->startpoint <= recptr)
    9409           0 :             ereport(ERROR,
    9410             :                     (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    9411             :                      errmsg("WAL generated with \"full_page_writes=off\" was replayed "
    9412             :                             "during online backup"),
    9413             :                      errhint("This means that the backup being taken on the standby "
    9414             :                              "is corrupt and should not be used. "
    9415             :                              "Enable \"full_page_writes\" and run CHECKPOINT on the primary, "
    9416             :                              "and then try an online backup again.")));
    9417             : 
    9418             : 
    9419          14 :         LWLockAcquire(ControlFileLock, LW_SHARED);
    9420          14 :         state->stoppoint = ControlFile->minRecoveryPoint;
    9421          14 :         state->stoptli = ControlFile->minRecoveryPointTLI;
    9422          14 :         LWLockRelease(ControlFileLock);
    9423             :     }
    9424             :     else
    9425             :     {
    9426             :         char       *history_file;
    9427             : 
    9428             :         /*
    9429             :          * Write the backup-end xlog record
    9430             :          */
    9431         296 :         XLogBeginInsert();
    9432         296 :         XLogRegisterData(&state->startpoint,
    9433             :                          sizeof(state->startpoint));
    9434         296 :         state->stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
    9435             : 
    9436             :         /*
    9437             :          * Given that we're not in recovery, InsertTimeLineID is set and can't
    9438             :          * change, so we can read it without a lock.
    9439             :          */
    9440         296 :         state->stoptli = XLogCtl->InsertTimeLineID;
    9441             : 
    9442             :         /*
    9443             :          * Force a switch to a new xlog segment file, so that the backup is
    9444             :          * valid as soon as archiver moves out the current segment file.
    9445             :          */
    9446         296 :         RequestXLogSwitch(false);
    9447             : 
    9448         296 :         state->stoptime = (pg_time_t) time(NULL);
    9449             : 
    9450             :         /*
    9451             :          * Write the backup history file
    9452             :          */
    9453         296 :         XLByteToSeg(state->startpoint, _logSegNo, wal_segment_size);
    9454         296 :         BackupHistoryFilePath(histfilepath, state->stoptli, _logSegNo,
    9455             :                               state->startpoint, wal_segment_size);
    9456         296 :         fp = AllocateFile(histfilepath, "w");
    9457         296 :         if (!fp)
    9458           0 :             ereport(ERROR,
    9459             :                     (errcode_for_file_access(),
    9460             :                      errmsg("could not create file \"%s\": %m",
    9461             :                             histfilepath)));
    9462             : 
    9463             :         /* Build and save the contents of the backup history file */
    9464         296 :         history_file = build_backup_content(state, true);
    9465         296 :         fprintf(fp, "%s", history_file);
    9466         296 :         pfree(history_file);
    9467             : 
    9468         296 :         if (fflush(fp) || ferror(fp) || FreeFile(fp))
    9469           0 :             ereport(ERROR,
    9470             :                     (errcode_for_file_access(),
    9471             :                      errmsg("could not write file \"%s\": %m",
    9472             :                             histfilepath)));
    9473             : 
    9474             :         /*
    9475             :          * Clean out any no-longer-needed history files.  As a side effect,
    9476             :          * this will post a .ready file for the newly created history file,
    9477             :          * notifying the archiver that history file may be archived
    9478             :          * immediately.
    9479             :          */
    9480         296 :         CleanupBackupHistory();
    9481             :     }
    9482             : 
    9483             :     /*
    9484             :      * If archiving is enabled, wait for all the required WAL files to be
    9485             :      * archived before returning. If archiving isn't enabled, the required WAL
    9486             :      * needs to be transported via streaming replication (hopefully with
    9487             :      * wal_keep_size set high enough), or some more exotic mechanism like
    9488             :      * polling and copying files from pg_wal with script. We have no knowledge
    9489             :      * of those mechanisms, so it's up to the user to ensure that he gets all
    9490             :      * the required WAL.
    9491             :      *
    9492             :      * We wait until both the last WAL file filled during backup and the
    9493             :      * history file have been archived, and assume that the alphabetic sorting
    9494             :      * property of the WAL files ensures any earlier WAL files are safely
    9495             :      * archived as well.
    9496             :      *
    9497             :      * We wait forever, since archive_command is supposed to work and we
    9498             :      * assume the admin wanted his backup to work completely. If you don't
    9499             :      * wish to wait, then either waitforarchive should be passed in as false,
    9500             :      * or you can set statement_timeout.  Also, some notices are issued to
    9501             :      * clue in anyone who might be doing this interactively.
    9502             :      */
    9503             : 
    9504         310 :     if (waitforarchive &&
    9505          20 :         ((!backup_stopped_in_recovery && XLogArchivingActive()) ||
    9506           2 :          (backup_stopped_in_recovery && XLogArchivingAlways())))
    9507             :     {
    9508           8 :         XLByteToPrevSeg(state->stoppoint, _logSegNo, wal_segment_size);
    9509           8 :         XLogFileName(lastxlogfilename, state->stoptli, _logSegNo,
    9510             :                      wal_segment_size);
    9511             : 
    9512           8 :         XLByteToSeg(state->startpoint, _logSegNo, wal_segment_size);
    9513           8 :         BackupHistoryFileName(histfilename, state->stoptli, _logSegNo,
    9514             :                               state->startpoint, wal_segment_size);
    9515             : 
    9516           8 :         seconds_before_warning = 60;
    9517           8 :         waits = 0;
    9518             : 
    9519          24 :         while (XLogArchiveIsBusy(lastxlogfilename) ||
    9520           8 :                XLogArchiveIsBusy(histfilename))
    9521             :         {
    9522           8 :             CHECK_FOR_INTERRUPTS();
    9523             : 
    9524           8 :             if (!reported_waiting && waits > 5)
    9525             :             {
    9526           0 :                 ereport(NOTICE,
    9527             :                         (errmsg("base backup done, waiting for required WAL segments to be archived")));
    9528           0 :                 reported_waiting = true;
    9529             :             }
    9530             : 
    9531           8 :             (void) WaitLatch(MyLatch,
    9532             :                              WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
    9533             :                              1000L,
    9534             :                              WAIT_EVENT_BACKUP_WAIT_WAL_ARCHIVE);
    9535           8 :             ResetLatch(MyLatch);
    9536             : 
    9537           8 :             if (++waits >= seconds_before_warning)
    9538             :             {
    9539           0 :                 seconds_before_warning *= 2;    /* This wraps in >10 years... */
    9540           0 :                 ereport(WARNING,
    9541             :                         (errmsg("still waiting for all required WAL segments to be archived (%d seconds elapsed)",
    9542             :                                 waits),
    9543             :                          errhint("Check that your \"archive_command\" is executing properly.  "
    9544             :                                  "You can safely cancel this backup, "
    9545             :                                  "but the database backup will not be usable without all the WAL segments.")));
    9546             :             }
    9547             :         }
    9548             : 
    9549           8 :         ereport(NOTICE,
    9550             :                 (errmsg("all required WAL segments have been archived")));
    9551             :     }
    9552         302 :     else if (waitforarchive)
    9553          12 :         ereport(NOTICE,
    9554             :                 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
    9555         310 : }
    9556             : 
    9557             : 
    9558             : /*
    9559             :  * do_pg_abort_backup: abort a running backup
    9560             :  *
    9561             :  * This does just the most basic steps of do_pg_backup_stop(), by taking the
    9562             :  * system out of backup mode, thus making it a lot more safe to call from
    9563             :  * an error handler.
    9564             :  *
    9565             :  * 'arg' indicates that it's being called during backup setup; so
    9566             :  * sessionBackupState has not been modified yet, but runningBackups has
    9567             :  * already been incremented.  When it's false, then it's invoked as a
    9568             :  * before_shmem_exit handler, and therefore we must not change state
    9569             :  * unless sessionBackupState indicates that a backup is actually running.
    9570             :  *
    9571             :  * NB: This gets used as a PG_ENSURE_ERROR_CLEANUP callback and
    9572             :  * before_shmem_exit handler, hence the odd-looking signature.
    9573             :  */
    9574             : void
    9575          14 : do_pg_abort_backup(int code, Datum arg)
    9576             : {
    9577          14 :     bool        during_backup_start = DatumGetBool(arg);
    9578             : 
    9579             :     /* If called during backup start, there shouldn't be one already running */
    9580             :     Assert(!during_backup_start || sessionBackupState == SESSION_BACKUP_NONE);
    9581             : 
    9582          14 :     if (during_backup_start || sessionBackupState != SESSION_BACKUP_NONE)
    9583             :     {
    9584          10 :         WALInsertLockAcquireExclusive();
    9585             :         Assert(XLogCtl->Insert.runningBackups > 0);
    9586          10 :         XLogCtl->Insert.runningBackups--;
    9587             : 
    9588          10 :         sessionBackupState = SESSION_BACKUP_NONE;
    9589          10 :         WALInsertLockRelease();
    9590             : 
    9591          10 :         if (!during_backup_start)
    9592          10 :             ereport(WARNING,
    9593             :                     errmsg("aborting backup due to backend exiting before pg_backup_stop was called"));
    9594             :     }
    9595          14 : }
    9596             : 
    9597             : /*
    9598             :  * Register a handler that will warn about unterminated backups at end of
    9599             :  * session, unless this has already been done.
    9600             :  */
    9601             : void
    9602           6 : register_persistent_abort_backup_handler(void)
    9603             : {
    9604             :     static bool already_done = false;
    9605             : 
    9606           6 :     if (already_done)
    9607           2 :         return;
    9608           4 :     before_shmem_exit(do_pg_abort_backup, DatumGetBool(false));
    9609           4 :     already_done = true;
    9610             : }
    9611             : 
    9612             : /*
    9613             :  * Get latest WAL insert pointer
    9614             :  */
    9615             : XLogRecPtr
    9616        3956 : GetXLogInsertRecPtr(void)
    9617             : {
    9618        3956 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    9619             :     uint64      current_bytepos;
    9620             : 
    9621        3956 :     SpinLockAcquire(&Insert->insertpos_lck);
    9622        3956 :     current_bytepos = Insert->CurrBytePos;
    9623        3956 :     SpinLockRelease(&Insert->insertpos_lck);
    9624             : 
    9625        3956 :     return XLogBytePosToRecPtr(current_bytepos);
    9626             : }
    9627             : 
    9628             : /*
    9629             :  * Get latest WAL write pointer
    9630             :  */
    9631             : XLogRecPtr
    9632        2900 : GetXLogWriteRecPtr(void)
    9633             : {
    9634        2900 :     RefreshXLogWriteResult(LogwrtResult);
    9635             : 
    9636        2900 :     return LogwrtResult.Write;
    9637             : }
    9638             : 
    9639             : /*
    9640             :  * Returns the redo pointer of the last checkpoint or restartpoint. This is
    9641             :  * the oldest point in WAL that we still need, if we have to restart recovery.
    9642             :  */
    9643             : void
    9644         776 : GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
    9645             : {
    9646         776 :     LWLockAcquire(ControlFileLock, LW_SHARED);
    9647         776 :     *oldrecptr = ControlFile->checkPointCopy.redo;
    9648         776 :     *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
    9649         776 :     LWLockRelease(ControlFileLock);
    9650         776 : }
    9651             : 
    9652             : /* Thin wrapper around ShutdownWalRcv(). */
    9653             : void
    9654        2104 : XLogShutdownWalRcv(void)
    9655             : {
    9656        2104 :     ShutdownWalRcv();
    9657             : 
    9658        2104 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    9659        2104 :     XLogCtl->InstallXLogFileSegmentActive = false;
    9660        2104 :     LWLockRelease(ControlFileLock);
    9661        2104 : }
    9662             : 
    9663             : /* Enable WAL file recycling and preallocation. */
    9664             : void
    9665        2218 : SetInstallXLogFileSegmentActive(void)
    9666             : {
    9667        2218 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    9668        2218 :     XLogCtl->InstallXLogFileSegmentActive = true;
    9669        2218 :     LWLockRelease(ControlFileLock);
    9670        2218 : }
    9671             : 
    9672             : bool
    9673           0 : IsInstallXLogFileSegmentActive(void)
    9674             : {
    9675             :     bool        result;
    9676             : 
    9677           0 :     LWLockAcquire(ControlFileLock, LW_SHARED);
    9678           0 :     result = XLogCtl->InstallXLogFileSegmentActive;
    9679           0 :     LWLockRelease(ControlFileLock);
    9680             : 
    9681           0 :     return result;
    9682             : }
    9683             : 
    9684             : /*
    9685             :  * Update the WalWriterSleeping flag.
    9686             :  */
    9687             : void
    9688         972 : SetWalWriterSleeping(bool sleeping)
    9689             : {
    9690         972 :     SpinLockAcquire(&XLogCtl->info_lck);
    9691         972 :     XLogCtl->WalWriterSleeping = sleeping;
    9692         972 :     SpinLockRelease(&XLogCtl->info_lck);
    9693         972 : }

Generated by: LCOV version 1.16