LCOV - code coverage report
Current view: top level - src/backend/access/transam - xlog.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 88.8 % 2674 2375
Test Date: 2026-04-07 14:16:30 Functions: 96.4 % 137 132
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * xlog.c
       4              :  *      PostgreSQL write-ahead log manager
       5              :  *
       6              :  * The Write-Ahead Log (WAL) functionality is split into several source
       7              :  * files, in addition to this one:
       8              :  *
       9              :  * xloginsert.c - Functions for constructing WAL records
      10              :  * xlogrecovery.c - WAL recovery and standby code
      11              :  * xlogreader.c - Facility for reading WAL files and parsing WAL records
      12              :  * xlogutils.c - Helper functions for WAL redo routines
      13              :  *
      14              :  * This file contains functions for coordinating database startup and
      15              :  * checkpointing, and managing the write-ahead log buffers when the
      16              :  * system is running.
      17              :  *
      18              :  * StartupXLOG() is the main entry point of the startup process.  It
      19              :  * coordinates database startup, performing WAL recovery, and the
      20              :  * transition from WAL recovery into normal operations.
      21              :  *
      22              :  * XLogInsertRecord() inserts a WAL record into the WAL buffers.  Most
      23              :  * callers should not call this directly, but use the functions in
      24              :  * xloginsert.c to construct the WAL record.  XLogFlush() can be used
      25              :  * to force the WAL to disk.
      26              :  *
      27              :  * In addition to those, there are many other functions for interrogating
      28              :  * the current system state, and for starting/stopping backups.
      29              :  *
      30              :  *
      31              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      32              :  * Portions Copyright (c) 1994, Regents of the University of California
      33              :  *
      34              :  * src/backend/access/transam/xlog.c
      35              :  *
      36              :  *-------------------------------------------------------------------------
      37              :  */
      38              : 
      39              : #include "postgres.h"
      40              : 
      41              : #include <ctype.h>
      42              : #include <math.h>
      43              : #include <time.h>
      44              : #include <fcntl.h>
      45              : #include <sys/stat.h>
      46              : #include <sys/time.h>
      47              : #include <unistd.h>
      48              : 
      49              : #include "access/clog.h"
      50              : #include "access/commit_ts.h"
      51              : #include "access/heaptoast.h"
      52              : #include "access/multixact.h"
      53              : #include "access/rewriteheap.h"
      54              : #include "access/subtrans.h"
      55              : #include "access/timeline.h"
      56              : #include "access/transam.h"
      57              : #include "access/twophase.h"
      58              : #include "access/xact.h"
      59              : #include "access/xlog_internal.h"
      60              : #include "access/xlogarchive.h"
      61              : #include "access/xloginsert.h"
      62              : #include "access/xlogreader.h"
      63              : #include "access/xlogrecovery.h"
      64              : #include "access/xlogutils.h"
      65              : #include "access/xlogwait.h"
      66              : #include "backup/basebackup.h"
      67              : #include "catalog/catversion.h"
      68              : #include "catalog/pg_control.h"
      69              : #include "catalog/pg_database.h"
      70              : #include "common/controldata_utils.h"
      71              : #include "common/file_utils.h"
      72              : #include "executor/instrument.h"
      73              : #include "miscadmin.h"
      74              : #include "pg_trace.h"
      75              : #include "pgstat.h"
      76              : #include "port/atomics.h"
      77              : #include "postmaster/bgwriter.h"
      78              : #include "postmaster/datachecksum_state.h"
      79              : #include "postmaster/startup.h"
      80              : #include "postmaster/walsummarizer.h"
      81              : #include "postmaster/walwriter.h"
      82              : #include "replication/origin.h"
      83              : #include "replication/slot.h"
      84              : #include "replication/slotsync.h"
      85              : #include "replication/snapbuild.h"
      86              : #include "replication/walreceiver.h"
      87              : #include "replication/walsender.h"
      88              : #include "storage/bufmgr.h"
      89              : #include "storage/fd.h"
      90              : #include "storage/ipc.h"
      91              : #include "storage/large_object.h"
      92              : #include "storage/latch.h"
      93              : #include "storage/predicate.h"
      94              : #include "storage/proc.h"
      95              : #include "storage/procarray.h"
      96              : #include "storage/procsignal.h"
      97              : #include "storage/reinit.h"
      98              : #include "storage/spin.h"
      99              : #include "storage/subsystems.h"
     100              : #include "storage/sync.h"
     101              : #include "utils/guc_hooks.h"
     102              : #include "utils/guc_tables.h"
     103              : #include "utils/injection_point.h"
     104              : #include "utils/pgstat_internal.h"
     105              : #include "utils/ps_status.h"
     106              : #include "utils/relmapper.h"
     107              : #include "utils/snapmgr.h"
     108              : #include "utils/timeout.h"
     109              : #include "utils/timestamp.h"
     110              : #include "utils/varlena.h"
     111              : #include "utils/wait_event.h"
     112              : 
     113              : #ifdef WAL_DEBUG
     114              : #include "utils/memutils.h"
     115              : #endif
     116              : 
     117              : /* timeline ID to be used when bootstrapping */
     118              : #define BootstrapTimeLineID     1
     119              : 
     120              : /* User-settable parameters */
     121              : int         max_wal_size_mb = 1024; /* 1 GB */
     122              : int         min_wal_size_mb = 80;   /* 80 MB */
     123              : int         wal_keep_size_mb = 0;
     124              : int         XLOGbuffers = -1;
     125              : int         XLogArchiveTimeout = 0;
     126              : int         XLogArchiveMode = ARCHIVE_MODE_OFF;
     127              : char       *XLogArchiveCommand = NULL;
     128              : bool        EnableHotStandby = false;
     129              : bool        fullPageWrites = true;
     130              : bool        wal_log_hints = false;
     131              : int         wal_compression = WAL_COMPRESSION_NONE;
     132              : char       *wal_consistency_checking_string = NULL;
     133              : bool       *wal_consistency_checking = NULL;
     134              : bool        wal_init_zero = true;
     135              : bool        wal_recycle = true;
     136              : bool        log_checkpoints = true;
     137              : int         wal_sync_method = DEFAULT_WAL_SYNC_METHOD;
     138              : int         wal_level = WAL_LEVEL_REPLICA;
     139              : int         CommitDelay = 0;    /* precommit delay in microseconds */
     140              : int         CommitSiblings = 5; /* # concurrent xacts needed to sleep */
     141              : int         wal_retrieve_retry_interval = 5000;
     142              : int         max_slot_wal_keep_size_mb = -1;
     143              : int         wal_decode_buffer_size = 512 * 1024;
     144              : bool        track_wal_io_timing = false;
     145              : 
     146              : #ifdef WAL_DEBUG
     147              : bool        XLOG_DEBUG = false;
     148              : #endif
     149              : 
     150              : int         wal_segment_size = DEFAULT_XLOG_SEG_SIZE;
     151              : 
     152              : /*
     153              :  * Number of WAL insertion locks to use. A higher value allows more insertions
     154              :  * to happen concurrently, but adds some CPU overhead to flushing the WAL,
     155              :  * which needs to iterate all the locks.
     156              :  */
     157              : #define NUM_XLOGINSERT_LOCKS  8
     158              : 
     159              : /*
     160              :  * Max distance from last checkpoint, before triggering a new xlog-based
     161              :  * checkpoint.
     162              :  */
     163              : int         CheckPointSegments;
     164              : 
     165              : /* Estimated distance between checkpoints, in bytes */
     166              : static double CheckPointDistanceEstimate = 0;
     167              : static double PrevCheckPointDistance = 0;
     168              : 
     169              : /*
     170              :  * Track whether there were any deferred checks for custom resource managers
     171              :  * specified in wal_consistency_checking.
     172              :  */
     173              : static bool check_wal_consistency_checking_deferred = false;
     174              : 
     175              : /*
     176              :  * GUC support
     177              :  */
     178              : const struct config_enum_entry wal_sync_method_options[] = {
     179              :     {"fsync", WAL_SYNC_METHOD_FSYNC, false},
     180              : #ifdef HAVE_FSYNC_WRITETHROUGH
     181              :     {"fsync_writethrough", WAL_SYNC_METHOD_FSYNC_WRITETHROUGH, false},
     182              : #endif                          /* HAVE_FSYNC_WRITETHROUGH */
     183              :     {"fdatasync", WAL_SYNC_METHOD_FDATASYNC, false},
     184              : #ifdef O_SYNC                   /* entry exists only if the platform defines O_SYNC */
     185              :     {"open_sync", WAL_SYNC_METHOD_OPEN, false},
     186              : #endif                          /* O_SYNC */
     187              : #ifdef O_DSYNC                  /* entry exists only if the platform defines O_DSYNC */
     188              :     {"open_datasync", WAL_SYNC_METHOD_OPEN_DSYNC, false},
     189              : #endif                          /* O_DSYNC */
     190              :     {NULL, 0, false}
     191              : };
     192              : 
     193              : 
     194              : /*
     195              :  * Although only "on", "off", and "always" are documented,
     196              :  * we accept all the likely variants of "on" and "off".
     197              :  */
     198              : const struct config_enum_entry archive_mode_options[] = {
     199              :     {"always", ARCHIVE_MODE_ALWAYS, false}, /* documented spelling */
     200              :     {"on", ARCHIVE_MODE_ON, false}, /* documented spelling */
     201              :     {"off", ARCHIVE_MODE_OFF, false},   /* documented spelling */
     202              :     {"true", ARCHIVE_MODE_ON, true},    /* accepted variant of "on" */
     203              :     {"false", ARCHIVE_MODE_OFF, true},  /* accepted variant of "off" */
     204              :     {"yes", ARCHIVE_MODE_ON, true}, /* accepted variant of "on" */
     205              :     {"no", ARCHIVE_MODE_OFF, true}, /* accepted variant of "off" */
     206              :     {"1", ARCHIVE_MODE_ON, true},   /* accepted variant of "on" */
     207              :     {"0", ARCHIVE_MODE_OFF, true},  /* accepted variant of "off" */
     208              :     {NULL, 0, false}
     209              : };
     210              : 
     211              : /*
     212              :  * Statistics for current checkpoint are collected in this global struct.
     213              :  * Because only the checkpointer or a stand-alone backend can perform
     214              :  * checkpoints, this will be unused in normal backends.
     215              :  */
     216              : CheckpointStatsData CheckpointStats;
     217              : 
     218              : /*
     219              :  * During recovery, lastFullPageWrites keeps track of full_page_writes that
     220              :  * the replayed WAL records indicate. It's initialized with full_page_writes
     221              :  * that the recovery starting checkpoint record indicates, and then updated
     222              :  * each time XLOG_FPW_CHANGE record is replayed.
     223              :  */
     224              : static bool lastFullPageWrites;
     225              : 
     226              : /*
     227              :  * Local copy of the state tracked by SharedRecoveryState in shared memory.
     228              :  * It is false if SharedRecoveryState is RECOVERY_STATE_DONE.  True actually
     229              :  * means "not known, need to check the shared state".
     230              :  */
     231              : static bool LocalRecoveryInProgress = true;
     232              : 
     233              : /*
     234              :  * Local state for XLogInsertAllowed():
     235              :  *      1: unconditionally allowed to insert XLOG
     236              :  *      0: unconditionally not allowed to insert XLOG
     237              :  *      -1: must check RecoveryInProgress(); disallow until it is false
     238              :  * Most processes start with -1 and transition to 1 after seeing that recovery
     239              :  * is not in progress.  But we can also force the value for special cases.
     240              :  * The coding in XLogInsertAllowed() depends on the first two of these states
     241              :  * being numerically the same as bool true and false.
     242              :  */
     243              : static int  LocalXLogInsertAllowed = -1;
     244              : 
     245              : /*
     246              :  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
     247              :  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
     248              :  * end+1 of the last record, and is reset when we end a top-level transaction,
     249              :  * or start a new one; so it can be used to tell if the current transaction has
     250              :  * created any XLOG records.
     251              :  *
     252              :  * While in parallel mode, this may not be fully up to date.  When committing,
     253              :  * a transaction can assume this covers all xlog records written either by the
     254              :  * user backend or by any parallel worker which was present at any point during
     255              :  * the transaction.  But when aborting, or when still in parallel mode, other
     256              :  * parallel backends may have written WAL records at later LSNs than the value
     257              :  * stored here.  The parallel leader advances its own copy, when necessary,
     258              :  * in WaitForParallelWorkersToFinish.
     259              :  */
     260              : XLogRecPtr  ProcLastRecPtr = InvalidXLogRecPtr;
     261              : XLogRecPtr  XactLastRecEnd = InvalidXLogRecPtr;
     262              : XLogRecPtr  XactLastCommitEnd = InvalidXLogRecPtr;
     263              : 
     264              : /*
     265              :  * RedoRecPtr is this backend's local copy of the REDO record pointer
     266              :  * (which is almost but not quite the same as a pointer to the most recent
     267              :  * CHECKPOINT record).  We update this from the shared-memory copy,
     268              :  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
     269              :  * hold an insertion lock).  See XLogInsertRecord for details.  We are also
     270              :  * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
     271              :  * see GetRedoRecPtr.
     272              :  *
     273              :  * NB: Code that uses this variable must be prepared not only for the
     274              :  * possibility that it may be arbitrarily out of date, but also for the
     275              :  * possibility that it might be set to InvalidXLogRecPtr. We used to
     276              :  * initialize it as a side effect of the first call to RecoveryInProgress(),
     277              :  * which meant that most code that might use it could assume that it had a
     278              :  * real if perhaps stale value. That's no longer the case.
     279              :  */
     280              : static XLogRecPtr RedoRecPtr;
     281              : 
     282              : /*
     283              :  * doPageWrites is this backend's local copy of (fullPageWrites ||
     284              :  * runningBackups > 0).  It is used together with RedoRecPtr to decide whether
     285              :  * a full-page image of a page needs to be taken.
     286              :  *
     287              :  * NB: Initially this is false, and there's no guarantee that it will be
     288              :  * initialized to any other value before it is first used. Any code that
     289              :  * makes use of it must recheck the value after obtaining a WALInsertLock,
     290              :  * and respond appropriately if it turns out that the previous value wasn't
     291              :  * accurate.
     292              :  */
     293              : static bool doPageWrites;
     294              : 
     295              : /*----------
     296              :  * Shared-memory data structures for XLOG control
     297              :  *
     298              :  * LogwrtRqst indicates a byte position that we need to write and/or fsync
     299              :  * the log up to (all records before that point must be written or fsynced).
     300              :  * The positions already written/fsynced are maintained in logWriteResult
     301              :  * and logFlushResult using atomic access.
     302              :  * In addition to the shared variable, each backend has a private copy of
     303              :  * both in LogwrtResult, which is updated when convenient.
     304              :  *
     305              :  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
     306              :  * (protected by info_lck), but we don't need to cache any copies of it.
     307              :  *
     308              :  * info_lck is only held long enough to read/update the protected variables,
     309              :  * so it's a plain spinlock.  The other locks are held longer (potentially
     310              :  * over I/O operations), so we use LWLocks for them.  These locks are:
     311              :  *
     312              :  * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
     313              :  * It is only held while initializing and changing the mapping.  If the
     314              :  * contents of the buffer being replaced haven't been written yet, the mapping
     315              :  * lock is released while the write is done, and reacquired afterwards.
     316              :  *
     317              :  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
     318              :  * XLogFlush).
     319              :  *
     320              :  * ControlFileLock: must be held to read/update control file or create
     321              :  * new log file.
     322              :  *
     323              :  *----------
     324              :  */
     325              : 
     326              : typedef struct XLogwrtRqst
     327              : {
     328              :     XLogRecPtr  Write;          /* last byte + 1 to write out */
     329              :     XLogRecPtr  Flush;          /* last byte + 1 to flush */
     330              : } XLogwrtRqst;
     331              : 
     332              : typedef struct XLogwrtResult
     333              : {
     334              :     XLogRecPtr  Write;          /* last byte + 1 written out */
     335              :     XLogRecPtr  Flush;          /* last byte + 1 flushed */
     336              : } XLogwrtResult;
     337              : 
     338              : /*
     339              :  * Inserting to WAL is protected by a small fixed number of WAL insertion
     340              :  * locks. To insert to the WAL, you must hold one of the locks - it doesn't
     341              :  * matter which one. To lock out other concurrent insertions, you must hold
     342              :  * all of them. Each WAL insertion lock consists of a lightweight lock, plus an
     343              :  * indicator of how far the insertion has progressed (insertingAt).
     344              :  *
     345              :  * The insertingAt values are read when a process wants to flush WAL from
     346              :  * the in-memory buffers to disk, to check that all the insertions to the
     347              :  * region the process is about to write out have finished. You could simply
     348              :  * wait for all currently in-progress insertions to finish, but the
     349              :  * insertingAt indicator allows you to ignore insertions to later in the WAL,
     350              :  * so that you only wait for the insertions that are modifying the buffers
     351              :  * you're about to write out.
     352              :  *
     353              :  * This isn't just an optimization. If all the WAL buffers are dirty, an
     354              :  * inserter that's holding a WAL insert lock might need to evict an old WAL
     355              :  * buffer, which requires flushing the WAL. If it's possible for an inserter
     356              :  * to block on another inserter unnecessarily, deadlock can arise when two
     357              :  * inserters holding a WAL insert lock wait for each other to finish their
     358              :  * insertion.
     359              :  *
     360              :  * Small WAL records that don't cross a page boundary never update the value,
     361              :  * the WAL record is just copied to the page and the lock is released. But
     362              :  * to avoid the deadlock-scenario explained above, the indicator is always
     363              :  * updated before sleeping while holding an insertion lock.
     364              :  *
     365              :  * lastImportantAt contains the LSN of the last important WAL record inserted
     366              :  * using a given lock. This value is used to detect if there has been
     367              :  * important WAL activity since the last time some action, like a checkpoint,
     368              :  * was performed - so that the action can be skipped if not. The LSN is
     369              :  * updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
     370              :  * set. lastImportantAt is never cleared, only overwritten by the LSN of newer
     371              :  * records.  Tracking the WAL activity directly in WALInsertLock has the
     372              :  * advantage of not needing any additional locks to update the value.
     373              :  */
     374              : typedef struct
     375              : {
     376              :     LWLock      lock;           /* the lightweight lock itself */
     377              :     pg_atomic_uint64 insertingAt;   /* how far the insertion has progressed */
     378              :     XLogRecPtr  lastImportantAt;    /* LSN of last important record inserted using this lock */
     379              : } WALInsertLock;
     380              : 
     381              : /*
     382              :  * All the WAL insertion locks are allocated as an array in shared memory. We
     383              :  * force the array stride to be a power of 2, which saves a few cycles in
     384              :  * indexing, but more importantly also ensures that individual slots don't
     385              :  * cross cache line boundaries. (Of course, we have to also ensure that the
     386              :  * array start address is suitably aligned.)
     387              :  */
     388              : typedef union WALInsertLockPadded
     389              : {
     390              :     WALInsertLock l;            /* the actual insertion lock */
     391              :     char        pad[PG_CACHE_LINE_SIZE];    /* makes the union at least PG_CACHE_LINE_SIZE bytes */
     392              : } WALInsertLockPadded;
     393              : 
     394              : /*
     395              :  * Session status of running backup, used for sanity checks in SQL-callable
     396              :  * functions to start and stop backups.
     397              :  */
     398              : static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
     399              : 
     400              : /*
     401              :  * Shared state data for WAL insertion.
     402              :  */
     403              : typedef struct XLogCtlInsert
     404              : {
     405              :     slock_t     insertpos_lck;  /* protects CurrBytePos and PrevBytePos */
     406              : 
     407              :     /*
     408              :      * CurrBytePos is the end of reserved WAL. The next record will be
     409              :      * inserted at that position. PrevBytePos is the start position of the
     410              :      * previously inserted (or rather, reserved) record - it is copied to the
     411              :      * prev-link of the next record. These are stored as "usable byte
     412              :      * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
     413              :      */
     414              :     uint64      CurrBytePos;
     415              :     uint64      PrevBytePos;
     416              : 
     417              :     /*
     418              :      * Make sure the above heavily-contended spinlock and byte positions are
     419              :      * on their own cache line. In particular, the RedoRecPtr and full page
     420              :      * write variables below should be on a different cache line. They are
     421              :      * read on every WAL insertion, but updated rarely, and we don't want
     422              :      * those reads to steal the cache line containing Curr/PrevBytePos.
     423              :      */
     424              :     char        pad[PG_CACHE_LINE_SIZE];
     425              : 
     426              :     /*
     427              :      * fullPageWrites is the authoritative value used by all backends to
     428              :      * determine whether to write full-page image to WAL. This shared value,
     429              :      * instead of the process-local fullPageWrites, is required because, when
     430              :      * full_page_writes is changed by SIGHUP, we must WAL-log it before it
     431              :      * actually affects WAL-logging by backends.  The checkpointer sets it at
     432              :      * startup or after SIGHUP.
     433              :      *
     434              :      * To read these fields, you must hold an insertion lock. To modify them,
     435              :      * you must hold ALL the locks.
     436              :      */
     437              :     XLogRecPtr  RedoRecPtr;     /* current redo point for insertions */
     438              :     bool        fullPageWrites;
     439              : 
     440              :     /*
     441              :      * runningBackups is a counter indicating the number of backups currently
     442              :      * in progress. lastBackupStart is the latest checkpoint redo location
     443              :      * used as a starting point for an online backup.
     444              :      */
     445              :     int         runningBackups;
     446              :     XLogRecPtr  lastBackupStart;
     447              : 
     448              :     /*
     449              :      * WAL insertion locks.
     450              :      */
     451              :     WALInsertLockPadded *WALInsertLocks;
     452              : } XLogCtlInsert;
     453              : 
     454              : /*
     455              :  * Total shared-memory state for XLOG.
     456              :  */
     457              : typedef struct XLogCtlData
     458              : {
     459              :     XLogCtlInsert Insert;
     460              : 
     461              :     /* Protected by info_lck: */
     462              :     XLogwrtRqst LogwrtRqst;
     463              :     XLogRecPtr  RedoRecPtr;     /* a recent copy of Insert->RedoRecPtr */
     464              :     XLogRecPtr  asyncXactLSN;   /* LSN of newest async commit/abort */
     465              :     XLogRecPtr  replicationSlotMinLSN;  /* oldest LSN needed by any slot */
     466              : 
     467              :     XLogSegNo   lastRemovedSegNo;   /* latest removed/recycled XLOG segment */
     468              : 
     469              :     /* Fake LSN counter, for unlogged relations. */
     470              :     pg_atomic_uint64 unloggedLSN;
     471              : 
     472              :     /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
     473              :     pg_time_t   lastSegSwitchTime;
     474              :     XLogRecPtr  lastSegSwitchLSN;
     475              : 
     476              :     /* These are accessed using atomics -- info_lck not needed */
     477              :     pg_atomic_uint64 logInsertResult;   /* last byte + 1 inserted to buffers */
     478              :     pg_atomic_uint64 logWriteResult;    /* last byte + 1 written out */
     479              :     pg_atomic_uint64 logFlushResult;    /* last byte + 1 flushed */
     480              : 
     481              :     /*
     482              :      * Latest initialized page in the cache (last byte position + 1).
     483              :      *
     484              :      * To change the identity of a buffer (and InitializedUpTo), you need to
     485              :      * hold WALBufMappingLock.  To change the identity of a buffer that's
     486              :      * still dirty, the old page needs to be written out first, and for that
     487              :      * you need WALWriteLock, and you need to ensure that there are no
     488              :      * in-progress insertions to the page by calling
     489              :      * WaitXLogInsertionsToFinish().
     490              :      */
     491              :     XLogRecPtr  InitializedUpTo;
     492              : 
     493              :     /*
     494              :      * These values do not change after startup, although the pointed-to pages
     495              :      * and xlblocks values certainly do.  xlblocks values are protected by
     496              :      * WALBufMappingLock.
     497              :      */
     498              :     char       *pages;          /* buffers for unwritten XLOG pages */
     499              :     pg_atomic_uint64 *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ */
     500              :     int         XLogCacheBlck;  /* highest allocated xlog buffer index */
     501              : 
     502              :     /*
     503              :      * InsertTimeLineID is the timeline into which new WAL is being inserted
     504              :      * and flushed. It is zero during recovery, and does not change once set.
     505              :      *
     506              :      * If we create a new timeline when the system was started up,
     507              :      * PrevTimeLineID is the old timeline's ID that we forked off from.
     508              :      * Otherwise it's equal to InsertTimeLineID.
     509              :      *
     510              :      * We set these fields while holding info_lck. Most code that reads these
     511              :      * values knows that recovery is no longer in progress and so can safely
     512              :      * read the value without a lock, but code that could be run either during
     513              :      * or after recovery can take info_lck while reading these values.
     514              :      */
     515              :     TimeLineID  InsertTimeLineID;
     516              :     TimeLineID  PrevTimeLineID;
     517              : 
     518              :     /*
     519              :      * SharedRecoveryState indicates if we're still in crash or archive
     520              :      * recovery.  Protected by info_lck.
     521              :      */
     522              :     RecoveryState SharedRecoveryState;
     523              : 
     524              :     /*
     525              :      * InstallXLogFileSegmentActive indicates whether the checkpointer should
     526              :      * arrange for future segments by recycling and/or PreallocXlogFiles().
     527              :      * Protected by ControlFileLock.  Only the startup process changes it.  If
     528              :      * true, anyone can use InstallXLogFileSegment().  If false, the startup
     529              :      * process owns the exclusive right to install segments, by reading from
     530              :      * the archive and possibly replacing existing files.
     531              :      */
     532              :     bool        InstallXLogFileSegmentActive;
     533              : 
     534              :     /*
     535              :      * WalWriterSleeping indicates whether the WAL writer is currently in
     536              :      * low-power mode (and hence should be nudged if an async commit occurs).
     537              :      * Protected by info_lck.
     538              :      */
     539              :     bool        WalWriterSleeping;
     540              : 
     541              :     /*
     542              :      * During recovery, we keep a copy of the latest checkpoint record here.
     543              :      * lastCheckPointRecPtr points to start of checkpoint record and
     544              :      * lastCheckPointEndPtr points to end+1 of checkpoint record.  Used by the
     545              :      * checkpointer when it wants to create a restartpoint.
     546              :      *
     547              :      * Protected by info_lck.
     548              :      */
     549              :     XLogRecPtr  lastCheckPointRecPtr;
     550              :     XLogRecPtr  lastCheckPointEndPtr;
     551              :     CheckPoint  lastCheckPoint;
     552              : 
     553              :     /*
     554              :      * lastFpwDisableRecPtr points to the start of the last replayed
     555              :      * XLOG_FPW_CHANGE record indicating that full_page_writes is disabled.
     556              :      */
     557              :     XLogRecPtr  lastFpwDisableRecPtr;
     558              : 
     559              :     /* last data_checksum_version we've seen */
     560              :     uint32      data_checksum_version;
     561              : 
     562              :     slock_t     info_lck;       /* locks shared variables shown above */
     563              : } XLogCtlData;
     564              : 
     565              : /*
     566              :  * Classification of XLogInsertRecord operations; selects its locking strategy.
     567              :  */
     568              : typedef enum
     569              : {
     570              :     WALINSERT_NORMAL,           /* any other record: one insertion lock */
     571              :     WALINSERT_SPECIAL_SWITCH,   /* XLOG_SWITCH: all insertion locks */
     572              :     WALINSERT_SPECIAL_CHECKPOINT    /* XLOG_CHECKPOINT_REDO: all insertion locks */
     573              : } WalInsertClass;
     574              : 
     575              : static XLogCtlData *XLogCtl = NULL;
     576              : 
     577              : /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
     578              : static WALInsertLockPadded *WALInsertLocks = NULL;
     579              : 
     580              : /*
     581              :  * We maintain an image of pg_control in shared memory.
     582              :  */
     583              : static ControlFileData *LocalControlFile = NULL;
     584              : static ControlFileData *ControlFile = NULL;
     585              : 
     586              : static void XLOGShmemRequest(void *arg);
     587              : static void XLOGShmemInit(void *arg);
     588              : static void XLOGShmemAttach(void *arg);
     589              : 
     590              : const ShmemCallbacks XLOGShmemCallbacks = {
     591              :     .request_fn = XLOGShmemRequest,
     592              :     .init_fn = XLOGShmemInit,
     593              :     .attach_fn = XLOGShmemAttach,
     594              : };
     595              : 
     596              : /*
     597              :  * Calculate the amount of space left on the page after 'endptr' (0 if 'endptr'
     598              :  * is exactly at a page boundary).  Beware multiple evaluation of 'endptr'!
     599              :  */
     600              : #define INSERT_FREESPACE(endptr)    \
     601              :     (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
     602              : 
     603              : /* Macro to advance to next buffer index, wrapping to 0 after XLogCacheBlck; may evaluate 'idx' twice. */
     604              : #define NextBufIdx(idx)     \
     605              :         (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
     606              : 
     607              : /*
     608              :  * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or would
     609              :  * hold if it were in cache, the page containing 'recptr' (page no. modulo XLogCacheBlck + 1).
     610              :  */
     611              : #define XLogRecPtrToBufIdx(recptr)  \
     612              :     (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
     613              : 
     614              : /*
     615              :  * Number of bytes in a WAL page usable for WAL data (page size minus the short page header).
     616              :  */
     617              : #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
     618              : 
     619              : /*
     620              :  * Convert values of GUCs measured in megabytes to the equivalent segment
     621              :  * count, by delegating to XLogMBVarToSegs().  Rounds down.
     622              :  */
     623              : #define ConvertToXSegs(x, segsize)  XLogMBVarToSegs((x), (segsize))
     624              : 
     625              : /* The number of bytes in a WAL segment usable for WAL data. */
     626              : static int  UsableBytesInSegment;
     627              : 
     628              : /*
     629              :  * Private, possibly out-of-date copy of shared LogwrtResult.
     630              :  * See discussion above.
     631              :  */
     632              : static XLogwrtResult LogwrtResult = {0, 0};
     633              : 
     634              : /*
     635              :  * Update local copy '_target' of shared XLogCtl->log{Write,Flush}Result.
     636              :  *
     637              :  * It's critical that Flush always trails Write, so Flush is read first and
     638              :  * Write second, with a read barrier in between.  See also XLogWrite.
     639              :  */
     640              : #define RefreshXLogWriteResult(_target) \
     641              :     do { \
     642              :         _target.Flush = pg_atomic_read_u64(&XLogCtl->logFlushResult); \
     643              :         pg_read_barrier(); \
     644              :         _target.Write = pg_atomic_read_u64(&XLogCtl->logWriteResult); \
     645              :     } while (0)
     646              : 
     647              : /*
     648              :  * openLogFile is -1 or a kernel FD for an open log file segment.
     649              :  * openLogSegNo identifies the segment, and openLogTLI the corresponding TLI.
     650              :  * These variables are only used to write the XLOG, and so will normally refer
     651              :  * to the active segment.
     652              :  *
     653              :  * Note: call Reserve/ReleaseExternalFD to track consumption of this FD.
     654              :  */
     655              : static int  openLogFile = -1;
     656              : static XLogSegNo openLogSegNo = 0;
     657              : static TimeLineID openLogTLI = 0;
     658              : 
     659              : /*
     660              :  * Local copies of equivalent fields in the control file.  When running
     661              :  * crash recovery, LocalMinRecoveryPoint is set to InvalidXLogRecPtr as we
     662              :  * expect to replay all the WAL available, and updateMinRecoveryPoint is
     663              :  * switched to false to prevent any updates while replaying records.
     664              :  * Those values are kept consistent as long as crash recovery runs.
     665              :  */
     666              : static XLogRecPtr LocalMinRecoveryPoint;
     667              : static TimeLineID LocalMinRecoveryPointTLI;
     668              : static bool updateMinRecoveryPoint = true;
     669              : 
     670              : /*
     671              :  * Local state for ControlFile data_checksum_version.  After initialization
     672              :  * this is only updated when absorbing a procsignal barrier during interrupt
     673              :  * processing.  The reason for keeping a copy in backend-private memory is to
     674              :  * avoid locking for interrogating the data checksum state.  Possible values
     675              :  * are the data checksum versions defined in storage/checksum.h.
     676              :  */
     677              : static ChecksumStateType LocalDataChecksumState = 0;
     678              : 
     679              : /*
     680              :  * Variable backing the GUC, keep it in sync with LocalDataChecksumState.
     681              :  * See SetLocalDataChecksumState().
     682              :  */
     683              : int         data_checksums = 0;
     684              : 
     685              : /* For WALInsertLockAcquire/Release functions */
     686              : static int  MyLockNo = 0;
     687              : static bool holdingAllLocks = false;
     688              : 
     689              : #ifdef WAL_DEBUG
     690              : static MemoryContext walDebugCxt = NULL;
     691              : #endif
     692              : 
     693              : static void CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI,
     694              :                                         XLogRecPtr EndOfLog,
     695              :                                         TimeLineID newTLI);
     696              : static void CheckRequiredParameterValues(void);
     697              : static void XLogReportParameters(void);
     698              : static int  LocalSetXLogInsertAllowed(void);
     699              : static void CreateEndOfRecoveryRecord(void);
     700              : static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn,
     701              :                                                   XLogRecPtr pagePtr,
     702              :                                                   TimeLineID newTLI);
     703              : static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
     704              : static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
     705              : 
     706              : static void AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli,
     707              :                                   bool opportunistic);
     708              : static void XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible);
     709              : static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
     710              :                                    bool find_free, XLogSegNo max_segno,
     711              :                                    TimeLineID tli);
     712              : static void XLogFileClose(void);
     713              : static void PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli);
     714              : static void RemoveTempXlogFiles(void);
     715              : static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr,
     716              :                                XLogRecPtr endptr, TimeLineID insertTLI);
     717              : static void RemoveXlogFile(const struct dirent *segment_de,
     718              :                            XLogSegNo recycleSegNo, XLogSegNo *endlogSegNo,
     719              :                            TimeLineID insertTLI);
     720              : static void UpdateLastRemovedPtr(char *filename);
     721              : static void ValidateXLOGDirectoryStructure(void);
     722              : static void CleanupBackupHistory(void);
     723              : static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
     724              : static bool PerformRecoveryXLogAction(void);
     725              : static void InitControlFile(uint64 sysidentifier, uint32 data_checksum_version);
     726              : static void WriteControlFile(void);
     727              : static void ReadControlFile(void);
     728              : static void UpdateControlFile(void);
     729              : static char *str_time(pg_time_t tnow, char *buf, size_t bufsize);
     730              : 
     731              : static int  get_sync_bit(int method);
     732              : 
     733              : static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
     734              :                                 XLogRecData *rdata,
     735              :                                 XLogRecPtr StartPos, XLogRecPtr EndPos,
     736              :                                 TimeLineID tli);
     737              : static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
     738              :                                       XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
     739              : static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
     740              :                               XLogRecPtr *PrevPtr);
     741              : static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
     742              : static char *GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli);
     743              : static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
     744              : static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
     745              : static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
     746              : 
     747              : static void WALInsertLockAcquire(void);
     748              : static void WALInsertLockAcquireExclusive(void);
     749              : static void WALInsertLockRelease(void);
     750              : static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
     751              : 
     752              : static void XLogChecksums(uint32 new_type);
     753              : 
     754              : /*
     755              :  * Insert an XLOG record represented by an already-constructed chain of data
     756              :  * chunks.  This is a low-level routine; to construct the WAL record header
     757              :  * and data, use the higher-level routines in xloginsert.c.
     758              :  *
     759              :  * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
     760              :  * WAL record applies to, that were not included in the record as full page
     761              :  * images.  If fpw_lsn <= RedoRecPtr, the function does not perform the
     762              :  * insertion and returns InvalidXLogRecPtr.  The caller can then recalculate
     763              :  * which pages need a full-page image, and retry.  If fpw_lsn is invalid, the
     764              :  * record is always inserted.
     765              :  *
     766              :  * 'flags' gives more in-depth control on the record being inserted. See
     767              :  * XLogSetRecordFlags() for details.
     768              :  *
     769              :  * 'topxid_included' tells whether the top-transaction id is logged along with
     770              :  * current subtransaction. See XLogRecordAssemble().
     771              :  *
     772              :  * The first XLogRecData in the chain must be for the record header, and its
     773              :  * data must be MAXALIGNed.  XLogInsertRecord fills in the xl_prev and
     774              :  * xl_crc fields in the header, the rest of the header must already be filled
     775              :  * by the caller.
     776              :  *
     777              :  * Returns XLOG pointer to end of record (beginning of next record).
     778              :  * This can be used as LSN for data pages affected by the logged action.
     779              :  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
     780              :  * before the data page can be written out.  This implements the basic
     781              :  * WAL rule "write the log before the data".)
     782              :  */
     783              : XLogRecPtr
     784     24106330 : XLogInsertRecord(XLogRecData *rdata,
     785              :                  XLogRecPtr fpw_lsn,
     786              :                  uint8 flags,
     787              :                  int num_fpi,
     788              :                  uint64 fpi_bytes,
     789              :                  bool topxid_included)
     790              : {
     791     24106330 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
     792              :     pg_crc32c   rdata_crc;
     793              :     bool        inserted;
     794     24106330 :     XLogRecord *rechdr = (XLogRecord *) rdata->data;
     795     24106330 :     uint8       info = rechdr->xl_info & ~XLR_INFO_MASK;
     796     24106330 :     WalInsertClass class = WALINSERT_NORMAL;
     797              :     XLogRecPtr  StartPos;
     798              :     XLogRecPtr  EndPos;
     799     24106330 :     bool        prevDoPageWrites = doPageWrites;
     800              :     TimeLineID  insertTLI;
     801              : 
     802              :     /* Does this record type require special handling? */
     803     24106330 :     if (unlikely(rechdr->xl_rmid == RM_XLOG_ID))
     804              :     {
     805       437399 :         if (info == XLOG_SWITCH)
     806          814 :             class = WALINSERT_SPECIAL_SWITCH;
     807       436585 :         else if (info == XLOG_CHECKPOINT_REDO)
     808          992 :             class = WALINSERT_SPECIAL_CHECKPOINT;
     809              :     }
     810              : 
     811              :     /* we assume that all of the record header is in the first chunk */
     812              :     Assert(rdata->len >= SizeOfXLogRecord);
     813              : 
     814              :     /* cross-check on whether we should be here or not */
     815     24106330 :     if (!XLogInsertAllowed())
     816            0 :         elog(ERROR, "cannot make new WAL entries during recovery");
     817              : 
     818              :     /*
     819              :      * Given that we're not in recovery, InsertTimeLineID is set and can't
     820              :      * change, so we can read it without a lock.
     821              :      */
     822     24106330 :     insertTLI = XLogCtl->InsertTimeLineID;
     823              : 
     824              :     /*----------
     825              :      *
     826              :      * We have now done all the preparatory work we can without holding a
     827              :      * lock or modifying shared state. From here on, inserting the new WAL
     828              :      * record to the shared WAL buffer cache is a two-step process:
     829              :      *
     830              :      * 1. Reserve the right amount of space from the WAL. The current head of
     831              :      *    reserved space is kept in Insert->CurrBytePos, and is protected by
     832              :      *    insertpos_lck.
     833              :      *
     834              :      * 2. Copy the record to the reserved WAL space. This involves finding the
     835              :      *    correct WAL buffer containing the reserved space, and copying the
     836              :      *    record in place. This can be done concurrently in multiple processes.
     837              :      *
     838              :      * To keep track of which insertions are still in-progress, each concurrent
     839              :      * inserter acquires an insertion lock. In addition to just indicating that
     840              :      * an insertion is in progress, the lock tells others how far the inserter
     841              :      * has progressed. There is a small fixed number of insertion locks,
     842              :      * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
     843              :      * boundary, it updates the value stored in the lock to how far it has
     844              :      * inserted, to allow the previous buffer to be flushed.
     845              :      *
     846              :      * Holding onto an insertion lock also protects RedoRecPtr and
     847              :      * fullPageWrites from changing until the insertion is finished.
     848              :      *
     849              :      * Step 2 can usually be done completely in parallel. If the required WAL
     850              :      * page is not initialized yet, you have to grab WALBufMappingLock to
     851              :      * initialize it, but the WAL writer tries to do that ahead of insertions
     852              :      * to keep that from happening in the critical path.
     853              :      *
     854              :      *----------
     855              :      */
     856     24106330 :     START_CRIT_SECTION();
     857              : 
     858     24106330 :     if (likely(class == WALINSERT_NORMAL))
     859              :     {
     860     24104524 :         WALInsertLockAcquire();
     861              : 
     862              :         /*
     863              :          * Check to see if my copy of RedoRecPtr is out of date. If so, may
     864              :          * have to go back and have the caller recompute everything. This can
     865              :          * only happen just after a checkpoint, so it's better to be slow in
     866              :          * this case and fast otherwise.
     867              :          *
     868              :          * Also check to see if fullPageWrites was just turned on or there's a
     869              :          * running backup (which forces full-page writes); if we weren't
     870              :          * already doing full-page writes then go back and recompute.
     871              :          *
     872              :          * If we aren't doing full-page writes then RedoRecPtr doesn't
     873              :          * actually affect the contents of the XLOG record, so we'll update
     874              :          * our local copy but not force a recomputation.  (If doPageWrites was
     875              :          * just turned off, we could recompute the record without full pages,
     876              :          * but we choose not to bother.)
     877              :          */
     878     24104524 :         if (RedoRecPtr != Insert->RedoRecPtr)
     879              :         {
     880              :             Assert(RedoRecPtr < Insert->RedoRecPtr);
     881         8102 :             RedoRecPtr = Insert->RedoRecPtr;
     882              :         }
     883     24104524 :         doPageWrites = (Insert->fullPageWrites || Insert->runningBackups > 0);
     884              : 
     885     24104524 :         if (doPageWrites &&
     886     21800875 :             (!prevDoPageWrites ||
     887     20268429 :              (XLogRecPtrIsValid(fpw_lsn) && fpw_lsn <= RedoRecPtr)))
     888              :         {
     889              :             /*
     890              :              * Oops, some buffer now needs to be backed up that the caller
     891              :              * didn't back up.  Start over.
     892              :              */
     893         8795 :             WALInsertLockRelease();
     894         8795 :             END_CRIT_SECTION();
     895         8795 :             return InvalidXLogRecPtr;
     896              :         }
     897              : 
     898              :         /*
     899              :          * Reserve space for the record in the WAL. This also sets the xl_prev
     900              :          * pointer.
     901              :          */
     902     24095729 :         ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
     903              :                                   &rechdr->xl_prev);
     904              : 
     905              :         /* Normal records are always inserted. */
     906     24095729 :         inserted = true;
     907              :     }
     908         1806 :     else if (class == WALINSERT_SPECIAL_SWITCH)
     909              :     {
     910              :         /*
     911              :          * In order to insert an XLOG_SWITCH record, we need to hold all of
     912              :          * the WAL insertion locks, not just one, so that no one else can
     913              :          * begin inserting a record until we've figured out how much space
     914              :          * remains in the current WAL segment and claimed all of it.
     915              :          *
     916              :          * Nonetheless, this case is simpler than the normal cases handled
     917              :          * above, which must check for changes in doPageWrites and RedoRecPtr.
     918              :          * Those checks are only needed for records that can contain buffer
     919              :          * references, and an XLOG_SWITCH record never does.
     920              :          */
     921              :         Assert(!XLogRecPtrIsValid(fpw_lsn));
     922          814 :         WALInsertLockAcquireExclusive();
     923          814 :         inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
     924              :     }
     925              :     else
     926              :     {
     927              :         Assert(class == WALINSERT_SPECIAL_CHECKPOINT);
     928              : 
     929              :         /*
     930              :          * We need to update both the local and shared copies of RedoRecPtr,
     931              :          * which means that we need to hold all the WAL insertion locks.
     932              :          * However, there can't be any buffer references, so as above, we need
     933              :          * not check RedoRecPtr before inserting the record; we just need to
     934              :          * update it afterwards.
     935              :          */
     936              :         Assert(!XLogRecPtrIsValid(fpw_lsn));
     937          992 :         WALInsertLockAcquireExclusive();
     938          992 :         ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
     939              :                                   &rechdr->xl_prev);
     940          992 :         RedoRecPtr = Insert->RedoRecPtr = StartPos;
     941          992 :         inserted = true;
     942              :     }
     943              : 
     944     24097535 :     if (inserted)
     945              :     {
     946              :         /*
     947              :          * Now that xl_prev has been filled in, calculate CRC of the record
     948              :          * header.
     949              :          */
     950     24097472 :         rdata_crc = rechdr->xl_crc;
     951     24097472 :         COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
     952     24097472 :         FIN_CRC32C(rdata_crc);
     953     24097472 :         rechdr->xl_crc = rdata_crc;
     954              : 
     955              :         /*
     956              :          * All the record data, including the header, is now ready to be
     957              :          * inserted. Copy the record in the space reserved.
     958              :          */
     959     24097472 :         CopyXLogRecordToWAL(rechdr->xl_tot_len,
     960              :                             class == WALINSERT_SPECIAL_SWITCH, rdata,
     961              :                             StartPos, EndPos, insertTLI);
     962              : 
     963              :         /*
     964              :          * Unless record is flagged as not important, update LSN of last
     965              :          * important record in the current slot. When holding all locks, just
     966              :          * update the first one.
     967              :          */
     968     24097472 :         if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
     969              :         {
     970     23855781 :             int         lockno = holdingAllLocks ? 0 : MyLockNo;
     971              : 
     972     23855781 :             WALInsertLocks[lockno].l.lastImportantAt = StartPos;
     973              :         }
     974              :     }
     975              :     else
     976              :     {
     977              :         /*
     978              :          * This was an xlog-switch record, but the current insert location was
     979              :          * already exactly at the beginning of a segment, so there was no need
     980              :          * to do anything.
     981              :          */
     982              :     }
     983              : 
     984              :     /*
     985              :      * Done! Let others know that we're finished.
     986              :      */
     987     24097535 :     WALInsertLockRelease();
     988              : 
     989     24097535 :     END_CRIT_SECTION();
     990              : 
     991     24097535 :     MarkCurrentTransactionIdLoggedIfAny();
     992              : 
     993              :     /*
     994              :      * Mark the top transaction id as logged (if needed) so that we do not try
     995              :      * to log it again with the next WAL record in the current subtransaction.
     996              :      */
     997     24097535 :     if (topxid_included)
     998          223 :         MarkSubxactTopXidLogged();
     999              : 
    1000              :     /*
    1001              :      * Update shared LogwrtRqst.Write, if we crossed page boundary.
    1002              :      */
    1003     24097535 :     if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
    1004              :     {
    1005      1880850 :         SpinLockAcquire(&XLogCtl->info_lck);
    1006              :         /* advance global request to include new block(s) */
    1007      1880850 :         if (XLogCtl->LogwrtRqst.Write < EndPos)
    1008      1816255 :             XLogCtl->LogwrtRqst.Write = EndPos;
    1009      1880850 :         SpinLockRelease(&XLogCtl->info_lck);
    1010      1880850 :         RefreshXLogWriteResult(LogwrtResult);
    1011              :     }
    1012              : 
    1013              :     /*
    1014              :      * If this was an XLOG_SWITCH record, flush the record and the empty
    1015              :      * padding space that fills the rest of the segment, and perform
    1016              :      * end-of-segment actions (eg, notifying archiver).
    1017              :      */
    1018     24097535 :     if (class == WALINSERT_SPECIAL_SWITCH)
    1019              :     {
    1020              :         TRACE_POSTGRESQL_WAL_SWITCH();
    1021          814 :         XLogFlush(EndPos);
    1022              : 
    1023              :         /*
    1024              :          * Even though we reserved the rest of the segment for us, which is
    1025              :          * reflected in EndPos, we return a pointer to just the end of the
    1026              :          * xlog-switch record.
    1027              :          */
    1028          814 :         if (inserted)
    1029              :         {
    1030          751 :             EndPos = StartPos + SizeOfXLogRecord;
    1031          751 :             if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
    1032              :             {
    1033            0 :                 uint64      offset = XLogSegmentOffset(EndPos, wal_segment_size);
    1034              : 
    1035            0 :                 if (offset == EndPos % XLOG_BLCKSZ)
    1036            0 :                     EndPos += SizeOfXLogLongPHD;
    1037              :                 else
    1038            0 :                     EndPos += SizeOfXLogShortPHD;
    1039              :             }
    1040              :         }
    1041              :     }
    1042              : 
    1043              : #ifdef WAL_DEBUG
    1044              :     if (XLOG_DEBUG)
    1045              :     {
    1046              :         static XLogReaderState *debug_reader = NULL;
    1047              :         XLogRecord *record;
    1048              :         DecodedXLogRecord *decoded;
    1049              :         StringInfoData buf;
    1050              :         StringInfoData recordBuf;
    1051              :         char       *errormsg = NULL;
    1052              :         MemoryContext oldCxt;
    1053              : 
    1054              :         oldCxt = MemoryContextSwitchTo(walDebugCxt);
    1055              : 
    1056              :         initStringInfo(&buf);
    1057              :         appendStringInfo(&buf, "INSERT @ %X/%08X: ", LSN_FORMAT_ARGS(EndPos));
    1058              : 
    1059              :         /*
    1060              :          * We have to piece together the WAL record data from the XLogRecData
    1061              :          * entries, so that we can pass it to the rm_desc function as one
    1062              :          * contiguous chunk.
    1063              :          */
    1064              :         initStringInfo(&recordBuf);
    1065              :         for (; rdata != NULL; rdata = rdata->next)
    1066              :             appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);
    1067              : 
    1068              :         /* We also need temporary space to decode the record. */
    1069              :         record = (XLogRecord *) recordBuf.data;
    1070              :         decoded = (DecodedXLogRecord *)
    1071              :             palloc(DecodeXLogRecordRequiredSpace(record->xl_tot_len));
    1072              : 
    1073              :         if (!debug_reader)
    1074              :             debug_reader = XLogReaderAllocate(wal_segment_size, NULL,
    1075              :                                               XL_ROUTINE(.page_read = NULL,
    1076              :                                                          .segment_open = NULL,
    1077              :                                                          .segment_close = NULL),
    1078              :                                               NULL);
    1079              :         if (!debug_reader)
    1080              :         {
    1081              :             appendStringInfoString(&buf, "error decoding record: out of memory while allocating a WAL reading processor");
    1082              :         }
    1083              :         else if (!DecodeXLogRecord(debug_reader,
    1084              :                                    decoded,
    1085              :                                    record,
    1086              :                                    EndPos,
    1087              :                                    &errormsg))
    1088              :         {
    1089              :             appendStringInfo(&buf, "error decoding record: %s",
    1090              :                              errormsg ? errormsg : "no error message");
    1091              :         }
    1092              :         else
    1093              :         {
    1094              :             appendStringInfoString(&buf, " - ");
    1095              : 
    1096              :             debug_reader->record = decoded;
    1097              :             xlog_outdesc(&buf, debug_reader);
    1098              :             debug_reader->record = NULL;
    1099              :         }
    1100              :         elog(LOG, "%s", buf.data);
    1101              : 
    1102              :         pfree(decoded);
    1103              :         pfree(buf.data);
    1104              :         pfree(recordBuf.data);
    1105              :         MemoryContextSwitchTo(oldCxt);
    1106              :     }
    1107              : #endif
    1108              : 
    1109              :     /*
    1110              :      * Update our global variables
    1111              :      */
    1112     24097535 :     ProcLastRecPtr = StartPos;
    1113     24097535 :     XactLastRecEnd = EndPos;
    1114              : 
    1115              :     /* Report WAL traffic to the instrumentation. */
    1116     24097535 :     if (inserted)
    1117              :     {
    1118     24097472 :         pgWalUsage.wal_bytes += rechdr->xl_tot_len;
    1119     24097472 :         pgWalUsage.wal_records++;
    1120     24097472 :         pgWalUsage.wal_fpi += num_fpi;
    1121     24097472 :         pgWalUsage.wal_fpi_bytes += fpi_bytes;
    1122              : 
    1123              :         /* Required for the flush of pending stats WAL data */
    1124     24097472 :         pgstat_report_fixed = true;
    1125              :     }
    1126              : 
    1127     24097535 :     return EndPos;
    1128              : }
    1129              : 
     1130              : /*
     1131              :  * Reserves space in the WAL for a record of the given size; the size is
     1132              :  * rounded up with MAXALIGN first. *StartPos is set to the beginning of the
     1133              :  * reserved section, *EndPos to its end+1. *PrevPtr is set to the beginning
     1134              :  * of the previous record; it is used to set the xl_prev of this record.
     1135              :  *
     1136              :  * This is the performance-critical part of XLogInsert that must be serialized
     1137              :  * across backends. The rest can happen mostly in parallel. Try to keep this
     1138              :  * section as short as possible, because insertpos_lck can be heavily
     1139              :  * contended on a busy system.
     1140              :  *
     1141              :  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
     1142              :  * where we actually copy the record to the reserved space.
     1143              :  *
     1144              :  * NB: Testing shows that XLogInsertRecord runs faster if this code is inlined;
     1145              :  * however, because there are two call sites, the compiler is reluctant to
     1146              :  * inline. We use pg_attribute_always_inline here to try to convince it.
     1147              :  */
     1148              : static pg_attribute_always_inline void
     1149     24096721 : ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
     1150              :                           XLogRecPtr *PrevPtr)
     1151              : {
     1152     24096721 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
     1153              :     uint64      startbytepos;
     1154              :     uint64      endbytepos;
     1155              :     uint64      prevbytepos;
     1156              : 
     1157     24096721 :     size = MAXALIGN(size);
     1158              : 
     1159              :     /* All (non xlog-switch) records should contain data. */
     1160              :     Assert(size > SizeOfXLogRecord);
     1161              : 
     1162              :     /*
     1163              :      * The duration the spinlock needs to be held is minimized by minimizing
     1164              :      * the calculations that have to be done while holding the lock. The
     1165              :      * current tip of reserved WAL is kept in CurrBytePos, as a byte position
     1166              :      * that only counts "usable" bytes in WAL, that is, it excludes all WAL
     1167              :      * page headers. The mapping between "usable" byte positions and physical
     1168              :      * positions (XLogRecPtrs) can be done outside the locked region, and
     1169              :      * because the usable byte position doesn't include any headers, reserving
     1170              :      * X bytes from WAL is almost as simple as "CurrBytePos += X".
     1171              :      */
     1172     24096721 :     SpinLockAcquire(&Insert->insertpos_lck);
     1173              : 
     1174     24096721 :     startbytepos = Insert->CurrBytePos;
     1175     24096721 :     endbytepos = startbytepos + size;
     1176     24096721 :     prevbytepos = Insert->PrevBytePos;
     1177     24096721 :     Insert->CurrBytePos = endbytepos;
     1178     24096721 :     Insert->PrevBytePos = startbytepos; /* becomes prevbytepos of the next reservation */
     1179              : 
     1180     24096721 :     SpinLockRelease(&Insert->insertpos_lck);
     1181              : 
     1182     24096721 :     *StartPos = XLogBytePosToRecPtr(startbytepos);  /* conversions are done outside the spinlock */
     1183     24096721 :     *EndPos = XLogBytePosToEndRecPtr(endbytepos);
     1184     24096721 :     *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
     1185              : 
     1186              :     /*
     1187              :      * Check that the conversions between "usable byte positions" and
     1188              :      * XLogRecPtrs work consistently in both directions.
     1189              :      */
     1190              :     Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
     1191              :     Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
     1192              :     Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
     1193     24096721 : }
    1194              : 
     1195              : /*
     1196              :  * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
     1197              :  *
     1198              :  * A log-switch record is handled slightly differently. The rest of the
     1199              :  * segment will be reserved for this insertion, as indicated by the returned
     1200              :  * *EndPos value, and the function returns true. However, if we are already
     1201              :  * at the beginning of the current segment, *StartPos and *EndPos are set to
     1202              :  * the current location without reserving any space, *PrevPtr is not set, and the function returns false.
     1203              :  */
     1204              : static bool
     1205          814 : ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
     1206              : {
     1207          814 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
     1208              :     uint64      startbytepos;
     1209              :     uint64      endbytepos;
     1210              :     uint64      prevbytepos;
     1211          814 :     uint32      size = MAXALIGN(SizeOfXLogRecord);
     1212              :     XLogRecPtr  ptr;
     1213              :     uint32      segleft;
     1214              : 
     1215              :     /*
     1216              :      * These calculations are a bit heavy-weight to be done while holding a
     1217              :      * spinlock, but since we're holding all the WAL insertion locks, there
     1218              :      * are no other inserters competing for it. GetXLogInsertRecPtr() does
     1219              :      * compete for it, but that's not called very frequently.
     1220              :      */
     1221          814 :     SpinLockAcquire(&Insert->insertpos_lck);
     1222              : 
     1223          814 :     startbytepos = Insert->CurrBytePos;
     1224              : 
     1225          814 :     ptr = XLogBytePosToEndRecPtr(startbytepos);
     1226          814 :     if (XLogSegmentOffset(ptr, wal_segment_size) == 0)
     1227              :     {
     1228           63 :         SpinLockRelease(&Insert->insertpos_lck);
     1229           63 :         *EndPos = *StartPos = ptr;
     1230           63 :         return false;   /* nothing reserved; CurrBytePos/PrevBytePos and *PrevPtr are left untouched */
     1231              :     }
     1232              : 
     1233          751 :     endbytepos = startbytepos + size;
     1234          751 :     prevbytepos = Insert->PrevBytePos;
     1235              : 
     1236          751 :     *StartPos = XLogBytePosToRecPtr(startbytepos);
     1237          751 :     *EndPos = XLogBytePosToEndRecPtr(endbytepos);
     1238              : 
     1239          751 :     segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size);
     1240          751 :     if (segleft != wal_segment_size)    /* i.e. *EndPos is not already on a segment boundary */
     1241              :     {
     1242              :         /* consume the rest of the segment */
     1243          751 :         *EndPos += segleft;
     1244          751 :         endbytepos = XLogRecPtrToBytePos(*EndPos);
     1245              :     }
     1246          751 :     Insert->CurrBytePos = endbytepos;
     1247          751 :     Insert->PrevBytePos = startbytepos;
     1248              : 
     1249          751 :     SpinLockRelease(&Insert->insertpos_lck);
     1250              : 
     1251          751 :     *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
     1252              : 
     1253              :     Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0);
     1254              :     Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
     1255              :     Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
     1256              :     Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
     1257              : 
     1258          751 :     return true;
     1259              : }
    1260              : 
     1261              : /*
     1262              :  * Subroutine of XLogInsertRecord.  Copies a WAL record (the rdata chain,
     1263              :  * write_len bytes in total) to the already-reserved area StartPos..EndPos in the WAL, and PANICs if the copy does not end at EndPos.
     1264              :  */
     1265              : static void
     1266     24097472 : CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
     1267              :                     XLogRecPtr StartPos, XLogRecPtr EndPos, TimeLineID tli)
     1268              : {
     1269              :     char       *currpos;
     1270              :     int         freespace;
     1271              :     int         written;
     1272              :     XLogRecPtr  CurrPos;
     1273              :     XLogPageHeader pagehdr;
     1274              : 
     1275              :     /*
     1276              :      * Get a pointer to the right place in the right WAL buffer to start
     1277              :      * inserting to.
     1278              :      */
     1279     24097472 :     CurrPos = StartPos;
     1280     24097472 :     currpos = GetXLogBuffer(CurrPos, tli);
     1281     24097472 :     freespace = INSERT_FREESPACE(CurrPos);
     1282              : 
     1283              :     /*
     1284              :      * There should be enough space for at least the first field (xl_tot_len)
     1285              :      * on this page.
     1286              :      */
     1287              :     Assert(freespace >= sizeof(uint32));
     1288              : 
     1289              :     /* Copy record data */
     1290     24097472 :     written = 0;
     1291    109619608 :     while (rdata != NULL)
     1292              :     {
     1293     85522136 :         const char *rdata_data = rdata->data;
     1294     85522136 :         int         rdata_len = rdata->len;
     1295              : 
     1296     87524556 :         while (rdata_len > freespace)
     1297              :         {
     1298              :             /*
     1299              :              * Write what fits on this page, and continue on the next page.
     1300              :              */
     1301              :             Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
     1302      2002420 :             memcpy(currpos, rdata_data, freespace);
     1303      2002420 :             rdata_data += freespace;
     1304      2002420 :             rdata_len -= freespace;
     1305      2002420 :             written += freespace;
     1306      2002420 :             CurrPos += freespace;
     1307              : 
     1308              :             /*
     1309              :              * Get pointer to beginning of next page, and set the xlp_rem_len
     1310              :              * in the page header. Set XLP_FIRST_IS_CONTRECORD.
     1311              :              *
     1312              :              * It's safe to set the contrecord flag and xlp_rem_len without a
     1313              :              * lock on the page. All the other flags were already set when the
     1314              :              * page was initialized, in AdvanceXLInsertBuffer, and we're the
     1315              :              * only backend that needs to set the contrecord flag.
     1316              :              */
     1317      2002420 :             currpos = GetXLogBuffer(CurrPos, tli);
     1318      2002420 :             pagehdr = (XLogPageHeader) currpos;
     1319      2002420 :             pagehdr->xlp_rem_len = write_len - written; /* bytes of this record not yet copied */
     1320      2002420 :             pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
     1321              : 
     1322              :             /* skip over the page header */
     1323      2002420 :             if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
     1324              :             {
     1325         1280 :                 CurrPos += SizeOfXLogLongPHD;
     1326         1280 :                 currpos += SizeOfXLogLongPHD;
     1327              :             }
     1328              :             else
     1329              :             {
     1330      2001140 :                 CurrPos += SizeOfXLogShortPHD;
     1331      2001140 :                 currpos += SizeOfXLogShortPHD;
     1332              :             }
     1333      2002420 :             freespace = INSERT_FREESPACE(CurrPos);
     1334              :         }
     1335              : 
     1336              :         Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
     1337     85522136 :         memcpy(currpos, rdata_data, rdata_len);
     1338     85522136 :         currpos += rdata_len;
     1339     85522136 :         CurrPos += rdata_len;
     1340     85522136 :         freespace -= rdata_len;
     1341     85522136 :         written += rdata_len;
     1342              : 
     1343     85522136 :         rdata = rdata->next;
     1344              :     }
     1345              :     Assert(written == write_len);
     1346              : 
     1347              :     /*
     1348              :      * If this was an xlog-switch, it's not enough to write the switch record,
     1349              :      * we also have to consume all the remaining space in the WAL segment.  We
     1350              :      * have already reserved that space, but we need to actually fill it.
     1351              :      */
     1352     24097472 :     if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0)
     1353              :     {
     1354              :         /* An xlog-switch record doesn't contain any data besides the header */
     1355              :         Assert(write_len == SizeOfXLogRecord);
     1356              : 
     1357              :         /* Assert that we did reserve the right amount of space */
     1358              :         Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0);
     1359              : 
     1360              :         /* Use up all the remaining space on the current page */
     1361          751 :         CurrPos += freespace;
     1362              : 
     1363              :         /*
     1364              :          * Cause all remaining pages in the segment to be flushed, leaving the
     1365              :          * XLog position where it should be, at the start of the next segment.
     1366              :          * We do this one page at a time, to make sure we don't deadlock
     1367              :          * against ourselves if wal_buffers < wal_segment_size.
     1368              :          */
     1369       747586 :         while (CurrPos < EndPos)
     1370              :         {
     1371              :             /*
     1372              :              * The minimal action to flush the page would be to call
     1373              :              * WALInsertLockUpdateInsertingAt(CurrPos) followed by
     1374              :              * AdvanceXLInsertBuffer(...).  The page would be left initialized
     1375              :              * mostly to zeros, except for the page header (always the short
     1376              :              * variant, as this is never a segment's first page).
     1377              :              *
     1378              :              * The large vistas of zeros are good for compressibility, but the
     1379              :              * headers interrupting them every XLOG_BLCKSZ (with values that
     1380              :              * differ from page to page) are not.  The effect varies with
     1381              :              * compression tool, but bzip2 for instance compresses about an
     1382              :              * order of magnitude worse if those headers are left in place.
     1383              :              *
     1384              :              * Rather than complicating AdvanceXLInsertBuffer itself (which is
     1385              :              * called in heavily-loaded circumstances as well as this lightly-
     1386              :              * loaded one) with variant behavior, we just use GetXLogBuffer
     1387              :              * (which itself calls the two methods we need) to get the pointer
     1388              :              * and zero most of the page.  Then we just zero the page header.
     1389              :              */
     1390       746835 :             currpos = GetXLogBuffer(CurrPos, tli);
     1391      2987340 :             MemSet(currpos, 0, SizeOfXLogShortPHD);
     1392              : 
     1393       746835 :             CurrPos += XLOG_BLCKSZ;
     1394              :         }
     1395              :     }
     1396              :     else
     1397              :     {
     1398              :         /* Align the end position, so that the next record starts aligned */
     1399     24096721 :         CurrPos = MAXALIGN64(CurrPos);
     1400              :     }
     1401              : 
     1402     24097472 :     if (CurrPos != EndPos)
     1403            0 :         ereport(PANIC,
     1404              :                 errcode(ERRCODE_DATA_CORRUPTED),
     1405              :                 errmsg_internal("space reserved for WAL record does not match what was written"));
     1406     24097472 : }
    1407              : 
     1408              : /*
     1409              :  * Acquire a WAL insertion lock, for inserting to WAL.  The index of the lock taken is recorded in MyLockNo.
     1410              :  */
     1411              : static void
     1412     24105526 : WALInsertLockAcquire(void)
     1413              : {
     1414              :     bool        immed;
     1415              : 
     1416              :     /*
     1417              :      * It doesn't matter which of the WAL insertion locks we acquire, so try
     1418              :      * the one we used last time.  If the system isn't particularly busy, it's
     1419              :      * a good bet that it's still available, and it's good to have some
     1420              :      * affinity to a particular lock so that you don't unnecessarily bounce
     1421              :      * cache lines between processes when there's no contention.
     1422              :      *
     1423              :      * If this is the first time through in this backend, pick a lock
     1424              :      * (semi-)randomly.  This allows the locks to be used evenly if you have a
     1425              :      * lot of very short connections.
     1426              :      */
     1427              :     static int  lockToTry = -1;
     1428              : 
     1429     24105526 :     if (lockToTry == -1)
     1430         9259 :         lockToTry = MyProcNumber % NUM_XLOGINSERT_LOCKS;
     1431     24105526 :     MyLockNo = lockToTry;
     1432              : 
     1433              :     /*
     1434              :      * The insertingAt value is initially set to 0, as we don't know our
     1435              :      * insert location yet.
     1436              :      */
     1437     24105526 :     immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
     1438     24105526 :     if (!immed)
     1439              :     {
     1440              :         /*
     1441              :          * If we couldn't get the lock immediately, try another lock next
     1442              :          * time.  On a system with more insertion locks than concurrent
     1443              :          * inserters, this causes all the inserters to eventually migrate to a
     1444              :          * lock that no-one else is using.  On a system with more inserters
     1445              :          * than locks, it still helps to distribute the inserters evenly
     1446              :          * across the locks.
     1447              :          */
     1448        20378 :         lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
     1449              :     }
     1450     24105526 : }
    1451              : 
     1452              : /*
     1453              :  * Acquire all WAL insertion locks, to prevent other backends from inserting
     1454              :  * to WAL.  Sets holdingAllLocks so that later calls know all locks are held.
     1455              :  */
     1456              : static void
     1457         4747 : WALInsertLockAcquireExclusive(void)
     1458              : {
     1459              :     int         i;
     1460              : 
     1461              :     /*
     1462              :      * When holding all the locks, the insertingAt indicator of every lock
     1463              :      * except the last is set to 0xFFFFFFFFFFFFFFFF, which is higher than any
     1464              :      * real XLogRecPtr value, to make sure that no-one blocks waiting on those.
     1465              :      */
     1466        37976 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
     1467              :     {
     1468        33229 :         LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
     1469        33229 :         LWLockUpdateVar(&WALInsertLocks[i].l.lock,
     1470        33229 :                         &WALInsertLocks[i].l.insertingAt,
     1471              :                         PG_UINT64_MAX);
     1472              :     }
     1473              :     /* The last lock's insertingAt is not updated here; all are reset to 0 at release */
     1474         4747 :     LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
     1475              : 
     1476         4747 :     holdingAllLocks = true;
     1477         4747 : }
    1478              : 
     1479              : /*
     1480              :  * Release our insertion lock (or all of them, clearing holdingAllLocks, if we're holding them all).
     1481              :  *
     1482              :  * NB: Reset all insertingAt variables to 0, so they cause LWLockWaitForVar to
     1483              :  * block the next time the lock is acquired.
     1484              :  */
     1485              : static void
     1486     24110273 : WALInsertLockRelease(void)
     1487              : {
     1488     24110273 :     if (holdingAllLocks)
     1489              :     {
     1490              :         int         i;
     1491              : 
     1492        42723 :         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
     1493        37976 :             LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
     1494        37976 :                                   &WALInsertLocks[i].l.insertingAt,
     1495              :                                   0);
     1496              : 
     1497         4747 :         holdingAllLocks = false;
     1498              :     }
     1499              :     else
     1500              :     {
     1501     24105526 :         LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
     1502     24105526 :                               &WALInsertLocks[MyLockNo].l.insertingAt,
     1503              :                               0);
     1504              :     }
     1505     24110273 : }
    1506              : 
     1507              : /*
     1508              :  * Update our insertingAt value, to let others know that we've finished
     1509              :  * inserting up to that point.  If we hold all the locks, only the last lock's value is updated.
     1510              :  */
     1511              : static void
     1512      2703844 : WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
     1513              : {
     1514      2703844 :     if (holdingAllLocks)
     1515              :     {
     1516              :         /*
     1517              :          * We use the last lock to mark our actual position, see comments in
     1518              :          * WALInsertLockAcquireExclusive.
     1519              :          */
     1520       744573 :         LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
     1521       744573 :                         &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
     1522              :                         insertingAt);
     1523              :     }
     1524              :     else
     1525      1959271 :         LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
     1526      1959271 :                         &WALInsertLocks[MyLockNo].l.insertingAt,
     1527              :                         insertingAt);
     1528      2703844 : }
    1529              : 
    1530              : /*
    1531              :  * Wait for any WAL insertions < upto to finish.
    1532              :  *
    1533              :  * Returns the location of the oldest insertion that is still in-progress.
    1534              :  * Any WAL prior to that point has been fully copied into WAL buffers, and
    1535              :  * can be flushed out to disk. Because this waits for any insertions older
     1536              :  * than 'upto' to finish, the return value is normally >= 'upto' (it can be smaller only if 'upto' is beyond the end of reserved WAL).
    1537              :  *
    1538              :  * Note: When you are about to write out WAL, you must call this function
    1539              :  * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
    1540              :  * need to wait for an insertion to finish (or at least advance to next
    1541              :  * uninitialized page), and the inserter might need to evict an old WAL buffer
    1542              :  * to make room for a new one, which in turn requires WALWriteLock.
    1543              :  */
    1544              : static XLogRecPtr
    1545      2507031 : WaitXLogInsertionsToFinish(XLogRecPtr upto)
    1546              : {
    1547              :     uint64      bytepos;
    1548              :     XLogRecPtr  inserted;
    1549              :     XLogRecPtr  reservedUpto;
    1550              :     XLogRecPtr  finishedUpto;
    1551      2507031 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    1552              :     int         i;
    1553              : 
    1554      2507031 :     if (MyProc == NULL)
    1555            0 :         elog(PANIC, "cannot wait without a PGPROC structure");
    1556              : 
    1557              :     /*
    1558              :      * Check if there's any work to do.  Use a barrier to ensure we get the
    1559              :      * freshest value.
    1560              :      */
    1561      2507031 :     inserted = pg_atomic_read_membarrier_u64(&XLogCtl->logInsertResult);
    1562      2507031 :     if (upto <= inserted)
    1563      1995798 :         return inserted;
    1564              : 
    1565              :     /* Read the current insert position */
    1566       511233 :     SpinLockAcquire(&Insert->insertpos_lck);
    1567       511233 :     bytepos = Insert->CurrBytePos;
    1568       511233 :     SpinLockRelease(&Insert->insertpos_lck);
    1569       511233 :     reservedUpto = XLogBytePosToEndRecPtr(bytepos);
    1570              : 
    1571              :     /*
    1572              :      * No-one should request to flush a piece of WAL that hasn't even been
    1573              :      * reserved yet. However, it can happen if there is a block with a bogus
    1574              :      * LSN on disk, for example. XLogFlush checks for that situation and
    1575              :      * complains, but only after the flush. Here we just assume that to mean
    1576              :      * that all WAL that has been reserved needs to be finished. In this
    1577              :      * corner-case, the return value can be smaller than 'upto' argument.
    1578              :      */
    1579       511233 :     if (upto > reservedUpto)
    1580              :     {
    1581            0 :         ereport(LOG,
    1582              :                 errmsg("request to flush past end of generated WAL; request %X/%08X, current position %X/%08X",
    1583              :                        LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto)));
    1584            0 :         upto = reservedUpto;
    1585              :     }
    1586              : 
    1587              :     /*
    1588              :      * Loop through all the locks, sleeping on any in-progress insert older
    1589              :      * than 'upto'.
    1590              :      *
    1591              :      * finishedUpto is our return value, indicating the point upto which all
    1592              :      * the WAL insertions have been finished. Initialize it to the head of
    1593              :      * reserved WAL, and as we iterate through the insertion locks, back it
    1594              :      * out for any insertion that's still in progress.
    1595              :      */
    1596       511233 :     finishedUpto = reservedUpto;
    1597      4601097 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    1598              :     {
    1599      4089864 :         XLogRecPtr  insertingat = InvalidXLogRecPtr;
    1600              : 
    1601              :         do
    1602              :         {
    1603              :             /*
    1604              :              * See if this insertion is in progress.  LWLockWaitForVar will
    1605              :              * wait for the lock to be released, or for the 'value' to be set
    1606              :              * by a LWLockUpdateVar call.  When a lock is initially acquired,
    1607              :              * its value is 0 (InvalidXLogRecPtr), which means that we don't
    1608              :              * know where it's inserting yet.  We will have to wait for it. If
    1609              :              * it's a small insertion, the record will most likely fit on the
    1610              :              * same page and the inserter will release the lock without ever
    1611              :              * calling LWLockUpdateVar.  But if it has to sleep, it will
    1612              :              * advertise the insertion point with LWLockUpdateVar before
    1613              :              * sleeping.
    1614              :              *
    1615              :              * In this loop we are only waiting for insertions that started
    1616              :              * before WaitXLogInsertionsToFinish was called.  The lack of
    1617              :              * memory barriers in the loop means that we might see locks as
    1618              :              * "unused" that have since become used.  This is fine because
    1619              :              * they only can be used for later insertions that we would not
    1620              :              * want to wait on anyway.  Not taking a lock to acquire the
    1621              :              * current insertingAt value means that we might see older
    1622              :              * insertingAt values.  This is also fine, because if we read a
    1623              :              * value too old, we will add ourselves to the wait queue, which
    1624              :              * contains atomic operations.
    1625              :              */
    1626      4196823 :             if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
    1627      4196823 :                                  &WALInsertLocks[i].l.insertingAt,
    1628              :                                  insertingat, &insertingat))
    1629              :             {
    1630              :                 /* the lock was free, so no insertion in progress */
    1631      2954379 :                 insertingat = InvalidXLogRecPtr;
    1632      2954379 :                 break;
    1633              :             }
    1634              : 
    1635              :             /*
    1636              :              * This insertion is still in progress. Have to wait, unless the
    1637              :              * inserter has proceeded past 'upto'.
    1638              :              */
    1639      1242444 :         } while (insertingat < upto);
    1640              : 
    1641      4089864 :         if (XLogRecPtrIsValid(insertingat) && insertingat < finishedUpto)
    1642       413016 :             finishedUpto = insertingat;
    1643              :     }
    1644              : 
    1645              :     /*
    1646              :      * Advance the limit we know to have been inserted and return the freshest
    1647              :      * value we know of, which might be beyond what we requested if somebody
    1648              :      * is concurrently doing this with an 'upto' pointer ahead of us.
    1649              :      */
    1650       511233 :     finishedUpto = pg_atomic_monotonic_advance_u64(&XLogCtl->logInsertResult,
    1651              :                                                    finishedUpto);
    1652              : 
    1653       511233 :     return finishedUpto;
    1654              : }
    1655              : 
    1656              : /*
    1657              :  * Get a pointer to the right location in the WAL buffer containing the
    1658              :  * given XLogRecPtr.
    1659              :  *
    1660              :  * If the page is not initialized yet, it is initialized. That might require
    1661              :  * evicting an old dirty buffer from the buffer cache, which means I/O.
    1662              :  *
    1663              :  * The caller must ensure that the page containing the requested location
    1664              :  * isn't evicted yet, and won't be evicted. The way to ensure that is to
    1665              :  * hold onto a WAL insertion lock with the insertingAt position set to
    1666              :  * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
    1667              :  * to evict an old page from the buffer. (This means that once you call
    1668              :  * GetXLogBuffer() with a given 'ptr', you must not access anything before
    1669              :  * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
    1670              :  * later, because older buffers might be recycled already.)
    1671              :  */
    1672              : static char *
    1673     26846737 : GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli)
    1674              : {
    1675              :     int         idx;
    1676              :     XLogRecPtr  endptr;
    1677              :     static uint64 cachedPage = 0;   /* ptr / XLOG_BLCKSZ of the last lookup */
    1678              :     static char *cachedPos = NULL;  /* start of that page in XLogCtl->pages */
    1679              :     XLogRecPtr  expectedEndPtr;
    1680              : 
    1681              :     /*
    1682              :      * Fast path for the common case that we need to access again the same
    1683              :      * page as last time.
    1684              :      */
    1685     26846737 :     if (ptr / XLOG_BLCKSZ == cachedPage)
    1686              :     {
    1687              :         Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
    1688              :         Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
    1689     23639342 :         return cachedPos + ptr % XLOG_BLCKSZ;
    1690              :     }
    1691              : 
    1692              :     /*
    1693              :      * The XLog buffer cache is organized so that a page is always loaded to a
    1694              :      * particular buffer.  That way we can easily calculate the buffer a given
    1695              :      * page must be loaded into, from the XLogRecPtr alone.
    1696              :      */
    1697      3207395 :     idx = XLogRecPtrToBufIdx(ptr);
    1698              : 
    1699              :     /*
    1700              :      * See what page is loaded in the buffer at the moment. It could be the
    1701              :      * page we're looking for, or something older. It can't be anything newer
    1702              :      * - that would imply the page we're looking for has already been written
    1703              :      * out to disk and evicted, and the caller is responsible for making sure
    1704              :      * that doesn't happen.
    1705              :      *
    1706              :      * We don't hold a lock while we read the value. If someone is just about
    1707              :      * to initialize or has just initialized the page, it's possible that we
    1708              :      * get InvalidXLogRecPtr. That's ok, we'll grab the mapping lock (in
    1709              :      * AdvanceXLInsertBuffer) and retry if we see anything other than the page
    1710              :      * we're looking for.
    1711              :      */
    1712      3207395 :     expectedEndPtr = ptr;
    1713      3207395 :     expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
    1714              : 
    1715      3207395 :     endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
    1716      3207395 :     if (expectedEndPtr != endptr)
    1717              :     {
    1718              :         XLogRecPtr  initializedUpto;
    1719              : 
    1720              :         /*
    1721              :          * Before calling AdvanceXLInsertBuffer(), which can block, let others
    1722              :          * know how far we're finished with inserting the record.
    1723              :          *
    1724              :          * NB: If 'ptr' points to just after the page header, advertise a
    1725              :          * position at the beginning of the page rather than 'ptr' itself. If
    1726              :          * there are no other insertions running, someone might try to flush
    1727              :          * up to our advertised location. If we advertised a position after
    1728              :          * the page header, someone might try to flush the page header, even
    1729              :          * though the page might not actually be initialized yet. As the
    1730              :          * first inserter on the page, we are effectively responsible for
    1731              :          * making sure that it's initialized before we let insertingAt move
    1732              :          * past the page header.
    1733              :          */
    1734      2703844 :         if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
    1735        12215 :             XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ)
    1736        12215 :             initializedUpto = ptr - SizeOfXLogShortPHD;
    1737      2691629 :         else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
    1738         1059 :                  XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ)
    1739          626 :             initializedUpto = ptr - SizeOfXLogLongPHD;
    1740              :         else
    1741      2691003 :             initializedUpto = ptr;
    1742              : 
    1743      2703844 :         WALInsertLockUpdateInsertingAt(initializedUpto);
    1744              : 
    1745      2703844 :         AdvanceXLInsertBuffer(ptr, tli, false);
    1746      2703844 :         endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
    1747              : 
    1748      2703844 :         if (expectedEndPtr != endptr)
    1749            0 :             elog(PANIC, "could not find WAL buffer for %X/%08X",
    1750              :                  LSN_FORMAT_ARGS(ptr));
    1751              :     }
    1752              :     else
    1753              :     {
    1754              :         /*
    1755              :          * Make sure the initialization of the page is visible to us, and
    1756              :          * won't arrive later to overwrite the WAL data we write on the page.
    1757              :          */
    1758       503551 :         pg_memory_barrier();
    1759              :     }
    1760              : 
    1761              :     /*
    1762              :      * Found the buffer holding this page. Return a pointer to the right
    1763              :      * offset within the page.
    1764              :      */
    1765      3207395 :     cachedPage = ptr / XLOG_BLCKSZ;
    1766      3207395 :     cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
    1767              : 
    1768              :     Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
    1769              :     Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
    1770              : 
    1771      3207395 :     return cachedPos + ptr % XLOG_BLCKSZ;
    1772              : }
    1773              : 
    1774              : /*
    1775              :  * Read WAL data directly from WAL buffers, if available. Returns the number
    1776              :  * of bytes read successfully.
    1777              :  *
    1778              :  * Fewer than 'count' bytes may be read if some of the requested WAL data has
    1779              :  * already been evicted.
    1780              :  *
    1781              :  * No locks are taken.
    1782              :  *
    1783              :  * Caller should ensure that it reads no further than LogwrtResult.Write
    1784              :  * (which should have been updated by the caller when determining how far to
    1785              :  * read). The 'tli' argument is only used as a convenient safety check so that
    1786              :  * callers do not read from WAL buffers on a historical timeline.
    1787              :  */
    1788              : Size
    1789       106594 : WALReadFromBuffers(char *dstbuf, XLogRecPtr startptr, Size count,
    1790              :                    TimeLineID tli)
    1791              : {
    1792       106594 :     char       *pdst = dstbuf;  /* next write position in dstbuf */
    1793       106594 :     XLogRecPtr  recptr = startptr;  /* next WAL position to read */
    1794              :     XLogRecPtr  inserted;
    1795       106594 :     Size        nbytes = count; /* bytes still to be copied */
    1796              :     /* Return 0 in recovery, or if 'tli' isn't the insertion timeline. */
    1797       106594 :     if (RecoveryInProgress() || tli != GetWALInsertionTimeLine())
    1798          913 :         return 0;
    1799              : 
    1800              :     Assert(XLogRecPtrIsValid(startptr));
    1801              : 
    1802              :     /*
    1803              :      * Caller should ensure that the requested data has been inserted into WAL
    1804              :      * buffers before we try to read it.
    1805              :      */
    1806       105681 :     inserted = pg_atomic_read_u64(&XLogCtl->logInsertResult);
    1807       105681 :     if (startptr + count > inserted)
    1808            0 :         ereport(ERROR,
    1809              :                 errmsg("cannot read past end of generated WAL: requested %X/%08X, current position %X/%08X",
    1810              :                        LSN_FORMAT_ARGS(startptr + count),
    1811              :                        LSN_FORMAT_ARGS(inserted)));
    1812              : 
    1813              :     /*
    1814              :      * Loop through the buffers without a lock. For each buffer, atomically
    1815              :      * read and verify the end pointer, then copy the data out, and finally
    1816              :      * re-read and re-verify the end pointer.
    1817              :      *
    1818              :      * Once a page is evicted, it never returns to the WAL buffers, so if the
    1819              :      * end pointer matches the expected end pointer before and after we copy
    1820              :      * the data, then the right page must have been present during the data
    1821              :      * copy. Read barriers are necessary to ensure that the data copy actually
    1822              :      * happens between the two verification steps.
    1823              :      *
    1824              :      * If either verification fails, we simply terminate the loop and return
    1825              :      * with the data that had been already copied out successfully.
    1826              :      */
    1827       135589 :     while (nbytes > 0)
    1828              :     {
    1829       127004 :         uint32      offset = recptr % XLOG_BLCKSZ;
    1830       127004 :         int         idx = XLogRecPtrToBufIdx(recptr);
    1831              :         XLogRecPtr  expectedEndPtr;
    1832              :         XLogRecPtr  endptr;
    1833              :         const char *page;
    1834              :         const char *psrc;
    1835              :         Size        npagebytes;
    1836              : 
    1837              :         /*
    1838              :          * Calculate the end pointer we expect in the xlblocks array if the
    1839              :          * correct page is present.
    1840              :          */
    1841       127004 :         expectedEndPtr = recptr + (XLOG_BLCKSZ - offset);
    1842              : 
    1843              :         /*
    1844              :          * First verification step: check that the correct page is present in
    1845              :          * the WAL buffers.
    1846              :          */
    1847       127004 :         endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
    1848       127004 :         if (expectedEndPtr != endptr)
    1849        97092 :             break;
    1850              : 
    1851              :         /*
    1852              :          * The correct page is present (or was at the time the endptr was
    1853              :          * read; must re-verify later). Calculate pointer to source data and
    1854              :          * determine how much data to read from this page.
    1855              :          */
    1856        29912 :         page = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
    1857        29912 :         psrc = page + offset;
    1858        29912 :         npagebytes = Min(nbytes, XLOG_BLCKSZ - offset);
    1859              : 
    1860              :         /*
    1861              :          * Ensure that the data copy and the first verification step are not
    1862              :          * reordered.
    1863              :          */
    1864        29912 :         pg_read_barrier();
    1865              : 
    1866              :         /* data copy */
    1867        29912 :         memcpy(pdst, psrc, npagebytes);
    1868              : 
    1869              :         /*
    1870              :          * Ensure that the data copy and the second verification step are not
    1871              :          * reordered.
    1872              :          */
    1873        29912 :         pg_read_barrier();
    1874              : 
    1875              :         /*
    1876              :          * Second verification step: check that the page we read from wasn't
    1877              :          * evicted while we were copying the data.
    1878              :          */
    1879        29912 :         endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
    1880        29912 :         if (expectedEndPtr != endptr)
    1881            4 :             break;
    1882              : 
    1883        29908 :         pdst += npagebytes;
    1884        29908 :         recptr += npagebytes;
    1885        29908 :         nbytes -= npagebytes;
    1886              :     }
    1887              : 
    1888              :     Assert(pdst - dstbuf <= count);
    1889              : 
    1890       105681 :     return pdst - dstbuf;
    1891              : }
    1892              : 
    1893              : /*
    1894              :  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
    1895              :  * is the position starting from the beginning of WAL, excluding all WAL
    1896              :  * page headers.
    1897              :  */
    1898              : static XLogRecPtr
    1899     48197808 : XLogBytePosToRecPtr(uint64 bytepos)
    1900              : {
    1901              :     uint64      fullsegs;       /* whole segments preceding bytepos */
    1902              :     uint64      fullpages;      /* whole pages after the segment's first */
    1903              :     uint64      bytesleft;      /* usable bytes not yet accounted for */
    1904              :     uint32      seg_offset;     /* offset within segment, headers included */
    1905              :     XLogRecPtr  result;
    1906              : 
    1907     48197808 :     fullsegs = bytepos / UsableBytesInSegment;
    1908     48197808 :     bytesleft = bytepos % UsableBytesInSegment;
    1909              : 
    1910     48197808 :     if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
    1911              :     {
    1912              :         /* fits on first page of segment */
    1913        69180 :         seg_offset = bytesleft + SizeOfXLogLongPHD;
    1914              :     }
    1915              :     else
    1916              :     {
    1917              :         /* account for the first page on segment with long header */
    1918     48128628 :         seg_offset = XLOG_BLCKSZ;
    1919     48128628 :         bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
    1920              : 
    1921     48128628 :         fullpages = bytesleft / UsableBytesInPage;
    1922     48128628 :         bytesleft = bytesleft % UsableBytesInPage;
    1923              : 
    1924     48128628 :         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
    1925              :     }
    1926              : 
    1927     48197808 :     XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
    1928              : 
    1929     48197808 :     return result;
    1930              : }
    1931              : 
    1932              : /*
    1933              :  * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
    1934              :  * returns a pointer to the beginning of the page (ie. before page header),
    1935              :  * not to where the first xlog record on that page would go. This is used
    1936              :  * when converting a pointer to the end of a record.
    1937              :  */
    1938              : static XLogRecPtr
    1939     24726693 : XLogBytePosToEndRecPtr(uint64 bytepos)
    1940              : {
    1941              :     uint64      fullsegs;       /* whole segments preceding bytepos */
    1942              :     uint64      fullpages;      /* whole pages after the segment's first */
    1943              :     uint64      bytesleft;      /* usable bytes not yet accounted for */
    1944              :     uint32      seg_offset;     /* offset within segment, headers included */
    1945              :     XLogRecPtr  result;
    1946              : 
    1947     24726693 :     fullsegs = bytepos / UsableBytesInSegment;
    1948     24726693 :     bytesleft = bytepos % UsableBytesInSegment;
    1949              : 
    1950     24726693 :     if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
    1951              :     {
    1952              :         /* fits on first page of segment */
    1953       107121 :         if (bytesleft == 0)
    1954        71117 :             seg_offset = 0;     /* segment start: do not skip the header */
    1955              :         else
    1956        36004 :             seg_offset = bytesleft + SizeOfXLogLongPHD;
    1957              :     }
    1958              :     else
    1959              :     {
    1960              :         /* account for the first page on segment with long header */
    1961     24619572 :         seg_offset = XLOG_BLCKSZ;
    1962     24619572 :         bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
    1963              : 
    1964     24619572 :         fullpages = bytesleft / UsableBytesInPage;
    1965     24619572 :         bytesleft = bytesleft % UsableBytesInPage;
    1966              : 
    1967     24619572 :         if (bytesleft == 0)
    1968        23659 :             seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;  /* page start */
    1969              :         else
    1970     24595913 :             seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
    1971              :     }
    1972              : 
    1973     24726693 :     XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
    1974              : 
    1975     24726693 :     return result;
    1976              : }
    1977              : 
    1978              : /*
    1979              :  * Convert an XLogRecPtr to a "usable byte position" (page headers excluded).
    1980              :  */
    1981              : static uint64
    1982         2765 : XLogRecPtrToBytePos(XLogRecPtr ptr)
    1983              : {
    1984              :     uint64      fullsegs;       /* filled in by XLByteToSeg for ptr */
    1985              :     uint32      fullpages;      /* page index of ptr within its segment */
    1986              :     uint32      offset;         /* byte offset of ptr within its page */
    1987              :     uint64      result;
    1988              : 
    1989         2765 :     XLByteToSeg(ptr, fullsegs, wal_segment_size);
    1990              : 
    1991         2765 :     fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
    1992         2765 :     offset = ptr % XLOG_BLCKSZ;
    1993              :     /* Page 0 of a segment has the long header; later pages the short one. */
    1994         2765 :     if (fullpages == 0)
    1995              :     {
    1996         1059 :         result = fullsegs * UsableBytesInSegment;
    1997         1059 :         if (offset > 0)
    1998              :         {
    1999              :             Assert(offset >= SizeOfXLogLongPHD);
    2000          285 :             result += offset - SizeOfXLogLongPHD;
    2001              :         }
    2002              :     }
    2003              :     else
    2004              :     {
    2005         1706 :         result = fullsegs * UsableBytesInSegment +
    2006         1706 :             (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
    2007         1706 :             (fullpages - 1) * UsableBytesInPage;    /* full pages */
    2008         1706 :         if (offset > 0)
    2009              :         {
    2010              :             Assert(offset >= SizeOfXLogShortPHD);
    2011         1697 :             result += offset - SizeOfXLogShortPHD;
    2012              :         }
    2013              :     }
    2014              : 
    2015         2765 :     return result;
    2016              : }
    2017              : 
    2018              : /*
    2019              :  * Initialize XLOG buffers, writing out old buffers if they still contain
    2020              :  * unwritten data, up to the page containing 'upto'. Or if 'opportunistic' is
    2021              :  * true, initialize as many pages as we can without having to write out
    2022              :  * unwritten data. Any new pages are initialized to zeros, with page headers
    2023              :  * initialized properly.
    2024              :  */
    2025              : static void
    2026      2708701 : AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
    2027              : {
    2028              :     int         nextidx;
    2029              :     XLogRecPtr  OldPageRqstPtr;
    2030              :     XLogwrtRqst WriteRqst;
    2031      2708701 :     XLogRecPtr  NewPageEndPtr = InvalidXLogRecPtr;
    2032              :     XLogRecPtr  NewPageBeginPtr;
    2033              :     XLogPageHeader NewPage;
    2034      2708701 :     int         npages pg_attribute_unused() = 0;
    2035              : 
    2036      2708701 :     LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
    2037              : 
    2038              :     /*
    2039              :      * Now that we have the lock, check if someone initialized the page
    2040              :      * already.
    2041              :      */
    2042      7847407 :     while (upto >= XLogCtl->InitializedUpTo || opportunistic)
    2043              :     {
    2044      5143563 :         nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
    2045              : 
    2046              :         /*
    2047              :          * Get ending-offset of the buffer page we need to replace (this may
    2048              :          * be zero if the buffer hasn't been used yet).  Fall through if it's
    2049              :          * already written out.
    2050              :          */
    2051      5143563 :         OldPageRqstPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]);
    2052      5143563 :         if (LogwrtResult.Write < OldPageRqstPtr)
    2053              :         {
    2054              :             /*
    2055              :              * Nope, got work to do. If we just want to pre-initialize as much
    2056              :              * as we can without flushing, give up now.
    2057              :              */
    2058      2358598 :             if (opportunistic)
    2059         4857 :                 break;
    2060              : 
    2061              :             /* Advance shared memory write request position */
    2062      2353741 :             SpinLockAcquire(&XLogCtl->info_lck);
    2063      2353741 :             if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
    2064       696853 :                 XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
    2065      2353741 :             SpinLockRelease(&XLogCtl->info_lck);
    2066              : 
    2067              :             /*
    2068              :              * Acquire an up-to-date LogwrtResult value and see if we still
    2069              :              * need to write it or if someone else already did.
    2070              :              */
    2071      2353741 :             RefreshXLogWriteResult(LogwrtResult);
    2072      2353741 :             if (LogwrtResult.Write < OldPageRqstPtr)
    2073              :             {
    2074              :                 /*
    2075              :                  * Must acquire write lock. Release WALBufMappingLock first,
    2076              :                  * to make sure that all insertions that we need to wait for
    2077              :                  * can finish (up to this same position). Otherwise we risk
    2078              :                  * deadlock.
    2079              :                  */
    2080      2335094 :                 LWLockRelease(WALBufMappingLock);
    2081              : 
    2082      2335094 :                 WaitXLogInsertionsToFinish(OldPageRqstPtr);
    2083              : 
    2084      2335094 :                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
    2085              : 
    2086      2335094 :                 RefreshXLogWriteResult(LogwrtResult);
    2087      2335094 :                 if (LogwrtResult.Write >= OldPageRqstPtr)
    2088              :                 {
    2089              :                     /* OK, someone wrote it already */
    2090       150752 :                     LWLockRelease(WALWriteLock);
    2091              :                 }
    2092              :                 else
    2093              :                 {
    2094              :                     /* Have to write it ourselves */
    2095              :                     TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
    2096      2184342 :                     WriteRqst.Write = OldPageRqstPtr;
    2097      2184342 :                     WriteRqst.Flush = InvalidXLogRecPtr;
    2098      2184342 :                     XLogWrite(WriteRqst, tli, false);
    2099      2184342 :                     LWLockRelease(WALWriteLock);
    2100      2184342 :                     pgWalUsage.wal_buffers_full++;
    2101              :                     TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
    2102              : 
    2103              :                     /*
    2104              :                      * Needed so that the pending WAL stats data gets flushed,
    2105              :                      * since pgWalUsage was just updated.
    2106              :                      */
    2107      2184342 :                     pgstat_report_fixed = true;
    2108              :                 }
    2109              :                 /* Re-acquire WALBufMappingLock and retry */
    2110      2335094 :                 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
    2111      2335094 :                 continue;
    2112              :             }
    2113              :         }
    2114              : 
    2115              :         /*
    2116              :          * Now the next buffer slot is free and we can set it up to be the
    2117              :          * next output page.
    2118              :          */
    2119      2803612 :         NewPageBeginPtr = XLogCtl->InitializedUpTo;
    2120      2803612 :         NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
    2121              : 
    2122              :         Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
    2123              : 
    2124      2803612 :         NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
    2125              : 
    2126              :         /*
    2127              :          * Mark the xlblock with InvalidXLogRecPtr and issue a write barrier
    2128              :          * before initializing. Otherwise, the old page may be partially
    2129              :          * zeroed but look valid.
    2130              :          */
    2131      2803612 :         pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], InvalidXLogRecPtr);
    2132      2803612 :         pg_write_barrier();
    2133              : 
    2134              :         /*
    2135              :          * Be sure to re-zero the buffer so that bytes beyond what we've
    2136              :          * written will look like zeroes and not valid XLOG records...
    2137              :          */
    2138      2803612 :         MemSet(NewPage, 0, XLOG_BLCKSZ);
    2139              : 
    2140              :         /*
    2141              :          * Fill the new page's header
    2142              :          */
    2143      2803612 :         NewPage->xlp_magic = XLOG_PAGE_MAGIC;
    2144              : 
    2145              :         /* NewPage->xlp_info = 0; */ /* done by memset */
    2146      2803612 :         NewPage->xlp_tli = tli;
    2147      2803612 :         NewPage->xlp_pageaddr = NewPageBeginPtr;
    2148              : 
    2149              :         /* NewPage->xlp_rem_len = 0; */  /* done by memset */
    2150              : 
    2151              :         /*
    2152              :          * If first page of an XLOG segment file, make it a long header.
    2153              :          */
    2154      2803612 :         if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
    2155              :         {
    2156         1927 :             XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
    2157              : 
    2158         1927 :             NewLongPage->xlp_sysid = ControlFile->system_identifier;
    2159         1927 :             NewLongPage->xlp_seg_size = wal_segment_size;
    2160         1927 :             NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
    2161         1927 :             NewPage->xlp_info |= XLP_LONG_HEADER;
    2162              :         }
    2163              : 
    2164              :         /*
    2165              :          * Make sure the initialization of the page becomes visible to others
    2166              :          * before the xlblocks update. GetXLogBuffer() reads xlblocks without
    2167              :          * holding a lock.
    2168              :          */
    2169      2803612 :         pg_write_barrier();
    2170              : 
    2171      2803612 :         pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], NewPageEndPtr);
    2172      2803612 :         XLogCtl->InitializedUpTo = NewPageEndPtr;
    2173              : 
    2174      2803612 :         npages++;
    2175              :     }
    2176      2708701 :     LWLockRelease(WALBufMappingLock);
    2177              : 
    2178              : #ifdef WAL_DEBUG
    2179              :     if (XLOG_DEBUG && npages > 0)
    2180              :     {
    2181              :         elog(DEBUG1, "initialized %d pages, up to %X/%08X",
    2182              :              npages, LSN_FORMAT_ARGS(NewPageEndPtr));
    2183              :     }
    2184              : #endif
    2185      2708701 : }
    2186              : 
    2187              : /*
    2188              :  * Calculate CheckPointSegments based on max_wal_size_mb and
    2189              :  * checkpoint_completion_target.
    2190              :  */
    2191              : static void
    2192         9743 : CalculateCheckpointSegments(void)
    2193              : {
    2194              :     double      target;
    2195              : 
    2196              :     /*-------
    2197              :      * Calculate the distance at which to trigger a checkpoint, to avoid
    2198              :      * exceeding max_wal_size_mb. This is based on two assumptions:
    2199              :      *
    2200              :      * a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
    2201              :      *    WAL for two checkpoint cycles to allow us to recover from the
    2202              :      *    secondary checkpoint if the first checkpoint failed, though we
    2203              :      *    only did this on the primary anyway, not on standby. Keeping just
    2204              :      *    one checkpoint simplifies processing and reduces disk space in
    2205              :      *    many smaller databases.)
    2206              :      * b) during checkpoint, we consume checkpoint_completion_target *
    2207              :      *    number of segments consumed between checkpoints.
    2208              :      *-------
    2209              :      */
    2210         9743 :     target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
    2211         9743 :         (1.0 + CheckPointCompletionTarget);
    2212              : 
    2213              :     /* round down */
    2214         9743 :     CheckPointSegments = (int) target;
    2215              : 
    2216         9743 :     if (CheckPointSegments < 1)
    2217           10 :         CheckPointSegments = 1;
    2218         9743 : }
    2219              : 
    2220              : void
    2221         7335 : assign_max_wal_size(int newval, void *extra)
    2222              : {
    2223         7335 :     max_wal_size_mb = newval;
    2224         7335 :     CalculateCheckpointSegments();
    2225         7335 : }
    2226              : 
    2227              : void
    2228         1275 : assign_checkpoint_completion_target(double newval, void *extra)
    2229              : {
    2230         1275 :     CheckPointCompletionTarget = newval;
    2231         1275 :     CalculateCheckpointSegments();
    2232         1275 : }
    2233              : 
    2234              : bool
    2235         2466 : check_wal_segment_size(int *newval, void **extra, GucSource source)
    2236              : {
    2237         2466 :     if (!IsValidWalSegSize(*newval))
    2238              :     {
    2239            0 :         GUC_check_errdetail("The WAL segment size must be a power of two between 1 MB and 1 GB.");
    2240            0 :         return false;
    2241              :     }
    2242              : 
    2243         2466 :     return true;
    2244              : }
    2245              : 
    2246              : /*
    2247              :  * At a checkpoint, how many WAL segments to recycle as preallocated future
    2248              :  * XLOG segments? Returns the highest segment that should be preallocated.
    2249              :  */
    2250              : static XLogSegNo
    2251         1931 : XLOGfileslop(XLogRecPtr lastredoptr)
    2252              : {
    2253              :     XLogSegNo   minSegNo;
    2254              :     XLogSegNo   maxSegNo;
    2255              :     double      distance;
    2256              :     XLogSegNo   recycleSegNo;
    2257              : 
    2258              :     /*
    2259              :      * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
    2260              :      * correspond to. Always recycle enough segments to meet the minimum, and
    2261              :      * remove enough segments to stay below the maximum.
    2262              :      */
    2263         1931 :     minSegNo = lastredoptr / wal_segment_size +
    2264         1931 :         ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1;
    2265         1931 :     maxSegNo = lastredoptr / wal_segment_size +
    2266         1931 :         ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1;
    2267              : 
    2268              :     /*
    2269              :      * Between those limits, recycle enough segments to get us through to the
    2270              :      * estimated end of next checkpoint.
    2271              :      *
    2272              :      * To estimate where the next checkpoint will finish, assume that the
    2273              :      * system runs steadily consuming CheckPointDistanceEstimate bytes between
    2274              :      * every checkpoint.
    2275              :      */
    2276         1931 :     distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
    2277              :     /* add 10% for good measure. */
    2278         1931 :     distance *= 1.10;
    2279              : 
    2280         1931 :     recycleSegNo = (XLogSegNo) ceil(((double) lastredoptr + distance) /
    2281              :                                     wal_segment_size);
    2282              : 
    2283         1931 :     if (recycleSegNo < minSegNo)
    2284         1356 :         recycleSegNo = minSegNo;
    2285         1931 :     if (recycleSegNo > maxSegNo)
    2286          426 :         recycleSegNo = maxSegNo;
    2287              : 
    2288         1931 :     return recycleSegNo;
    2289              : }
    2290              : 
    2291              : /*
    2292              :  * Check whether we've consumed enough xlog space that a checkpoint is needed.
    2293              :  *
    2294              :  * new_segno indicates a log file that has just been filled up (or read
    2295              :  * during recovery). We measure the distance from RedoRecPtr to new_segno
    2296              :  * and see if that exceeds CheckPointSegments.
    2297              :  *
    2298              :  * Note: the caller is responsible for ensuring RedoRecPtr is up-to-date.
    2299              :  */
    2300              : bool
    2301         5104 : XLogCheckpointNeeded(XLogSegNo new_segno)
    2302              : {
    2303              :     XLogSegNo   old_segno;
    2304              : 
    2305         5104 :     XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size);
    2306              : 
    2307         5104 :     if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
    2308         3183 :         return true;
    2309         1921 :     return false;
    2310              : }
    2311              : 
    2312              : /*
    2313              :  * Write and/or fsync the log at least as far as WriteRqst indicates.
    2314              :  *
    2315              :  * If flexible == true, we don't have to write as far as WriteRqst, but
    2316              :  * may stop at any convenient boundary (such as a cache or logfile boundary).
    2317              :  * This option allows us to avoid uselessly issuing multiple writes when a
    2318              :  * single one would do.
    2319              :  *
    2320              :  * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
    2321              :  * must be called before grabbing the lock, to make sure the data is ready to
    2322              :  * write.
    2323              :  */
    2324              : static void
    2325      2350077 : XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
    2326              : {
    2327              :     bool        ispartialpage;
    2328              :     bool        last_iteration;
    2329              :     bool        finishing_seg;
    2330              :     int         curridx;
    2331              :     int         npages;
    2332              :     int         startidx;
    2333              :     uint32      startoffset;
    2334              : 
    2335              :     /* We should always be inside a critical section here */
    2336              :     Assert(CritSectionCount > 0);
    2337              : 
    2338              :     /*
    2339              :      * Update local LogwrtResult (caller probably did this already, but...)
    2340              :      */
    2341      2350077 :     RefreshXLogWriteResult(LogwrtResult);
    2342              : 
    2343              :     /*
    2344              :      * Since successive pages in the xlog cache are consecutively allocated,
    2345              :      * we can usually gather multiple pages together and issue just one
    2346              :      * write() call.  npages is the number of pages we have determined can be
    2347              :      * written together; startidx is the cache block index of the first one,
    2348              :      * and startoffset is the file offset at which it should go. The latter
    2349              :      * two variables are only valid when npages > 0, but we must initialize
    2350              :      * all of them to keep the compiler quiet.
    2351              :      */
    2352      2350077 :     npages = 0;
    2353      2350077 :     startidx = 0;
    2354      2350077 :     startoffset = 0;
    2355              : 
    2356              :     /*
    2357              :      * Within the loop, curridx is the cache block index of the page to
    2358              :      * consider writing.  Begin at the buffer containing the next unwritten
    2359              :      * page, or last partially written page.
    2360              :      */
    2361      2350077 :     curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
    2362              : 
    2363      5112863 :     while (LogwrtResult.Write < WriteRqst.Write)
    2364              :     {
    2365              :         /*
    2366              :          * Make sure we're not ahead of the insert process.  This could happen
    2367              :          * if we're passed a bogus WriteRqst.Write that is past the end of the
    2368              :          * last page that's been initialized by AdvanceXLInsertBuffer.
    2369              :          */
    2370      2923570 :         XLogRecPtr  EndPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[curridx]);
    2371              : 
    2372      2923570 :         if (LogwrtResult.Write >= EndPtr)
    2373            0 :             elog(PANIC, "xlog write request %X/%08X is past end of log %X/%08X",
    2374              :                  LSN_FORMAT_ARGS(LogwrtResult.Write),
    2375              :                  LSN_FORMAT_ARGS(EndPtr));
    2376              : 
    2377              :         /* Advance LogwrtResult.Write to end of current buffer page */
    2378      2923570 :         LogwrtResult.Write = EndPtr;
    2379      2923570 :         ispartialpage = WriteRqst.Write < LogwrtResult.Write;
    2380              : 
    2381      2923570 :         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
    2382              :                              wal_segment_size))
    2383              :         {
    2384              :             /*
    2385              :              * Switch to new logfile segment.  We cannot have any pending
    2386              :              * pages here (since we dump what we have at segment end).
    2387              :              */
    2388              :             Assert(npages == 0);
    2389        15129 :             if (openLogFile >= 0)
    2390         6840 :                 XLogFileClose();
    2391        15129 :             XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
    2392              :                             wal_segment_size);
    2393        15129 :             openLogTLI = tli;
    2394              : 
    2395              :             /* create/use new log file */
    2396        15129 :             openLogFile = XLogFileInit(openLogSegNo, tli);
    2397        15129 :             ReserveExternalFD();
    2398              :         }
    2399              : 
    2400              :         /* Make sure we have the current logfile open */
    2401      2923570 :         if (openLogFile < 0)
    2402              :         {
    2403            0 :             XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
    2404              :                             wal_segment_size);
    2405            0 :             openLogTLI = tli;
    2406            0 :             openLogFile = XLogFileOpen(openLogSegNo, tli);
    2407            0 :             ReserveExternalFD();
    2408              :         }
    2409              : 
    2410              :         /* Add current page to the set of pending pages-to-dump */
    2411      2923570 :         if (npages == 0)
    2412              :         {
    2413              :             /* first of group */
    2414      2367187 :             startidx = curridx;
    2415      2367187 :             startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ,
    2416              :                                             wal_segment_size);
    2417              :         }
    2418      2923570 :         npages++;
    2419              : 
    2420              :         /*
    2421              :          * Dump the set if this will be the last loop iteration, or if we are
    2422              :          * at the last page of the cache area (since the next page won't be
    2423              :          * contiguous in memory), or if we are at the end of the logfile
    2424              :          * segment.
    2425              :          */
    2426      2923570 :         last_iteration = WriteRqst.Write <= LogwrtResult.Write;
    2427              : 
    2428      5690240 :         finishing_seg = !ispartialpage &&
    2429      2766670 :             (startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;
    2430              : 
    2431      2923570 :         if (last_iteration ||
    2432       575146 :             curridx == XLogCtl->XLogCacheBlck ||
    2433              :             finishing_seg)
    2434              :         {
    2435              :             char       *from;
    2436              :             Size        nbytes;
    2437              :             Size        nleft;
    2438              :             ssize_t     written;
    2439              :             instr_time  start;
    2440              : 
    2441              :             /* OK to write the page(s) */
    2442      2367187 :             from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
    2443      2367187 :             nbytes = npages * (Size) XLOG_BLCKSZ;
    2444      2367187 :             nleft = nbytes;
    2445              :             do
    2446              :             {
    2447      2367187 :                 errno = 0;
    2448              : 
    2449              :                 /*
    2450              :                  * Measure I/O timing to write WAL data, for pg_stat_io.
    2451              :                  */
    2452      2367187 :                 start = pgstat_prepare_io_time(track_wal_io_timing);
    2453              : 
    2454      2367187 :                 pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
    2455      2367187 :                 written = pg_pwrite(openLogFile, from, nleft, startoffset);
    2456      2367187 :                 pgstat_report_wait_end();
    2457              : 
    2458      2367187 :                 pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL,
    2459              :                                         IOOP_WRITE, start, 1, written);
    2460              : 
    2461      2367187 :                 if (written <= 0)
    2462              :                 {
    2463              :                     char        xlogfname[MAXFNAMELEN];
    2464              :                     int         save_errno;
    2465              : 
    2466            0 :                     if (errno == EINTR)
    2467            0 :                         continue;
    2468              : 
    2469            0 :                     save_errno = errno;
    2470            0 :                     XLogFileName(xlogfname, tli, openLogSegNo,
    2471              :                                  wal_segment_size);
    2472            0 :                     errno = save_errno;
    2473            0 :                     ereport(PANIC,
    2474              :                             (errcode_for_file_access(),
    2475              :                              errmsg("could not write to log file \"%s\" at offset %u, length %zu: %m",
    2476              :                                     xlogfname, startoffset, nleft)));
    2477              :                 }
    2478      2367187 :                 nleft -= written;
    2479      2367187 :                 from += written;
    2480      2367187 :                 startoffset += written;
    2481      2367187 :             } while (nleft > 0);
    2482              : 
    2483      2367187 :             npages = 0;
    2484              : 
    2485              :             /*
    2486              :              * If we just wrote the whole last page of a logfile segment,
    2487              :              * fsync the segment immediately.  This avoids having to go back
    2488              :              * and re-open prior segments when an fsync request comes along
    2489              :              * later. Doing it here ensures that one and only one backend will
    2490              :              * perform this fsync.
    2491              :              *
    2492              :              * This is also the right place to notify the Archiver that the
    2493              :              * segment is ready to copy to archival storage, and to update the
    2494              :              * timer for archive_timeout, and to signal for a checkpoint if
    2495              :              * too many logfile segments have been used since the last
    2496              :              * checkpoint.
    2497              :              */
    2498      2367187 :             if (finishing_seg)
    2499              :             {
    2500         2052 :                 issue_xlog_fsync(openLogFile, openLogSegNo, tli);
    2501              : 
    2502              :                 /* signal that we need to wakeup walsenders later */
    2503         2052 :                 WalSndWakeupRequest();
    2504              : 
    2505         2052 :                 LogwrtResult.Flush = LogwrtResult.Write;    /* end of page */
    2506              : 
    2507         2052 :                 if (XLogArchivingActive())
    2508          412 :                     XLogArchiveNotifySeg(openLogSegNo, tli);
    2509              : 
    2510         2052 :                 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
    2511         2052 :                 XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;
    2512              : 
    2513              :                 /*
    2514              :                  * Request a checkpoint if we've consumed too much xlog since
    2515              :                  * the last one.  For speed, we first check using the local
    2516              :                  * copy of RedoRecPtr, which might be out of date; if it looks
    2517              :                  * like a checkpoint is needed, forcibly update RedoRecPtr and
    2518              :                  * recheck.
    2519              :                  */
    2520         2052 :                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
    2521              :                 {
    2522          273 :                     (void) GetRedoRecPtr();
    2523          273 :                     if (XLogCheckpointNeeded(openLogSegNo))
    2524          223 :                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
    2525              :                 }
    2526              :             }
    2527              :         }
    2528              : 
    2529      2923570 :         if (ispartialpage)
    2530              :         {
    2531              :             /* Only asked to write a partial page */
    2532       156900 :             LogwrtResult.Write = WriteRqst.Write;
    2533       156900 :             break;
    2534              :         }
    2535      2766670 :         curridx = NextBufIdx(curridx);
    2536              : 
    2537              :         /* If flexible, break out of loop as soon as we wrote something */
    2538      2766670 :         if (flexible && npages == 0)
    2539         3884 :             break;
    2540              :     }
    2541              : 
    2542              :     Assert(npages == 0);
    2543              : 
    2544              :     /*
    2545              :      * If asked to flush, do so
    2546              :      */
    2547      2350077 :     if (LogwrtResult.Flush < WriteRqst.Flush &&
    2548       164928 :         LogwrtResult.Flush < LogwrtResult.Write)
    2549              :     {
    2550              :         /*
    2551              :          * Could get here without iterating above loop, in which case we might
    2552              :          * have no open file or the wrong one.  However, we do not need to
    2553              :          * fsync more than one file.
    2554              :          */
    2555       164861 :         if (wal_sync_method != WAL_SYNC_METHOD_OPEN &&
    2556       164861 :             wal_sync_method != WAL_SYNC_METHOD_OPEN_DSYNC)
    2557              :         {
    2558       164861 :             if (openLogFile >= 0 &&
    2559       164842 :                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
    2560              :                                  wal_segment_size))
    2561          110 :                 XLogFileClose();
    2562       164861 :             if (openLogFile < 0)
    2563              :             {
    2564          129 :                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
    2565              :                                 wal_segment_size);
    2566          129 :                 openLogTLI = tli;
    2567          129 :                 openLogFile = XLogFileOpen(openLogSegNo, tli);
    2568          129 :                 ReserveExternalFD();
    2569              :             }
    2570              : 
    2571       164861 :             issue_xlog_fsync(openLogFile, openLogSegNo, tli);
    2572              :         }
    2573              : 
    2574              :         /* signal that we need to wakeup walsenders later */
    2575       164861 :         WalSndWakeupRequest();
    2576              : 
    2577       164861 :         LogwrtResult.Flush = LogwrtResult.Write;
    2578              :     }
    2579              : 
    2580              :     /*
    2581              :      * Update shared-memory status
    2582              :      *
    2583              :      * We make sure that the shared 'request' values do not fall behind the
    2584              :      * 'result' values.  This is not absolutely essential, but it saves some
    2585              :      * code in a couple of places.
    2586              :      */
    2587      2350077 :     SpinLockAcquire(&XLogCtl->info_lck);
    2588      2350077 :     if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
    2589       145206 :         XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
    2590      2350077 :     if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
    2591       166482 :         XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
    2592      2350077 :     SpinLockRelease(&XLogCtl->info_lck);
    2593              : 
    2594              :     /*
    2595              :      * We write Write first, then a barrier, then Flush.  When reading, the
    2596              :      * opposite must be done (with a matching barrier in between), so that we
    2597              :      * always see a Flush value that trails behind the Write value seen.
    2598              :      */
    2599      2350077 :     pg_atomic_write_u64(&XLogCtl->logWriteResult, LogwrtResult.Write);
    2600      2350077 :     pg_write_barrier();
    2601      2350077 :     pg_atomic_write_u64(&XLogCtl->logFlushResult, LogwrtResult.Flush);
    2602              : 
    2603              : #ifdef USE_ASSERT_CHECKING
    2604              :     {
    2605              :         XLogRecPtr  Flush;
    2606              :         XLogRecPtr  Write;
    2607              :         XLogRecPtr  Insert;
    2608              : 
    2609              :         Flush = pg_atomic_read_u64(&XLogCtl->logFlushResult);
    2610              :         pg_read_barrier();
    2611              :         Write = pg_atomic_read_u64(&XLogCtl->logWriteResult);
    2612              :         pg_read_barrier();
    2613              :         Insert = pg_atomic_read_u64(&XLogCtl->logInsertResult);
    2614              : 
    2615              :         /* WAL written to disk is never behind WAL flushed */
    2616              :         Assert(Write >= Flush);
    2617              : 
    2618              :         /* WAL inserted to buffers is never behind WAL written */
    2619              :         Assert(Insert >= Write);
    2620              :     }
    2621              : #endif
    2622      2350077 : }
    2623              : 
    2624              : /*
    2625              :  * Record the LSN for an asynchronous transaction commit/abort
    2626              :  * and nudge the WALWriter if there is work for it to do.
    2627              :  * (This should not be called for synchronous commits.)
    2628              :  */
    2629              : void
    2630        61591 : XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
    2631              : {
    2632        61591 :     XLogRecPtr  WriteRqstPtr = asyncXactLSN;
    2633              :     bool        sleeping;
    2634        61591 :     bool        wakeup = false;
    2635              :     XLogRecPtr  prevAsyncXactLSN;
    2636              : 
    2637        61591 :     SpinLockAcquire(&XLogCtl->info_lck);
    2638        61591 :     sleeping = XLogCtl->WalWriterSleeping;
    2639        61591 :     prevAsyncXactLSN = XLogCtl->asyncXactLSN;
    2640        61591 :     if (XLogCtl->asyncXactLSN < asyncXactLSN)
    2641        61031 :         XLogCtl->asyncXactLSN = asyncXactLSN;
    2642        61591 :     SpinLockRelease(&XLogCtl->info_lck);
    2643              : 
    2644              :     /*
    2645              :      * If somebody else already called this function with a more aggressive
    2646              :      * LSN, they will have done what we needed (and perhaps more).
    2647              :      */
    2648        61591 :     if (asyncXactLSN <= prevAsyncXactLSN)
    2649          560 :         return;
    2650              : 
    2651              :     /*
    2652              :      * If the WALWriter is sleeping, kick it to make it come out of low-power
    2653              :      * mode, so that this async commit will reach disk within the expected
    2654              :      * amount of time.  Otherwise, determine whether it has enough WAL
    2655              :      * available to flush, the same way that XLogBackgroundFlush() does.
    2656              :      */
    2657        61031 :     if (sleeping)
    2658           45 :         wakeup = true;
    2659              :     else
    2660              :     {
    2661              :         int         flushblocks;
    2662              : 
    2663        60986 :         RefreshXLogWriteResult(LogwrtResult);
    2664              : 
    2665        60986 :         flushblocks =
    2666        60986 :             WriteRqstPtr / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
    2667              : 
    2668        60986 :         if (WalWriterFlushAfter == 0 || flushblocks >= WalWriterFlushAfter)
    2669         5079 :             wakeup = true;
    2670              :     }
    2671              : 
    2672        61031 :     if (wakeup)
    2673              :     {
    2674         5124 :         volatile PROC_HDR *procglobal = ProcGlobal;
    2675         5124 :         ProcNumber  walwriterProc = procglobal->walwriterProc;
    2676              : 
    2677         5124 :         if (walwriterProc != INVALID_PROC_NUMBER)
    2678          781 :             SetLatch(&GetPGProcByNumber(walwriterProc)->procLatch);
    2679              :     }
    2680              : }
    2681              : 
    2682              : /*
    2683              :  * Record the LSN up to which we can remove WAL because it's not required by
    2684              :  * any replication slot.
    2685              :  */
    2686              : void
    2687        42558 : XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
    2688              : {
    2689        42558 :     SpinLockAcquire(&XLogCtl->info_lck);
    2690        42558 :     XLogCtl->replicationSlotMinLSN = lsn;
    2691        42558 :     SpinLockRelease(&XLogCtl->info_lck);
    2692        42558 : }
    2693              : 
    2694              : 
    2695              : /*
    2696              :  * Return the oldest LSN we must retain to satisfy the needs of some
    2697              :  * replication slot.
    2698              :  */
    2699              : XLogRecPtr
    2700         2545 : XLogGetReplicationSlotMinimumLSN(void)
    2701              : {
    2702              :     XLogRecPtr  retval;
    2703              : 
    2704         2545 :     SpinLockAcquire(&XLogCtl->info_lck);
    2705         2545 :     retval = XLogCtl->replicationSlotMinLSN;
    2706         2545 :     SpinLockRelease(&XLogCtl->info_lck);
    2707              : 
    2708         2545 :     return retval;
    2709              : }
    2710              : 
    2711              : /*
    2712              :  * Advance minRecoveryPoint in control file.
    2713              :  *
    2714              :  * If we crash during recovery, we must reach this point again before the
    2715              :  * database is consistent.
    2716              :  *
    2717              :  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
    2718              :  * is only updated if it's not already greater than or equal to 'lsn'.
    2719              :  */
    2720              : static void
    2721       122360 : UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
    2722              : {
    2723              :     /* Quick check using our local copy of the variable */
    2724       122360 :     if (!updateMinRecoveryPoint || (!force && lsn <= LocalMinRecoveryPoint))
    2725       114985 :         return;
    2726              : 
    2727              :     /*
    2728              :      * An invalid minRecoveryPoint means that we need to recover all the WAL,
    2729              :      * i.e., we're doing crash recovery.  We never modify the control file's
    2730              :      * value in that case, so we can short-circuit future checks here too. The
    2731              :      * local values of minRecoveryPoint and minRecoveryPointTLI should not be
    2732              :      * updated until crash recovery finishes.  We only do this for the startup
    2733              :      * process as it should not update its own reference of minRecoveryPoint
    2734              :      * until it has finished crash recovery to make sure that all WAL
    2735              :      * available is replayed in this case.  This also saves from extra locks
    2736              :      * taken on the control file from the startup process.
    2737              :      */
    2738         7375 :     if (!XLogRecPtrIsValid(LocalMinRecoveryPoint) && InRecovery)
    2739              :     {
    2740           32 :         updateMinRecoveryPoint = false;
    2741           32 :         return;
    2742              :     }
    2743              : 
    2744         7343 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    2745              : 
    2746              :     /* update local copy */
    2747         7343 :     LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    2748         7343 :     LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    2749              : 
    2750         7343 :     if (!XLogRecPtrIsValid(LocalMinRecoveryPoint))
    2751            1 :         updateMinRecoveryPoint = false;
    2752         7342 :     else if (force || LocalMinRecoveryPoint < lsn)
    2753              :     {
    2754              :         XLogRecPtr  newMinRecoveryPoint;
    2755              :         TimeLineID  newMinRecoveryPointTLI;
    2756              : 
    2757              :         /*
    2758              :          * To avoid having to update the control file too often, we update it
    2759              :          * all the way to the last record being replayed, even though 'lsn'
    2760              :          * would suffice for correctness.  This also allows the 'force' case
    2761              :          * to not need a valid 'lsn' value.
    2762              :          *
    2763              :          * Another important reason for doing it this way is that the passed
    2764              :          * 'lsn' value could be bogus, i.e., past the end of available WAL, if
    2765              :          * the caller got it from a corrupted heap page.  Accepting such a
    2766              :          * value as the min recovery point would prevent us from coming up at
    2767              :          * all.  Instead, we just log a warning and continue with recovery.
    2768              :          * (See also the comments about corrupt LSNs in XLogFlush.)
    2769              :          */
    2770         5884 :         newMinRecoveryPoint = GetCurrentReplayRecPtr(&newMinRecoveryPointTLI);
    2771         5884 :         if (!force && newMinRecoveryPoint < lsn)
    2772            0 :             elog(WARNING,
    2773              :                  "xlog min recovery request %X/%08X is past current point %X/%08X",
    2774              :                  LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(newMinRecoveryPoint));
    2775              : 
    2776              :         /* update control file */
    2777         5884 :         if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
    2778              :         {
    2779         5530 :             ControlFile->minRecoveryPoint = newMinRecoveryPoint;
    2780         5530 :             ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
    2781         5530 :             UpdateControlFile();
    2782         5530 :             LocalMinRecoveryPoint = newMinRecoveryPoint;
    2783         5530 :             LocalMinRecoveryPointTLI = newMinRecoveryPointTLI;
    2784              : 
    2785         5530 :             ereport(DEBUG2,
    2786              :                     errmsg_internal("updated min recovery point to %X/%08X on timeline %u",
    2787              :                                     LSN_FORMAT_ARGS(newMinRecoveryPoint),
    2788              :                                     newMinRecoveryPointTLI));
    2789              :         }
    2790              :     }
    2791         7343 :     LWLockRelease(ControlFileLock);
    2792              : }
    2793              : 
    2794              : /*
    2795              :  * Ensure that all XLOG data through the given position is flushed to disk.
    2796              :  *
    2797              :  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
    2798              :  * already held, and we try to avoid acquiring it if possible.
    2799              :  */
    2800              : void
    2801       865060 : XLogFlush(XLogRecPtr record)
    2802              : {
    2803              :     XLogRecPtr  WriteRqstPtr;
    2804              :     XLogwrtRqst WriteRqst;
    2805       865060 :     TimeLineID  insertTLI = XLogCtl->InsertTimeLineID;
    2806              : 
    2807              :     /*
    2808              :      * During REDO, we are reading not writing WAL.  Therefore, instead of
    2809              :      * trying to flush the WAL, we should update minRecoveryPoint instead. We
    2810              :      * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
    2811              :      * to act this way too, and because when it tries to write the
    2812              :      * end-of-recovery checkpoint, it should indeed flush.
    2813              :      */
    2814       865060 :     if (!XLogInsertAllowed())
    2815              :     {
    2816       121901 :         UpdateMinRecoveryPoint(record, false);
    2817       688319 :         return;
    2818              :     }
    2819              : 
    2820              :     /* Quick exit if already known flushed */
    2821       743159 :     if (record <= LogwrtResult.Flush)
    2822       566418 :         return;
    2823              : 
    2824              : #ifdef WAL_DEBUG
    2825              :     if (XLOG_DEBUG)
    2826              :         elog(LOG, "xlog flush request %X/%08X; write %X/%08X; flush %X/%08X",
    2827              :              LSN_FORMAT_ARGS(record),
    2828              :              LSN_FORMAT_ARGS(LogwrtResult.Write),
    2829              :              LSN_FORMAT_ARGS(LogwrtResult.Flush));
    2830              : #endif
    2831              : 
    2832       176741 :     START_CRIT_SECTION();
    2833              : 
    2834              :     /*
    2835              :      * Since fsync is usually a horribly expensive operation, we try to
    2836              :      * piggyback as much data as we can on each fsync: if we see any more data
    2837              :      * entered into the xlog buffer, we'll write and fsync that too, so that
    2838              :      * the final value of LogwrtResult.Flush is as large as possible. This
    2839              :      * gives us some chance of avoiding another fsync immediately after.
    2840              :      */
    2841              : 
    2842              :     /* initialize to given target; may increase below */
    2843       176741 :     WriteRqstPtr = record;
    2844              : 
    2845              :     /*
    2846              :      * Now wait until we get the write lock, or someone else does the flush
    2847              :      * for us.
    2848              :      */
    2849              :     for (;;)
    2850         3683 :     {
    2851              :         XLogRecPtr  insertpos;
    2852              : 
    2853              :         /* done already? */
    2854       180424 :         RefreshXLogWriteResult(LogwrtResult);
    2855       180424 :         if (record <= LogwrtResult.Flush)
    2856        13343 :             break;
    2857              : 
    2858              :         /*
    2859              :          * Before actually performing the write, wait for all in-flight
    2860              :          * insertions to the pages we're about to write to finish.
    2861              :          */
    2862       167081 :         SpinLockAcquire(&XLogCtl->info_lck);
    2863       167081 :         if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
    2864        11886 :             WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
    2865       167081 :         SpinLockRelease(&XLogCtl->info_lck);
    2866       167081 :         insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
    2867              : 
    2868              :         /*
    2869              :          * Try to get the write lock. If we can't get it immediately, wait
    2870              :          * until it's released, and recheck if we still need to do the flush
    2871              :          * or if the backend that held the lock did it for us already. This
    2872              :          * helps to maintain a good rate of group committing when the system
    2873              :          * is bottlenecked by the speed of fsyncing.
    2874              :          */
    2875       167081 :         if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
    2876              :         {
    2877              :             /*
    2878              :              * The lock is now free, but we didn't acquire it yet. Before we
    2879              :              * do, loop back to check if someone else flushed the record for
    2880              :              * us already.
    2881              :              */
    2882         3683 :             continue;
    2883              :         }
    2884              : 
    2885              :         /* Got the lock; recheck whether request is satisfied */
    2886       163398 :         RefreshXLogWriteResult(LogwrtResult);
    2887       163398 :         if (record <= LogwrtResult.Flush)
    2888              :         {
    2889         2443 :             LWLockRelease(WALWriteLock);
    2890         2443 :             break;
    2891              :         }
    2892              : 
    2893              :         /*
    2894              :          * Sleep before flush! By adding a delay here, we may give further
    2895              :          * backends the opportunity to join the backlog of group commit
    2896              :          * followers; this can significantly improve transaction throughput,
    2897              :          * at the risk of increasing transaction latency.
    2898              :          *
    2899              :          * We do not sleep if enableFsync is not turned on, nor if there are
    2900              :          * fewer than CommitSiblings other backends with active transactions.
    2901              :          */
    2902       160955 :         if (CommitDelay > 0 && enableFsync &&
    2903            0 :             MinimumActiveBackends(CommitSiblings))
    2904              :         {
    2905            0 :             pgstat_report_wait_start(WAIT_EVENT_COMMIT_DELAY);
    2906            0 :             pg_usleep(CommitDelay);
    2907            0 :             pgstat_report_wait_end();
    2908              : 
    2909              :             /*
    2910              :              * Re-check how far we can now flush the WAL. It's generally not
    2911              :              * safe to call WaitXLogInsertionsToFinish while holding
    2912              :              * WALWriteLock, because an in-progress insertion might need to
    2913              :              * also grab WALWriteLock to make progress. But we know that all
    2914              :              * the insertions up to insertpos have already finished, because
    2915              :              * that's what the earlier WaitXLogInsertionsToFinish() returned.
    2916              :              * We're only calling it again to allow insertpos to be moved
    2917              :              * further forward, not to actually wait for anyone.
    2918              :              */
    2919            0 :             insertpos = WaitXLogInsertionsToFinish(insertpos);
    2920              :         }
    2921              : 
    2922              :         /* try to write/flush later additions to XLOG as well */
    2923       160955 :         WriteRqst.Write = insertpos;
    2924       160955 :         WriteRqst.Flush = insertpos;
    2925              : 
    2926       160955 :         XLogWrite(WriteRqst, insertTLI, false);
    2927              : 
    2928       160955 :         LWLockRelease(WALWriteLock);
    2929              :         /* done */
    2930       160955 :         break;
    2931              :     }
    2932              : 
    2933       176741 :     END_CRIT_SECTION();
    2934              : 
    2935              :     /* wake up walsenders now that we've released heavily contended locks */
    2936       176741 :     WalSndWakeupProcessRequests(true, !RecoveryInProgress());
    2937              : 
    2938              :     /*
    2939              :      * If we flushed an LSN that someone was waiting for, notify the waiters.
    2940              :      */
    2941       353482 :     if (waitLSNState &&
    2942       176741 :         (LogwrtResult.Flush >=
    2943       176741 :          pg_atomic_read_u64(&waitLSNState->minWaitedLSN[WAIT_LSN_TYPE_PRIMARY_FLUSH])))
    2944            9 :         WaitLSNWakeup(WAIT_LSN_TYPE_PRIMARY_FLUSH, LogwrtResult.Flush);
    2945              : 
    2946              :     /*
    2947              :      * If we still haven't flushed to the request point then we have a
    2948              :      * problem; most likely, the requested flush point is past end of XLOG.
    2949              :      * This has been seen to occur when a disk page has a corrupted LSN.
    2950              :      *
    2951              :      * Formerly we treated this as a PANIC condition, but that hurts the
    2952              :      * system's robustness rather than helping it: we do not want to take down
    2953              :      * the whole system due to corruption on one data page.  In particular, if
    2954              :      * the bad page is encountered again during recovery then we would be
    2955              :      * unable to restart the database at all!  (This scenario actually
    2956              :      * happened in the field several times with 7.1 releases.)  As of 8.4, bad
    2957              :      * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
    2958              :      * the only time we can reach here during recovery is while flushing the
    2959              :      * end-of-recovery checkpoint record, and we don't expect that to have a
    2960              :      * bad LSN.
    2961              :      *
    2962              :      * Note that for calls from xact.c, the ERROR will be promoted to PANIC
    2963              :      * since xact.c calls this routine inside a critical section.  However,
    2964              :      * calls from bufmgr.c are not within critical sections and so we will not
    2965              :      * force a restart for a bad LSN on a data page.
    2966              :      */
    2967       176741 :     if (LogwrtResult.Flush < record)
    2968            0 :         elog(ERROR,
    2969              :              "xlog flush request %X/%08X is not satisfied --- flushed only to %X/%08X",
    2970              :              LSN_FORMAT_ARGS(record),
    2971              :              LSN_FORMAT_ARGS(LogwrtResult.Flush));
    2972              : 
    2973              :     /*
    2974              :      * Cross-check XLogNeedsFlush().  Some of the checks of XLogFlush() and
    2975              :      * XLogNeedsFlush() are duplicated, and this assertion ensures that these
    2976              :      * remain consistent.
    2977              :      */
    2978              :     Assert(!XLogNeedsFlush(record));
    2979              : }
    2980              : 
    2981              : /*
    2982              :  * Write & flush xlog, but without specifying exactly where to.
    2983              :  *
    2984              :  * We normally write only completed blocks; but if there is nothing to do on
    2985              :  * that basis, we check for unwritten async commits in the current incomplete
    2986              :  * block, and write through the latest one of those.  Thus, if async commits
    2987              :  * are not being used, we will write complete blocks only.
    2988              :  *
    2989              :  * If, based on the above, there's anything to write we do so immediately. But
    2990              :  * to avoid calling fsync, fdatasync et. al. at a rate that'd impact
    2991              :  * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
    2992              :  * more than wal_writer_flush_after unflushed blocks.
    2993              :  *
    2994              :  * We can guarantee that async commits reach disk after at most three
    2995              :  * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
    2996              :  * to write "flexibly", meaning it can stop at the end of the buffer ring;
    2997              :  * this makes a difference only with very high load or long wal_writer_delay,
    2998              :  * but imposes one extra cycle for the worst case for async commits.)
    2999              :  *
    3000              :  * This routine is invoked periodically by the background walwriter process.
    3001              :  *
    3002              :  * Returns true if there was any work to do, even if we skipped flushing due
    3003              :  * to wal_writer_delay/wal_writer_flush_after.
    3004              :  */
    3005              : bool
    3006        15712 : XLogBackgroundFlush(void)
    3007              : {
    3008              :     XLogwrtRqst WriteRqst;
    3009        15712 :     bool        flexible = true;
    3010              :     static TimestampTz lastflush;
    3011              :     TimestampTz now;
    3012              :     int         flushblocks;
    3013              :     TimeLineID  insertTLI;
    3014              : 
    3015              :     /* XLOG doesn't need flushing during recovery */
    3016        15712 :     if (RecoveryInProgress())
    3017            0 :         return false;
    3018              : 
    3019              :     /*
    3020              :      * Since we're not in recovery, InsertTimeLineID is set and can't change,
    3021              :      * so we can read it without a lock.
    3022              :      */
    3023        15712 :     insertTLI = XLogCtl->InsertTimeLineID;
    3024              : 
    3025              :     /* read updated LogwrtRqst */
    3026        15712 :     SpinLockAcquire(&XLogCtl->info_lck);
    3027        15712 :     WriteRqst = XLogCtl->LogwrtRqst;
    3028        15712 :     SpinLockRelease(&XLogCtl->info_lck);
    3029              : 
    3030              :     /* back off to last completed page boundary */
    3031        15712 :     WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
    3032              : 
    3033              :     /* if we have already flushed that far, consider async commit records */
    3034        15712 :     RefreshXLogWriteResult(LogwrtResult);
    3035        15712 :     if (WriteRqst.Write <= LogwrtResult.Flush)
    3036              :     {
    3037        11700 :         SpinLockAcquire(&XLogCtl->info_lck);
    3038        11700 :         WriteRqst.Write = XLogCtl->asyncXactLSN;
    3039        11700 :         SpinLockRelease(&XLogCtl->info_lck);
    3040        11700 :         flexible = false;       /* ensure it all gets written */
    3041              :     }
    3042              : 
    3043              :     /*
    3044              :      * If already known flushed, we're done. Just need to check if we are
    3045              :      * holding an open file handle to a logfile that's no longer in use,
    3046              :      * preventing the file from being deleted.
    3047              :      */
    3048        15712 :     if (WriteRqst.Write <= LogwrtResult.Flush)
    3049              :     {
    3050        10855 :         if (openLogFile >= 0)
    3051              :         {
    3052         6696 :             if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
    3053              :                                  wal_segment_size))
    3054              :             {
    3055          175 :                 XLogFileClose();
    3056              :             }
    3057              :         }
    3058        10855 :         return false;
    3059              :     }
    3060              : 
    3061              :     /*
    3062              :      * Determine how far to flush WAL, based on the wal_writer_delay and
    3063              :      * wal_writer_flush_after GUCs.
    3064              :      *
    3065              :      * Note that XLogSetAsyncXactLSN() performs similar calculation based on
    3066              :      * wal_writer_flush_after, to decide when to wake us up.  Make sure the
    3067              :      * logic is the same in both places if you change this.
    3068              :      */
    3069         4857 :     now = GetCurrentTimestamp();
    3070         4857 :     flushblocks =
    3071         4857 :         WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
    3072              : 
    3073         4857 :     if (WalWriterFlushAfter == 0 || lastflush == 0)
    3074              :     {
    3075              :         /* first call, or block based limits disabled */
    3076          285 :         WriteRqst.Flush = WriteRqst.Write;
    3077          285 :         lastflush = now;
    3078              :     }
    3079         4572 :     else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
    3080              :     {
    3081              :         /*
    3082              :          * Flush the writes at least every WalWriterDelay ms. This is
    3083              :          * important to bound the amount of time it takes for an asynchronous
    3084              :          * commit to hit disk.
    3085              :          */
    3086         4268 :         WriteRqst.Flush = WriteRqst.Write;
    3087         4268 :         lastflush = now;
    3088              :     }
    3089          304 :     else if (flushblocks >= WalWriterFlushAfter)
    3090              :     {
    3091              :         /* exceeded wal_writer_flush_after blocks, flush */
    3092          252 :         WriteRqst.Flush = WriteRqst.Write;
    3093          252 :         lastflush = now;
    3094              :     }
    3095              :     else
    3096              :     {
    3097              :         /* no flushing, this time round */
    3098           52 :         WriteRqst.Flush = InvalidXLogRecPtr;
    3099              :     }
    3100              : 
    3101              : #ifdef WAL_DEBUG
    3102              :     if (XLOG_DEBUG)
    3103              :         elog(LOG, "xlog bg flush request write %X/%08X; flush: %X/%08X, current is write %X/%08X; flush %X/%08X",
    3104              :              LSN_FORMAT_ARGS(WriteRqst.Write),
    3105              :              LSN_FORMAT_ARGS(WriteRqst.Flush),
    3106              :              LSN_FORMAT_ARGS(LogwrtResult.Write),
    3107              :              LSN_FORMAT_ARGS(LogwrtResult.Flush));
    3108              : #endif
    3109              : 
    3110         4857 :     START_CRIT_SECTION();
    3111              : 
    3112              :     /* now wait for any in-progress insertions to finish and get write lock */
    3113         4857 :     WaitXLogInsertionsToFinish(WriteRqst.Write);
    3114         4857 :     LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
    3115         4857 :     RefreshXLogWriteResult(LogwrtResult);
    3116         4857 :     if (WriteRqst.Write > LogwrtResult.Write ||
    3117          139 :         WriteRqst.Flush > LogwrtResult.Flush)
    3118              :     {
    3119         4780 :         XLogWrite(WriteRqst, insertTLI, flexible);
    3120              :     }
    3121         4857 :     LWLockRelease(WALWriteLock);
    3122              : 
    3123         4857 :     END_CRIT_SECTION();
    3124              : 
    3125              :     /* wake up walsenders now that we've released heavily contended locks */
    3126         4857 :     WalSndWakeupProcessRequests(true, !RecoveryInProgress());
    3127              : 
    3128              :     /*
    3129              :      * If we flushed an LSN that someone was waiting for, notify the waiters.
    3130              :      */
    3131         9714 :     if (waitLSNState &&
    3132         4857 :         (LogwrtResult.Flush >=
    3133         4857 :          pg_atomic_read_u64(&waitLSNState->minWaitedLSN[WAIT_LSN_TYPE_PRIMARY_FLUSH])))
    3134            0 :         WaitLSNWakeup(WAIT_LSN_TYPE_PRIMARY_FLUSH, LogwrtResult.Flush);
    3135              : 
    3136              :     /*
    3137              :      * Great, done. To take some work off the critical path, try to initialize
    3138              :      * as many of the no-longer-needed WAL buffers for future use as we can.
    3139              :      */
    3140         4857 :     AdvanceXLInsertBuffer(InvalidXLogRecPtr, insertTLI, true);
    3141              : 
    3142              :     /*
    3143              :      * If we determined that we need to write data, but somebody else
    3144              :      * wrote/flushed already, it should be considered as being active, to
    3145              :      * avoid hibernating too early.
    3146              :      */
    3147         4857 :     return true;
    3148              : }
    3149              : 
    3150              : /*
    3151              :  * Test whether XLOG data has been flushed up to (at least) the given
    3152              :  * position, or whether the minimum recovery point has been updated past
    3153              :  * the given position.
    3154              :  *
    3155              :  * Returns true if a flush is still needed, or if the minimum recovery point
    3156              :  * must be updated.
    3157              :  *
    3158              :  * It is possible that someone else is already in the process of flushing
    3159              :  * that far, or has updated the minimum recovery point up to the given
    3160              :  * position, so a true result may be out of date by the time it is used.
    3161              :  */
    3162              : bool
    3163     16187133 : XLogNeedsFlush(XLogRecPtr record)
    3164              : {
    3165              :     /*
    3166              :      * During recovery, we don't flush WAL but update minRecoveryPoint
    3167              :      * instead. So "needs flush" is taken to mean whether minRecoveryPoint
    3168              :      * would need to be updated.
    3169              :      *
    3170              :      * Using XLogInsertAllowed() rather than RecoveryInProgress() matters for
    3171              :      * the case of an end-of-recovery checkpoint, where WAL data is flushed.
    3172              :      * This check should be consistent with the one in XLogFlush().
    3173              :      */
    3174     16187133 :     if (!XLogInsertAllowed())
    3175              :     {
    3176              :         /* Quick exit if it cannot be updated, or is already known to cover record */
    3177       526211 :         if (!updateMinRecoveryPoint || record <= LocalMinRecoveryPoint)
    3178       510259 :             return false;
    3179              : 
    3180              :         /*
    3181              :          * An invalid minRecoveryPoint means that we need to recover all the
    3182              :          * WAL, i.e., we're doing crash recovery.  We never modify the control
    3183              :          * file's value in that case, so we can short-circuit future checks
    3184              :          * here too.  This triggers a quick exit path for the startup process,
    3185              :          * which cannot update its local copy of minRecoveryPoint as long as
    3186              :          * it has not replayed all WAL available when doing crash recovery.
    3187              :          */
    3188        15952 :         if (!XLogRecPtrIsValid(LocalMinRecoveryPoint) && InRecovery)
    3189              :         {
    3190            0 :             updateMinRecoveryPoint = false;
    3191            0 :             return false;
    3192              :         }
    3193              : 
    3194              :         /*
    3195              :          * Update local copy of minRecoveryPoint. But if the lock is busy,
    3196              :          * just return a conservative guess, i.e. true.
    3197              :          */
    3198        15952 :         if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
    3199            0 :             return true;
    3200        15952 :         LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    3201        15952 :         LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    3202        15952 :         LWLockRelease(ControlFileLock);
    3203              : 
    3204              :         /*
    3205              :          * Check minRecoveryPoint for any other process than the startup
    3206              :          * process doing crash recovery, which should not update the control
    3207              :          * file value if crash recovery is still running.
    3208              :          */
    3209        15952 :         if (!XLogRecPtrIsValid(LocalMinRecoveryPoint))
    3210            0 :             updateMinRecoveryPoint = false;
    3211              : 
    3212              :         /* check again, now against the refreshed local copy */
    3213        15952 :         if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
    3214          100 :             return false;
    3215              :         else
    3216        15852 :             return true;
    3217              :     }
    3218              : 
    3219              :     /* Quick exit if the cached flush position already covers record */
    3220     15660922 :     if (record <= LogwrtResult.Flush)
    3221     15460757 :         return false;
    3222              : 
    3223              :     /* read LogwrtResult and update local state */
    3224       200165 :     RefreshXLogWriteResult(LogwrtResult);
    3225              : 
    3226              :     /* check again with the refreshed value */
    3227       200165 :     if (record <= LogwrtResult.Flush)
    3228         2908 :         return false;
    3229              : 
    3230       197257 :     return true;
    3231              : }
    3232              : 
    3233              : /*
    3234              :  * Try to make a given XLOG file segment exist.
    3235              :  *
    3236              :  * logsegno, logtli: identify segment and its timeline (logtli must not be 0).
    3237              :  *
    3238              :  * *added: on return, true if this call raised the number of extant segments.
    3239              :  *
    3240              :  * path: on return, this char[MAXPGPATH] has the path to the logsegno file.
    3241              :  *
    3242              :  * Returns -1 or FD of opened file.  A -1 here is not an error; a caller
    3243              :  * wanting an open segment should attempt to open "path", which usually will
    3244              :  * succeed.  (This is weird, but it's efficient for the callers.)
    3245              :  */
    3246              : static int
    3247        16306 : XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
    3248              :                      bool *added, char *path)
    3249              : {
    3250              :     char        tmppath[MAXPGPATH];
    3251              :     XLogSegNo   installed_segno;
    3252              :     XLogSegNo   max_segno;
    3253              :     int         fd;
    3254              :     int         save_errno;
    3255        16306 :     int         open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY;
    3256              :     instr_time  io_start;
    3257              : 
    3258              :     Assert(logtli != 0);
    3259              : 
    3260        16306 :     XLogFilePath(path, logtli, logsegno, wal_segment_size);
    3261              : 
    3262              :     /*
    3263              :      * Try to use an existing file (checkpoint maker may have created it already)
    3264              :      */
    3265        16306 :     *added = false;
    3266        16306 :     fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
    3267        16306 :                        get_sync_bit(wal_sync_method));
    3268        16306 :     if (fd < 0)
    3269              :     {
    3270         1443 :         if (errno != ENOENT)
    3271            0 :             ereport(ERROR,
    3272              :                     (errcode_for_file_access(),
    3273              :                      errmsg("could not open file \"%s\": %m", path)));
    3274              :     }
    3275              :     else
    3276        14863 :         return fd;
    3277              : 
    3278              :     /*
    3279              :      * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
    3280              :      * another process is doing the same thing.  If so, we will end up
    3281              :      * pre-creating an extra log segment.  That seems OK, and better than
    3282              :      * holding the lock throughout this lengthy process.
    3283              :      */
    3284         1443 :     elog(DEBUG2, "creating and filling new WAL file");
    3285              : 
    3286         1443 :     snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
    3287              : 
    3288         1443 :     unlink(tmppath);
    3289              : 
    3290         1443 :     if (io_direct_flags & IO_DIRECT_WAL_INIT)
    3291            0 :         open_flags |= PG_O_DIRECT;
    3292              : 
    3293              :     /* do not use get_sync_bit() here --- want to fsync only at end of fill */
    3294         1443 :     fd = BasicOpenFile(tmppath, open_flags);
    3295         1443 :     if (fd < 0)
    3296            0 :         ereport(ERROR,
    3297              :                 (errcode_for_file_access(),
    3298              :                  errmsg("could not create file \"%s\": %m", tmppath)));
    3299              : 
    3300              :     /* Measure I/O timing when initializing segment */
    3301         1443 :     io_start = pgstat_prepare_io_time(track_wal_io_timing);
    3302              : 
    3303         1443 :     pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
    3304         1443 :     save_errno = 0;
    3305         1443 :     if (wal_init_zero)
    3306              :     {
    3307              :         ssize_t     rc;
    3308              : 
    3309              :         /*
    3310              :          * Zero-fill the file.  With this setting, we do this the hard way to
    3311              :          * ensure that all the file space has really been allocated.  On
    3312              :          * platforms that allow "holes" in files, just seeking to the end
    3313              :          * doesn't allocate intermediate space.  This way, we know that we
    3314              :          * have all the space and (after the fsync below) that all the
    3315              :          * indirect blocks are down on disk.  Therefore, fdatasync(2) or
    3316              :          * O_DSYNC will be sufficient to sync future writes to the log file.
    3317              :          */
    3318         1443 :         rc = pg_pwrite_zeros(fd, wal_segment_size, 0);
    3319              : 
    3320         1443 :         if (rc < 0)
    3321            0 :             save_errno = errno;
    3322              :     }
    3323              :     else
    3324              :     {
    3325              :         /*
    3326              :          * Otherwise, writing a solitary byte at the last offset of the
    3327              :          * segment is enough.
    3328              :          */
    3329            0 :         errno = 0;
    3330            0 :         if (pg_pwrite(fd, "\0", 1, wal_segment_size - 1) != 1)
    3331              :         {
    3332              :             /* if write didn't set errno, assume no disk space */
    3333            0 :             save_errno = errno ? errno : ENOSPC;
    3334              :         }
    3335              :     }
    3336         1443 :     pgstat_report_wait_end();
    3337              : 
    3338              :     /*
    3339              :      * A full segment worth of data is written when using wal_init_zero. One
    3340              :      * byte is written when not using it.
    3341              :      */
    3342         1443 :     pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT, IOOP_WRITE,
    3343              :                             io_start, 1,
    3344         1443 :                             wal_init_zero ? wal_segment_size : 1);
    3345              : 
    3346         1443 :     if (save_errno)
    3347              :     {
    3348              :         /*
    3349              :          * If we fail to make the file, delete it to release disk space
    3350              :          */
    3351            0 :         unlink(tmppath);
    3352              : 
    3353            0 :         close(fd);
    3354              : 
    3355            0 :         errno = save_errno;     /* unlink/close may have clobbered errno */
    3356              : 
    3357            0 :         ereport(ERROR,
    3358              :                 (errcode_for_file_access(),
    3359              :                  errmsg("could not write to file \"%s\": %m", tmppath)));
    3360              :     }
    3361              : 
    3362              :     /* Measure I/O timing when flushing segment */
    3363         1443 :     io_start = pgstat_prepare_io_time(track_wal_io_timing);
    3364              : 
    3365         1443 :     pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
    3366         1443 :     if (pg_fsync(fd) != 0)
    3367              :     {
    3368            0 :         save_errno = errno;
    3369            0 :         close(fd);              /* NOTE(review): tmppath is not unlinked here */
    3370            0 :         errno = save_errno;
    3371            0 :         ereport(ERROR,
    3372              :                 (errcode_for_file_access(),
    3373              :                  errmsg("could not fsync file \"%s\": %m", tmppath)));
    3374              :     }
    3375         1443 :     pgstat_report_wait_end();
    3376              : 
    3377         1443 :     pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT,
    3378              :                             IOOP_FSYNC, io_start, 1, 0);
    3379              : 
    3380         1443 :     if (close(fd) != 0)
    3381            0 :         ereport(ERROR,
    3382              :                 (errcode_for_file_access(),
    3383              :                  errmsg("could not close file \"%s\": %m", tmppath)));
    3384              : 
    3385              :     /*
    3386              :      * Now move the segment into place with its final name.  Cope with
    3387              :      * possibility that someone else has created the file while we were
    3388              :      * filling ours: if so, use ours to pre-create a future log segment.
    3389              :      */
    3390         1443 :     installed_segno = logsegno;
    3391              : 
    3392              :     /*
    3393              :      * XXX: What should we use as max_segno? We used to use XLOGfileslop when
    3394              :      * that was a constant, but that was always a bit dubious: normally, at a
    3395              :      * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
    3396              :      * here, it was the offset from the insert location. We can't do the
    3397              :      * normal XLOGfileslop calculation here because we don't have access to
    3398              :      * the prior checkpoint's redo location. So somewhat arbitrarily, just use
    3399              :      * CheckPointSegments.
    3400              :      */
    3401         1443 :     max_segno = logsegno + CheckPointSegments;
    3402         1443 :     if (InstallXLogFileSegment(&installed_segno, tmppath, true, max_segno,
    3403              :                                logtli))
    3404              :     {
    3405         1443 :         *added = true;
    3406         1443 :         elog(DEBUG2, "done creating and filling new WAL file");
    3407              :     }
    3408              :     else
    3409              :     {
    3410              :         /*
    3411              :          * No need for any more future segments, or InstallXLogFileSegment()
    3412              :          * failed to rename the file into place. If the rename failed, a
    3413              :          * caller opening the file may fail.
    3414              :          */
    3415            0 :         unlink(tmppath);
    3416            0 :         elog(DEBUG2, "abandoned new WAL file");
    3417              :     }
    3418              : 
    3419         1443 :     return -1;                  /* not an error: caller opens "path" itself */
    3420              : }
    3421              : 
    3422              : /*
    3423              :  * Create a new XLOG file segment, or open a pre-existing one.
    3424              :  *
    3425              :  * logsegno, logtli: identify segment (and its timeline) to be created/opened.
    3426              :  *
    3427              :  * Returns FD of opened file.
    3428              :  *
    3429              :  * Note: errors here are ERROR not PANIC because we might or might not be
    3430              :  * inside a critical section (eg, during checkpoint there is no reason to
    3431              :  * take down the system on failure).  They will promote to PANIC if we are
    3432              :  * in a critical section.
    3433              :  */
    3434              : int
    3435        16081 : XLogFileInit(XLogSegNo logsegno, TimeLineID logtli)
    3436              : {
    3437              :     bool        ignore_added;
    3438              :     char        path[MAXPGPATH];
    3439              :     int         fd;
    3440              : 
    3441              :     Assert(logtli != 0);
    3442              : 
    3443        16081 :     fd = XLogFileInitInternal(logsegno, logtli, &ignore_added, path);
    3444        16081 :     if (fd >= 0)
    3445        14700 :         return fd;              /* segment already existed and is now open */
    3446              : 
    3447              :     /* Now open original target segment (might not be file I just made) */
    3448         1381 :     fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
    3449         1381 :                        get_sync_bit(wal_sync_method));
    3450         1381 :     if (fd < 0)
    3451            0 :         ereport(ERROR,
    3452              :                 (errcode_for_file_access(),
    3453              :                  errmsg("could not open file \"%s\": %m", path)));
    3454         1381 :     return fd;
    3455              : }
    3456              : 
    3457              : /*
    3458              :  * Create a new XLOG file segment by copying a pre-existing one.
    3459              :  *
    3460              :  * destTLI, destsegno: identify segment to be created.
    3461              :  *
    3462              :  * srcTLI, srcsegno: identify segment to be copied (could be from
    3463              :  *      a different timeline)
    3464              :  *
    3465              :  * upto: how many bytes of the source file to copy (the rest is filled
    3466              :  *      with zeros)
    3467              :  *
    3468              :  * Currently this is only used during recovery, and so there are no locking
    3469              :  * considerations.  But we should be just as careful as XLogFileInit to avoid
    3470              :  * emplacing a bogus file.
    3471              :  */
    3472              : static void
    3473           43 : XLogFileCopy(TimeLineID destTLI, XLogSegNo destsegno,
    3474              :              TimeLineID srcTLI, XLogSegNo srcsegno,
    3475              :              int upto)
    3476              : {
    3477              :     char        path[MAXPGPATH];
    3478              :     char        tmppath[MAXPGPATH];
    3479              :     PGAlignedXLogBlock buffer;
    3480              :     int         srcfd;
    3481              :     int         fd;
    3482              :     int         nbytes;
    3483              : 
    3484              :     /*
    3485              :      * Open the source file
    3486              :      */
    3487           43 :     XLogFilePath(path, srcTLI, srcsegno, wal_segment_size);
    3488           43 :     srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
    3489           43 :     if (srcfd < 0)
    3490            0 :         ereport(ERROR,
    3491              :                 (errcode_for_file_access(),
    3492              :                  errmsg("could not open file \"%s\": %m", path)));
    3493              : 
    3494              :     /*
    3495              :      * Copy into a temp file name.
    3496              :      */
    3497           43 :     snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
    3498              : 
    3499           43 :     unlink(tmppath);
    3500              : 
    3501              :     /* do not use get_sync_bit() here --- want to fsync only at end of fill */
    3502           43 :     fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
    3503           43 :     if (fd < 0)
    3504            0 :         ereport(ERROR,
    3505              :                 (errcode_for_file_access(),
    3506              :                  errmsg("could not create file \"%s\": %m", tmppath)));
    3507              : 
    3508              :     /*
    3509              :      * Do the data copying, one sizeof(buffer) block per iteration.
    3510              :      */
    3511        88107 :     for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer))
    3512              :     {
    3513              :         int         nread;
    3514              : 
    3515        88064 :         nread = upto - nbytes;
    3516              : 
    3517              :         /*
    3518              :          * The part not read from the source file is filled with zeros.  NOTE(review):
    3519              :          * a negative nread converts to size_t here, so later passes skip the memset.
    3520              :          */
    3521        88064 :         if (nread < sizeof(buffer))
    3522           43 :             memset(buffer.data, 0, sizeof(buffer));
    3523              : 
    3524        88064 :         if (nread > 0)
    3525              :         {
    3526              :             int         r;
    3527              : 
    3528         2802 :             if (nread > sizeof(buffer))
    3529         2759 :                 nread = sizeof(buffer);
    3530         2802 :             pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
    3531         2802 :             r = read(srcfd, buffer.data, nread);
    3532         2802 :             if (r != nread)
    3533              :             {
    3534            0 :                 if (r < 0)
    3535            0 :                     ereport(ERROR,
    3536              :                             (errcode_for_file_access(),
    3537              :                              errmsg("could not read file \"%s\": %m",
    3538              :                                     path)));
    3539              :                 else
    3540            0 :                     ereport(ERROR,
    3541              :                             (errcode(ERRCODE_DATA_CORRUPTED),
    3542              :                              errmsg("could not read file \"%s\": read %d of %zu",
    3543              :                                     path, r, (Size) nread)));
    3544              :             }
    3545         2802 :             pgstat_report_wait_end();
    3546              :         }
    3547        88064 :         errno = 0;
    3548        88064 :         pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
    3549        88064 :         if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer))
    3550              :         {
    3551            0 :             int         save_errno = errno;
    3552              : 
    3553              :             /*
    3554              :              * If we fail to make the file, delete it to release disk space
    3555              :              */
    3556            0 :             unlink(tmppath);
    3557              :             /* if write didn't set errno, assume problem is no disk space */
    3558            0 :             errno = save_errno ? save_errno : ENOSPC;
    3559              : 
    3560            0 :             ereport(ERROR,
    3561              :                     (errcode_for_file_access(),
    3562              :                      errmsg("could not write to file \"%s\": %m", tmppath)));
    3563              :         }
    3564        88064 :         pgstat_report_wait_end();
    3565              :     }
    3566              : 
    3567           43 :     pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
    3568           43 :     if (pg_fsync(fd) != 0)
    3569            0 :         ereport(data_sync_elevel(ERROR),
    3570              :                 (errcode_for_file_access(),
    3571              :                  errmsg("could not fsync file \"%s\": %m", tmppath)));
    3572           43 :     pgstat_report_wait_end();
    3573              : 
    3574           43 :     if (CloseTransientFile(fd) != 0)
    3575            0 :         ereport(ERROR,
    3576              :                 (errcode_for_file_access(),
    3577              :                  errmsg("could not close file \"%s\": %m", tmppath)));
    3578              : 
    3579           43 :     if (CloseTransientFile(srcfd) != 0)
    3580            0 :         ereport(ERROR,
    3581              :                 (errcode_for_file_access(),
    3582              :                  errmsg("could not close file \"%s\": %m", path)));
    3583              : 
    3584              :     /*
    3585              :      * Now move the segment into place with its final name, replacing any existing file.
    3586              :      */
    3587           43 :     if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, destTLI))
    3588            0 :         elog(ERROR, "InstallXLogFileSegment should not have failed");
    3589           43 : }
    3590              : 
    3591              : /*
    3592              :  * Install a new XLOG segment file as a current or future log segment.
    3593              :  *
    3594              :  * This is used both to install a newly-created segment (which has a temp
    3595              :  * filename while it's being created) and to recycle an old segment.
    3596              :  *
    3597              :  * *segno: identify segment to install as (or first possible target).
    3598              :  * When find_free is true, this is modified on return to indicate the
    3599              :  * actual installation location or last segment searched.
    3600              :  *
    3601              :  * tmppath: initial name of file to install.  It will be renamed into place.
    3602              :  *
    3603              :  * find_free: if true, install the new segment at the first empty segment
    3604              :  * number at or after the passed number.  If false, install the new segment
    3605              :  * exactly where specified, deleting any existing segment file there.
    3606              :  *
    3607              :  * max_segno: maximum segment number to install the new file as.  Fail if no
    3608              :  * free slot is found between *segno and max_segno. (Ignored when find_free
    3609              :  * is false.)
    3610              :  *
    3611              :  * tli: The timeline on which the new segment should be installed.
    3612              :  *
    3613              :  * Returns true if the file was installed successfully.  false indicates that
    3614              :  * max_segno limit was exceeded, the startup process has disabled this
    3615              :  * function for now, or an error occurred while renaming the file into place.
    3616              :  */
    3617              : static bool
    3618         3236 : InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
    3619              :                        bool find_free, XLogSegNo max_segno, TimeLineID tli)
    3620              : {
    3621              :     char        path[MAXPGPATH];
    3622              :     struct stat stat_buf;
    3623              : 
    3624              :     Assert(tli != 0);
    3625              : 
    3626         3236 :     XLogFilePath(path, tli, *segno, wal_segment_size);
    3627              : 
    3628         3236 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    3629         3236 :     if (!XLogCtl->InstallXLogFileSegmentActive)
    3630              :     {
    3631            0 :         LWLockRelease(ControlFileLock);
    3632            0 :         return false;           /* installation is disabled for now */
    3633              :     }
    3634              : 
    3635         3236 :     if (!find_free)
    3636              :     {
    3637              :         /* Force installation: get rid of any pre-existing segment file */
    3638           43 :         durable_unlink(path, DEBUG1);
    3639              :     }
    3640              :     else
    3641              :     {
    3642              :         /* Find a free slot to put it in, advancing *segno as we search */
    3643         4665 :         while (stat(path, &stat_buf) == 0)
    3644              :         {
    3645         1645 :             if ((*segno) >= max_segno)
    3646              :             {
    3647              :                 /* Failed to find a free slot within specified range */
    3648          173 :                 LWLockRelease(ControlFileLock);
    3649          173 :                 return false;
    3650              :             }
    3651         1472 :             (*segno)++;
    3652         1472 :             XLogFilePath(path, tli, *segno, wal_segment_size);
    3653              :         }
    3654              :     }
    3655              : 
    3656              :     Assert(access(path, F_OK) != 0 && errno == ENOENT);
    3657         3063 :     if (durable_rename(tmppath, path, LOG) != 0)
    3658              :     {
    3659            0 :         LWLockRelease(ControlFileLock);
    3660              :         /* durable_rename already emitted log message */
    3661            0 :         return false;
    3662              :     }
    3663              : 
    3664         3063 :     LWLockRelease(ControlFileLock);
    3665              : 
    3666         3063 :     return true;
    3667              : }
    3668              : 
    3669              : /*
    3670              :  * Open a pre-existing logfile segment for writing; failure is a PANIC.
    3671              :  */
    3672              : int
    3673          129 : XLogFileOpen(XLogSegNo segno, TimeLineID tli)
    3674              : {
    3675              :     char        path[MAXPGPATH];
    3676              :     int         fd;
    3677              : 
    3678          129 :     XLogFilePath(path, tli, segno, wal_segment_size);
    3679              : 
    3680          129 :     fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
    3681          129 :                        get_sync_bit(wal_sync_method));
    3682          129 :     if (fd < 0)
    3683            0 :         ereport(PANIC,
    3684              :                 (errcode_for_file_access(),
    3685              :                  errmsg("could not open file \"%s\": %m", path)));
    3686              : 
    3687          129 :     return fd;
    3688              : }
    3689              : 
    3690              : /*
    3691              :  * Close the currently open logfile segment (openLogFile); failure is a PANIC.
    3692              :  */
    3693              : static void
    3694         7125 : XLogFileClose(void)
    3695              : {
    3696              :     Assert(openLogFile >= 0);
    3697              : 
    3698              :     /*
    3699              :      * WAL segment files will not be re-read in normal operation, so we advise
    3700              :      * the OS to release any cached pages.  But do not do so if WAL archiving
    3701              :      * or streaming is active, because archiver and walsender processes could
    3702              :      * use the cache to read the WAL segment.
    3703              :      */
    3704              : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
    3705         7125 :     if (!XLogIsNeeded() && (io_direct_flags & IO_DIRECT_WAL) == 0)
    3706          250 :         (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
    3707              : #endif
    3708              : 
    3709         7125 :     if (close(openLogFile) != 0)
    3710              :     {
    3711              :         char        xlogfname[MAXFNAMELEN];
    3712            0 :         int         save_errno = errno;
    3713              : 
    3714            0 :         XLogFileName(xlogfname, openLogTLI, openLogSegNo, wal_segment_size);
    3715            0 :         errno = save_errno;     /* restore close()'s errno for %m */
    3716            0 :         ereport(PANIC,
    3717              :                 (errcode_for_file_access(),
    3718              :                  errmsg("could not close file \"%s\": %m", xlogfname)));
    3719              :     }
    3720              : 
    3721         7125 :     openLogFile = -1;
    3722         7125 :     ReleaseExternalFD();
    3723         7125 : }
    3724              : 
    3725              : /*
    3726              :  * Preallocate log files beyond the specified log endpoint.
    3727              :  *
    3728              :  * XXX this is currently extremely conservative, since it forces only one
    3729              :  * future log segment to exist, and even that only if we are 75% done with
    3730              :  * the current one.  This is only appropriate for very low-WAL-volume systems.
    3731              :  * High-volume systems will be OK once they've built up a sufficient set of
    3732              :  * recycled log segments, but the startup transient is likely to include
    3733              :  * a lot of segment creations by foreground processes, which is not so good.
    3734              :  *
    3735              :  * XLogFileInitInternal() can ereport(ERROR).  All known causes indicate big
    3736              :  * trouble; for example, a full filesystem is one cause.  The checkpoint WAL
    3737              :  * and/or ControlFile updates already completed.  If a RequestCheckpoint()
    3738              :  * initiated the present checkpoint and an ERROR ends this function, the
    3739              :  * command that called RequestCheckpoint() fails.  That's not ideal, but it's
    3740              :  * not worth contorting more functions to use caller-specified elevel values.
    3741              :  * (With or without RequestCheckpoint(), an ERROR forestalls some inessential
    3742              :  * reporting and resource reclamation.)
    3743              :  */
    3744              : static void
    3745         2209 : PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli)
    3746              : {
    3747              :     XLogSegNo   _logSegNo;
    3748              :     int         lf;
    3749              :     bool        added;
    3750              :     char        path[MAXPGPATH];
    3751              :     uint64      offset;
    3752              : 
    3753         2209 :     if (!XLogCtl->InstallXLogFileSegmentActive)
    3754           11 :         return;                 /* unlocked check says no */
    3755              : 
    3756         2198 :     XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
    3757         2198 :     offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
    3758         2198 :     if (offset >= (uint32) (0.75 * wal_segment_size))
    3759              :     {
    3760          225 :         _logSegNo++;
    3761          225 :         lf = XLogFileInitInternal(_logSegNo, tli, &added, path);
    3762          225 :         if (lf >= 0)
    3763          163 :             close(lf);          /* already existed; the FD is not needed */
    3764          225 :         if (added)
    3765           62 :             CheckpointStats.ckpt_segs_added++;  /* this call created it */
    3766              :     }
    3767              : }
    3768              : 
    3769              : /*
    3770              :  * Throws an error if the given log segment has already been removed or
    3771              :  * recycled. The caller should only pass a segment that it knows to have
    3772              :  * existed while the server has been running, as this function always
    3773              :  * succeeds if no WAL segments have been removed since startup.
    3774              :  * 'tli' is only used in the error message.
    3775              :  *
    3776              :  * Note: this function guarantees to keep errno unchanged on return.
    3777              :  * This supports callers that use this to possibly deliver a better
    3778              :  * error message about a missing file, while still being able to throw
    3779              :  * a normal file-access error afterwards, if this does return.
    3780              :  */
    3781              : void
    3782       127743 : CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
    3783              : {
    3784       127743 :     int         save_errno = errno;
    3785              :     XLogSegNo   lastRemovedSegNo;
    3786              : 
    3787       127743 :     SpinLockAcquire(&XLogCtl->info_lck);
    3788       127743 :     lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
    3789       127743 :     SpinLockRelease(&XLogCtl->info_lck);
    3790              : 
    3791       127743 :     if (segno <= lastRemovedSegNo)
    3792              :     {
    3793              :         char        filename[MAXFNAMELEN];
    3794              : 
    3795            0 :         XLogFileName(filename, tli, segno, wal_segment_size);
    3796            0 :         errno = save_errno;
    3797            0 :         ereport(ERROR,
    3798              :                 (errcode_for_file_access(),
    3799              :                  errmsg("requested WAL segment %s has already been removed",
    3800              :                         filename)));
    3801              :     }
    3802       127743 :     errno = save_errno;
    3803       127743 : }
    3804              : 
    3805              : /*
    3806              :  * Return the last WAL segment removed, or 0 if no segment has been removed
    3807              :  * since startup.
    3808              :  *
    3809              :  * NB: the result can be out of date arbitrarily fast, the caller has to deal
    3810              :  * with that.
    3811              :  */
    3812              : XLogSegNo
    3813         1264 : XLogGetLastRemovedSegno(void)
    3814              : {
    3815              :     XLogSegNo   lastRemovedSegNo;
    3816              : 
    3817         1264 :     SpinLockAcquire(&XLogCtl->info_lck);
    3818         1264 :     lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
    3819         1264 :     SpinLockRelease(&XLogCtl->info_lck);
    3820              : 
    3821         1264 :     return lastRemovedSegNo;
    3822              : }
    3823              : 
    3824              : /*
    3825              :  * Return the oldest WAL segment on the given TLI that still exists in
    3826              :  * XLOGDIR, or 0 if none.
    3827              :  */
    3828              : XLogSegNo
    3829            7 : XLogGetOldestSegno(TimeLineID tli)
    3830              : {
    3831              :     DIR        *xldir;
    3832              :     struct dirent *xlde;
    3833            7 :     XLogSegNo   oldest_segno = 0;
    3834              : 
    3835            7 :     xldir = AllocateDir(XLOGDIR);
    3836           50 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    3837              :     {
    3838              :         TimeLineID  file_tli;
    3839              :         XLogSegNo   file_segno;
    3840              : 
    3841              :         /* Ignore files that are not XLOG segments. */
    3842           43 :         if (!IsXLogFileName(xlde->d_name))
    3843           29 :             continue;
    3844              : 
    3845              :         /* Parse filename to get TLI and segno. */
    3846           14 :         XLogFromFileName(xlde->d_name, &file_tli, &file_segno,
    3847              :                          wal_segment_size);
    3848              : 
    3849              :         /* Ignore anything that's not from the TLI of interest. */
    3850           14 :         if (tli != file_tli)
    3851            0 :             continue;
    3852              : 
    3853              :         /* If it's the oldest so far, update oldest_segno. */
    3854           14 :         if (oldest_segno == 0 || file_segno < oldest_segno)
    3855            9 :             oldest_segno = file_segno;
    3856              :     }
    3857              : 
    3858            7 :     FreeDir(xldir);
    3859            7 :     return oldest_segno;
    3860              : }
    3861              : 
    3862              : /*
    3863              :  * Update the last removed segno pointer in shared memory, to reflect that the
    3864              :  * given XLOG file has been removed.
    3865              :  */
    3866              : static void
    3867         2779 : UpdateLastRemovedPtr(char *filename)
    3868              : {
    3869              :     uint32      tli;
    3870              :     XLogSegNo   segno;
    3871              : 
    3872         2779 :     XLogFromFileName(filename, &tli, &segno, wal_segment_size);
    3873              : 
    3874         2779 :     SpinLockAcquire(&XLogCtl->info_lck);
    3875         2779 :     if (segno > XLogCtl->lastRemovedSegNo)
    3876         1265 :         XLogCtl->lastRemovedSegNo = segno;
    3877         2779 :     SpinLockRelease(&XLogCtl->info_lck);
    3878         2779 : }
    3879              : 
    3880              : /*
    3881              :  * Remove all temporary log files in pg_wal
    3882              :  *
    3883              :  * This is called at the beginning of recovery after a previous crash,
    3884              :  * at a point where no other processes write fresh WAL data.
    3885              :  */
    3886              : static void
    3887          188 : RemoveTempXlogFiles(void)
    3888              : {
    3889              :     DIR        *xldir;
    3890              :     struct dirent *xlde;
    3891              : 
    3892          188 :     elog(DEBUG2, "removing all temporary WAL segments");
    3893              : 
    3894          188 :     xldir = AllocateDir(XLOGDIR);
    3895         1271 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    3896              :     {
    3897              :         char        path[MAXPGPATH];
    3898              : 
    3899         1083 :         if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0)
    3900         1083 :             continue;
    3901              : 
    3902            0 :         snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
    3903            0 :         unlink(path);
    3904            0 :         elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
    3905              :     }
    3906          188 :     FreeDir(xldir);
    3907          188 : }
    3908              : 
    3909              : /*
    3910              :  * Recycle or remove all log files older or equal to passed segno.
    3911              :  *
    3912              :  * endptr is current (or recent) end of xlog, and lastredoptr is the
    3913              :  * redo pointer of the last checkpoint. These are used to determine
    3914              :  * whether we want to recycle rather than delete no-longer-wanted log files.
    3915              :  *
    3916              :  * insertTLI is the current timeline for XLOG insertion. Any recycled
    3917              :  * segments should be reused for this timeline.
    3918              :  */
    3919              : static void
    3920         1931 : RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr,
    3921              :                    TimeLineID insertTLI)
    3922              : {
    3923              :     DIR        *xldir;
    3924              :     struct dirent *xlde;
    3925              :     char        lastoff[MAXFNAMELEN];
    3926              :     XLogSegNo   endlogSegNo;
    3927              :     XLogSegNo   recycleSegNo;
    3928              : 
    3929              :     /* Initialize info about where to try to recycle to */
    3930         1931 :     XLByteToSeg(endptr, endlogSegNo, wal_segment_size);
    3931         1931 :     recycleSegNo = XLOGfileslop(lastredoptr);
    3932              : 
    3933              :     /*
    3934              :      * Construct a filename of the last segment to be kept. The timeline ID
    3935              :      * doesn't matter, we ignore that in the comparison. (During recovery,
    3936              :      * InsertTimeLineID isn't set, so we can't use that.)
    3937              :      */
    3938         1931 :     XLogFileName(lastoff, 0, segno, wal_segment_size);
    3939              : 
    3940         1931 :     elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
    3941              :          lastoff);
    3942              : 
    3943         1931 :     xldir = AllocateDir(XLOGDIR);
    3944              : 
    3945        29237 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    3946              :     {
    3947              :         /* Ignore files that are not XLOG segments */
    3948        27306 :         if (!IsXLogFileName(xlde->d_name) &&
    3949         8171 :             !IsPartialXLogFileName(xlde->d_name))
    3950         8169 :             continue;
    3951              : 
    3952              :         /*
    3953              :          * We ignore the timeline part of the XLOG segment identifiers in
    3954              :          * deciding whether a segment is still needed.  This ensures that we
    3955              :          * won't prematurely remove a segment from a parent timeline. We could
    3956              :          * probably be a little more proactive about removing segments of
    3957              :          * non-parent timelines, but that would be a whole lot more
    3958              :          * complicated.
    3959              :          *
    3960              :          * We use the alphanumeric sorting property of the filenames to decide
    3961              :          * which ones are earlier than the lastoff segment.
    3962              :          */
    3963        19137 :         if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
    3964              :         {
    3965        12241 :             if (XLogArchiveCheckDone(xlde->d_name))
    3966              :             {
    3967              :                 /* Update the last removed location in shared memory first */
    3968         2779 :                 UpdateLastRemovedPtr(xlde->d_name);
    3969              : 
    3970         2779 :                 RemoveXlogFile(xlde, recycleSegNo, &endlogSegNo, insertTLI);
    3971              :             }
    3972              :         }
    3973              :     }
    3974              : 
    3975         1931 :     FreeDir(xldir);
    3976         1931 : }
    3977              : 
    3978              : /*
    3979              :  * Recycle or remove WAL files that are not part of the given timeline's
    3980              :  * history.
    3981              :  *
    3982              :  * This is called during recovery, whenever we switch to follow a new
    3983              :  * timeline, and at the end of recovery when we create a new timeline. We
    3984              :  * wouldn't otherwise care about extra WAL files lying in pg_wal, but they
    3985              :  * might be leftover pre-allocated or recycled WAL segments on the old timeline
    3986              :  * that we haven't used yet, and contain garbage. If we just leave them in
    3987              :  * pg_wal, they will eventually be archived, and we can't let that happen.
    3988              :  * Files that belong to our timeline history are valid, because we have
    3989              :  * successfully replayed them, but from others we can't be sure.
    3990              :  *
    3991              :  * 'switchpoint' is the current point in WAL where we switch to new timeline,
    3992              :  * and 'newTLI' is the new timeline we switch to.
    3993              :  */
    3994              : void
    3995           67 : RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
    3996              : {
    3997              :     DIR        *xldir;
    3998              :     struct dirent *xlde;
    3999              :     char        switchseg[MAXFNAMELEN];
    4000              :     XLogSegNo   endLogSegNo;
    4001              :     XLogSegNo   switchLogSegNo;
    4002              :     XLogSegNo   recycleSegNo;
    4003              : 
    4004              :     /*
    4005              :      * Initialize info about where to begin the work.  This will recycle,
    4006              :      * somewhat arbitrarily, 10 future segments.
    4007              :      */
    4008           67 :     XLByteToPrevSeg(switchpoint, switchLogSegNo, wal_segment_size);
    4009           67 :     XLByteToSeg(switchpoint, endLogSegNo, wal_segment_size);
    4010           67 :     recycleSegNo = endLogSegNo + 10;
    4011              : 
    4012              :     /*
    4013              :      * Construct a filename of the last segment to be kept.
    4014              :      */
    4015           67 :     XLogFileName(switchseg, newTLI, switchLogSegNo, wal_segment_size);
    4016              : 
    4017           67 :     elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
    4018              :          switchseg);
    4019              : 
    4020           67 :     xldir = AllocateDir(XLOGDIR);
    4021              : 
    4022          636 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    4023              :     {
    4024              :         /* Ignore files that are not XLOG segments */
    4025          569 :         if (!IsXLogFileName(xlde->d_name))
    4026          353 :             continue;
    4027              : 
    4028              :         /*
    4029              :          * Remove files that are on a timeline older than the new one we're
    4030              :          * switching to, but with a segment number >= the first segment on the
    4031              :          * new timeline.
    4032              :          */
    4033          216 :         if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
    4034          140 :             strcmp(xlde->d_name + 8, switchseg + 8) > 0)
    4035              :         {
    4036              :             /*
    4037              :              * If the file has already been marked as .ready, however, don't
    4038              :              * remove it yet. It should be OK to remove it - files that are
    4039              :              * not part of our timeline history are not required for recovery
    4040              :              * - but seems safer to let them be archived and removed later.
    4041              :              */
    4042           17 :             if (!XLogArchiveIsReady(xlde->d_name))
    4043           17 :                 RemoveXlogFile(xlde, recycleSegNo, &endLogSegNo, newTLI);
    4044              :         }
    4045              :     }
    4046              : 
    4047           67 :     FreeDir(xldir);
    4048           67 : }
    4049              : 
    4050              : /*
    4051              :  * Recycle or remove a log file that's no longer needed.
    4052              :  *
    4053              :  * segment_de is the dirent structure of the segment to recycle or remove.
    4054              :  * recycleSegNo is the segment number to recycle up to.  endlogSegNo is
    4055              :  * the segment number of the current (or recent) end of WAL.
    4056              :  *
    4057              :  * endlogSegNo gets incremented if the segment is recycled so as it is not
    4058              :  * checked again with future callers of this function.
    4059              :  *
    4060              :  * insertTLI is the current timeline for XLOG insertion. Any recycled segments
    4061              :  * should be used for this timeline.
    4062              :  */
    4063              : static void
    4064         2796 : RemoveXlogFile(const struct dirent *segment_de,
    4065              :                XLogSegNo recycleSegNo, XLogSegNo *endlogSegNo,
    4066              :                TimeLineID insertTLI)
    4067              : {
    4068              :     char        path[MAXPGPATH];
    4069              : #ifdef WIN32
    4070              :     char        newpath[MAXPGPATH];
    4071              : #endif
    4072         2796 :     const char *segname = segment_de->d_name;
    4073              : 
    4074         2796 :     snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);
    4075              : 
    4076              :     /*
    4077              :      * Before deleting the file, see if it can be recycled as a future log
    4078              :      * segment. Only recycle normal files, because we don't want to recycle
    4079              :      * symbolic links pointing to a separate archive directory.
    4080              :      */
    4081         2796 :     if (wal_recycle &&
    4082         2796 :         *endlogSegNo <= recycleSegNo &&
    4083         3828 :         XLogCtl->InstallXLogFileSegmentActive && /* callee rechecks this */
    4084         3500 :         get_dirent_type(path, segment_de, false, DEBUG2) == PGFILETYPE_REG &&
    4085         1750 :         InstallXLogFileSegment(endlogSegNo, path,
    4086              :                                true, recycleSegNo, insertTLI))
    4087              :     {
    4088         1577 :         ereport(DEBUG2,
    4089              :                 (errmsg_internal("recycled write-ahead log file \"%s\"",
    4090              :                                  segname)));
    4091         1577 :         CheckpointStats.ckpt_segs_recycled++;
    4092              :         /* Needn't recheck that slot on future iterations */
    4093         1577 :         (*endlogSegNo)++;
    4094              :     }
    4095              :     else
    4096              :     {
    4097              :         /* No need for any more future segments, or recycling failed ... */
    4098              :         int         rc;
    4099              : 
    4100         1219 :         ereport(DEBUG2,
    4101              :                 (errmsg_internal("removing write-ahead log file \"%s\"",
    4102              :                                  segname)));
    4103              : 
    4104              : #ifdef WIN32
    4105              : 
    4106              :         /*
    4107              :          * On Windows, if another process (e.g another backend) holds the file
    4108              :          * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
    4109              :          * will still show up in directory listing until the last handle is
    4110              :          * closed. To avoid confusing the lingering deleted file for a live
    4111              :          * WAL file that needs to be archived, rename it before deleting it.
    4112              :          *
    4113              :          * If another process holds the file open without FILE_SHARE_DELETE
    4114              :          * flag, rename will fail. We'll try again at the next checkpoint.
    4115              :          */
    4116              :         snprintf(newpath, MAXPGPATH, "%s.deleted", path);
    4117              :         if (rename(path, newpath) != 0)
    4118              :         {
    4119              :             ereport(LOG,
    4120              :                     (errcode_for_file_access(),
    4121              :                      errmsg("could not rename file \"%s\": %m",
    4122              :                             path)));
    4123              :             return;
    4124              :         }
    4125              :         rc = durable_unlink(newpath, LOG);
    4126              : #else
    4127         1219 :         rc = durable_unlink(path, LOG);
    4128              : #endif
    4129         1219 :         if (rc != 0)
    4130              :         {
    4131              :             /* Message already logged by durable_unlink() */
    4132            0 :             return;
    4133              :         }
    4134         1219 :         CheckpointStats.ckpt_segs_removed++;
    4135              :     }
    4136              : 
    4137         2796 :     XLogArchiveCleanup(segname);
    4138              : }
    4139              : 
    4140              : /*
    4141              :  * Verify whether pg_wal, pg_wal/archive_status, and pg_wal/summaries exist.
    4142              :  * If the latter do not exist, recreate them.
    4143              :  *
    4144              :  * It is not the goal of this function to verify the contents of these
    4145              :  * directories, but to help in cases where someone has performed a cluster
    4146              :  * copy for PITR purposes but omitted pg_wal from the copy.
    4147              :  *
    4148              :  * We could also recreate pg_wal if it doesn't exist, but a deliberate
    4149              :  * policy decision was made not to.  It is fairly common for pg_wal to be
    4150              :  * a symlink, and if that was the DBA's intent then automatically making a
    4151              :  * plain directory would result in degraded performance with no notice.
    4152              :  */
    4153              : static void
    4154         1074 : ValidateXLOGDirectoryStructure(void)
    4155              : {
    4156              :     char        path[MAXPGPATH];
    4157              :     struct stat stat_buf;
    4158              : 
    4159              :     /* Check for pg_wal; if it doesn't exist, error out */
    4160         1074 :     if (stat(XLOGDIR, &stat_buf) != 0 ||
    4161         1074 :         !S_ISDIR(stat_buf.st_mode))
    4162            0 :         ereport(FATAL,
    4163              :                 (errcode_for_file_access(),
    4164              :                  errmsg("required WAL directory \"%s\" does not exist",
    4165              :                         XLOGDIR)));
    4166              : 
    4167              :     /* Check for archive_status */
    4168         1074 :     snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
    4169         1074 :     if (stat(path, &stat_buf) == 0)
    4170              :     {
    4171              :         /* Check for weird cases where it exists but isn't a directory */
    4172         1073 :         if (!S_ISDIR(stat_buf.st_mode))
    4173            0 :             ereport(FATAL,
    4174              :                     (errcode_for_file_access(),
    4175              :                      errmsg("required WAL directory \"%s\" does not exist",
    4176              :                             path)));
    4177              :     }
    4178              :     else
    4179              :     {
    4180            1 :         ereport(LOG,
    4181              :                 (errmsg("creating missing WAL directory \"%s\"", path)));
    4182            1 :         if (MakePGDirectory(path) < 0)
    4183            0 :             ereport(FATAL,
    4184              :                     (errcode_for_file_access(),
    4185              :                      errmsg("could not create missing directory \"%s\": %m",
    4186              :                             path)));
    4187              :     }
    4188              : 
    4189              :     /* Check for summaries */
    4190         1074 :     snprintf(path, MAXPGPATH, XLOGDIR "/summaries");
    4191         1074 :     if (stat(path, &stat_buf) == 0)
    4192              :     {
    4193              :         /* Check for weird cases where it exists but isn't a directory */
    4194         1073 :         if (!S_ISDIR(stat_buf.st_mode))
    4195            0 :             ereport(FATAL,
    4196              :                     (errmsg("required WAL directory \"%s\" does not exist",
    4197              :                             path)));
    4198              :     }
    4199              :     else
    4200              :     {
    4201            1 :         ereport(LOG,
    4202              :                 (errmsg("creating missing WAL directory \"%s\"", path)));
    4203            1 :         if (MakePGDirectory(path) < 0)
    4204            0 :             ereport(FATAL,
    4205              :                     (errmsg("could not create missing directory \"%s\": %m",
    4206              :                             path)));
    4207              :     }
    4208         1074 : }
    4209              : 
    4210              : /*
    4211              :  * Remove previous backup history files.  This also retries creation of
    4212              :  * .ready files for any backup history files for which XLogArchiveNotify
    4213              :  * failed earlier.
    4214              :  */
    4215              : static void
    4216          162 : CleanupBackupHistory(void)
    4217              : {
    4218              :     DIR        *xldir;
    4219              :     struct dirent *xlde;
    4220              :     char        path[MAXPGPATH + sizeof(XLOGDIR)];
    4221              : 
    4222          162 :     xldir = AllocateDir(XLOGDIR);
    4223              : 
    4224         1669 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    4225              :     {
    4226         1345 :         if (IsBackupHistoryFileName(xlde->d_name))
    4227              :         {
    4228          172 :             if (XLogArchiveCheckDone(xlde->d_name))
    4229              :             {
    4230          136 :                 elog(DEBUG2, "removing WAL backup history file \"%s\"",
    4231              :                      xlde->d_name);
    4232          136 :                 snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
    4233          136 :                 unlink(path);
    4234          136 :                 XLogArchiveCleanup(xlde->d_name);
    4235              :             }
    4236              :         }
    4237              :     }
    4238              : 
    4239          162 :     FreeDir(xldir);
    4240          162 : }
    4241              : 
    4242              : /*
    4243              :  * I/O routines for pg_control
    4244              :  *
    4245              :  * *ControlFile is a buffer in shared memory that holds an image of the
    4246              :  * contents of pg_control.  WriteControlFile() initializes pg_control
    4247              :  * given a preloaded buffer, ReadControlFile() loads the buffer from
    4248              :  * the pg_control file (during postmaster or standalone-backend startup),
    4249              :  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
    4250              :  * InitControlFile() fills the buffer with initial values.
    4251              :  *
    4252              :  * For simplicity, WriteControlFile() initializes the fields of pg_control
    4253              :  * that are related to checking backend/database compatibility, and
    4254              :  * ReadControlFile() verifies they are correct.  We could split out the
    4255              :  * I/O and compatibility-check functions, but there seems no need currently.
    4256              :  */
    4257              : 
    4258              : static void
    4259           57 : InitControlFile(uint64 sysidentifier, uint32 data_checksum_version)
    4260              : {
    4261              :     char        mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
    4262              : 
    4263              :     /*
    4264              :      * Generate a random nonce. This is used for authentication requests that
    4265              :      * will fail because the user does not exist. The nonce is used to create
    4266              :      * a genuine-looking password challenge for the non-existent user, in lieu
    4267              :      * of an actual stored password.
    4268              :      */
    4269           57 :     if (!pg_strong_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
    4270            0 :         ereport(PANIC,
    4271              :                 (errcode(ERRCODE_INTERNAL_ERROR),
    4272              :                  errmsg("could not generate secret authorization token")));
    4273              : 
    4274           57 :     memset(ControlFile, 0, sizeof(ControlFileData));
    4275              :     /* Initialize pg_control status fields */
    4276           57 :     ControlFile->system_identifier = sysidentifier;
    4277           57 :     memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
    4278           57 :     ControlFile->state = DB_SHUTDOWNED;
    4279           57 :     ControlFile->unloggedLSN = FirstNormalUnloggedLSN;
    4280              : 
    4281              :     /* Set important parameter values for use when replaying WAL */
    4282           57 :     ControlFile->MaxConnections = MaxConnections;
    4283           57 :     ControlFile->max_worker_processes = max_worker_processes;
    4284           57 :     ControlFile->max_wal_senders = max_wal_senders;
    4285           57 :     ControlFile->max_prepared_xacts = max_prepared_xacts;
    4286           57 :     ControlFile->max_locks_per_xact = max_locks_per_xact;
    4287           57 :     ControlFile->wal_level = wal_level;
    4288           57 :     ControlFile->wal_log_hints = wal_log_hints;
    4289           57 :     ControlFile->track_commit_timestamp = track_commit_timestamp;
    4290           57 :     ControlFile->data_checksum_version = data_checksum_version;
    4291              : 
    4292              :     /*
    4293              :      * Set the data_checksum_version value into XLogCtl, which is where all
    4294              :      * processes get the current value from.
    4295              :      */
    4296           57 :     XLogCtl->data_checksum_version = data_checksum_version;
    4297           57 : }
    4298              : 
    4299              : static void
    4300           57 : WriteControlFile(void)
    4301              : {
    4302              :     int         fd;
    4303              :     char        buffer[PG_CONTROL_FILE_SIZE];   /* need not be aligned */
    4304              : 
    4305              :     /*
    4306              :      * Initialize version and compatibility-check fields
    4307              :      */
    4308           57 :     ControlFile->pg_control_version = PG_CONTROL_VERSION;
    4309           57 :     ControlFile->catalog_version_no = CATALOG_VERSION_NO;
    4310              : 
    4311           57 :     ControlFile->maxAlign = MAXIMUM_ALIGNOF;
    4312           57 :     ControlFile->floatFormat = FLOATFORMAT_VALUE;
    4313              : 
    4314           57 :     ControlFile->blcksz = BLCKSZ;
    4315           57 :     ControlFile->relseg_size = RELSEG_SIZE;
    4316           57 :     ControlFile->slru_pages_per_segment = SLRU_PAGES_PER_SEGMENT;
    4317           57 :     ControlFile->xlog_blcksz = XLOG_BLCKSZ;
    4318           57 :     ControlFile->xlog_seg_size = wal_segment_size;
    4319              : 
    4320           57 :     ControlFile->nameDataLen = NAMEDATALEN;
    4321           57 :     ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
    4322              : 
    4323           57 :     ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
    4324           57 :     ControlFile->loblksize = LOBLKSIZE;
    4325              : 
    4326           57 :     ControlFile->float8ByVal = true; /* vestigial */
    4327              : 
    4328              :     /*
    4329              :      * Initialize the default 'char' signedness.
    4330              :      *
    4331              :      * The signedness of the char type is implementation-defined. For instance
    4332              :      * on x86 architecture CPUs, the char data type is typically treated as
    4333              :      * signed by default, whereas on aarch architecture CPUs, it is typically
    4334              :      * treated as unsigned by default. In v17 or earlier, we accidentally let
    4335              :      * C implementation signedness affect persistent data. This led to
    4336              :      * inconsistent results when comparing char data across different
    4337              :      * platforms.
    4338              :      *
    4339              :      * This flag can be used as a hint to ensure consistent behavior for
    4340              :      * pre-v18 data files that store data sorted by the 'char' type on disk,
    4341              :      * especially in cross-platform replication scenarios.
    4342              :      *
    4343              :      * Newly created database clusters unconditionally set the default char
    4344              :      * signedness to true. pg_upgrade changes this flag for clusters that were
    4345              :      * initialized on signedness=false platforms. As a result,
    4346              :      * signedness=false setting will become rare over time. If we had known
    4347              :      * about this problem during the last development cycle that forced initdb
    4348              :      * (v8.3), we would have made all clusters signed or all clusters
    4349              :      * unsigned. Making pg_upgrade the only source of signedness=false will
    4350              :      * cause the population of database clusters to converge toward that
    4351              :      * retrospective ideal.
    4352              :      */
    4353           57 :     ControlFile->default_char_signedness = true;
    4354              : 
    4355              :     /* Contents are protected with a CRC */
    4356           57 :     INIT_CRC32C(ControlFile->crc);
    4357           57 :     COMP_CRC32C(ControlFile->crc,
    4358              :                 ControlFile,
    4359              :                 offsetof(ControlFileData, crc));
    4360           57 :     FIN_CRC32C(ControlFile->crc);
    4361              : 
    4362              :     /*
    4363              :      * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
    4364              :      * the excess over sizeof(ControlFileData).  This reduces the odds of
    4365              :      * premature-EOF errors when reading pg_control.  We'll still fail when we
    4366              :      * check the contents of the file, but hopefully with a more specific
    4367              :      * error than "couldn't read pg_control".
    4368              :      */
    4369           57 :     memset(buffer, 0, PG_CONTROL_FILE_SIZE);
    4370           57 :     memcpy(buffer, ControlFile, sizeof(ControlFileData));
    4371              : 
    4372           57 :     fd = BasicOpenFile(XLOG_CONTROL_FILE,
    4373              :                        O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
    4374           57 :     if (fd < 0)
    4375            0 :         ereport(PANIC,
    4376              :                 (errcode_for_file_access(),
    4377              :                  errmsg("could not create file \"%s\": %m",
    4378              :                         XLOG_CONTROL_FILE)));
    4379              : 
    4380           57 :     errno = 0;
    4381           57 :     pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
    4382           57 :     if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
    4383              :     {
    4384              :         /* if write didn't set errno, assume problem is no disk space */
    4385            0 :         if (errno == 0)
    4386            0 :             errno = ENOSPC;
    4387            0 :         ereport(PANIC,
    4388              :                 (errcode_for_file_access(),
    4389              :                  errmsg("could not write to file \"%s\": %m",
    4390              :                         XLOG_CONTROL_FILE)));
    4391              :     }
    4392           57 :     pgstat_report_wait_end();
    4393              : 
    4394           57 :     pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
    4395           57 :     if (pg_fsync(fd) != 0)
    4396            0 :         ereport(PANIC,
    4397              :                 (errcode_for_file_access(),
    4398              :                  errmsg("could not fsync file \"%s\": %m",
    4399              :                         XLOG_CONTROL_FILE)));
    4400           57 :     pgstat_report_wait_end();
    4401              : 
    4402           57 :     if (close(fd) != 0)
    4403            0 :         ereport(PANIC,
    4404              :                 (errcode_for_file_access(),
    4405              :                  errmsg("could not close file \"%s\": %m",
    4406              :                         XLOG_CONTROL_FILE)));
    4407           57 : }
    4408              : 
    4409              : static void
    4410         1133 : ReadControlFile(void)
    4411              : {
    4412              :     pg_crc32c   crc;
    4413              :     int         fd;
    4414              :     char        wal_segsz_str[20];
    4415              :     int         r;
    4416              : 
    4417              :     /*
    4418              :      * Read data...
    4419              :      */
    4420         1133 :     fd = BasicOpenFile(XLOG_CONTROL_FILE,
    4421              :                        O_RDWR | PG_BINARY);
    4422         1133 :     if (fd < 0)
    4423            0 :         ereport(PANIC,
    4424              :                 (errcode_for_file_access(),
    4425              :                  errmsg("could not open file \"%s\": %m",
    4426              :                         XLOG_CONTROL_FILE)));
    4427              : 
    4428         1133 :     pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
    4429         1133 :     r = read(fd, ControlFile, sizeof(ControlFileData));
    4430         1133 :     if (r != sizeof(ControlFileData))
    4431              :     {
    4432            0 :         if (r < 0)
    4433            0 :             ereport(PANIC,
    4434              :                     (errcode_for_file_access(),
    4435              :                      errmsg("could not read file \"%s\": %m",
    4436              :                             XLOG_CONTROL_FILE)));
    4437              :         else
    4438            0 :             ereport(PANIC,
    4439              :                     (errcode(ERRCODE_DATA_CORRUPTED),
    4440              :                      errmsg("could not read file \"%s\": read %d of %zu",
    4441              :                             XLOG_CONTROL_FILE, r, sizeof(ControlFileData))));
    4442              :     }
    4443         1133 :     pgstat_report_wait_end();
    4444              : 
    4445         1133 :     close(fd);
    4446              : 
    4447              :     /*
    4448              :      * Check for expected pg_control format version.  If this is wrong, the
    4449              :      * CRC check will likely fail because we'll be checking the wrong number
    4450              :      * of bytes.  Complaining about wrong version will probably be more
    4451              :      * enlightening than complaining about wrong CRC.
    4452              :      */
    4453              : 
    4454         1133 :     if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
    4455            0 :         ereport(FATAL,
    4456              :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4457              :                  errmsg("database files are incompatible with server"),
    4458              :                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
    4459              :                            " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
    4460              :                            ControlFile->pg_control_version, ControlFile->pg_control_version,
    4461              :                            PG_CONTROL_VERSION, PG_CONTROL_VERSION),
    4462              :                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
    4463              : 
    4464         1133 :     if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
    4465            0 :         ereport(FATAL,
    4466              :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4467              :                  errmsg("database files are incompatible with server"),
    4468              :                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
    4469              :                            " but the server was compiled with PG_CONTROL_VERSION %d.",
    4470              :                            ControlFile->pg_control_version, PG_CONTROL_VERSION),
    4471              :                  errhint("It looks like you need to initdb.")));
    4472              : 
    4473              :     /* Now check the CRC. */
    4474         1133 :     INIT_CRC32C(crc);
    4475         1133 :     COMP_CRC32C(crc,
    4476              :                 ControlFile,
    4477              :                 offsetof(ControlFileData, crc));
    4478         1133 :     FIN_CRC32C(crc);
    4479              : 
    4480         1133 :     if (!EQ_CRC32C(crc, ControlFile->crc))
    4481            0 :         ereport(FATAL,
    4482              :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4483              :                  errmsg("incorrect checksum in control file")));
    4484              : 
    4485              :     /*
    4486              :      * Do compatibility checking immediately.  If the database isn't
    4487              :      * compatible with the backend executable, we want to abort before we can
    4488              :      * possibly do any damage.
    4489              :      */
    4490         1133 :     if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
    4491            0 :         ereport(FATAL,
    4492              :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4493              :                  errmsg("database files are incompatible with server"),
    4494              :         /* translator: %s is a variable name and %d is its value */
    4495              :                  errdetail("The database cluster was initialized with %s %d,"
    4496              :                            " but the server was compiled with %s %d.",
    4497              :                            "CATALOG_VERSION_NO", ControlFile->catalog_version_no,
    4498              :                            "CATALOG_VERSION_NO", CATALOG_VERSION_NO),
    4499              :                  errhint("It looks like you need to initdb.")));
    4500         1133 :     if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
    4501            0 :         ereport(FATAL,
    4502              :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4503              :                  errmsg("database files are incompatible with server"),
    4504              :         /* translator: %s is a variable name and %d is its value */
    4505              :                  errdetail("The database cluster was initialized with %s %d,"
    4506              :                            " but the server was compiled with %s %d.",
    4507              :                            "MAXALIGN", ControlFile->maxAlign,
    4508              :                            "MAXALIGN", MAXIMUM_ALIGNOF),
    4509              :                  errhint("It looks like you need to initdb.")));
    4510         1133 :     if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
    4511            0 :         ereport(FATAL,
    4512              :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4513              :                  errmsg("database files are incompatible with server"),
    4514              :                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
    4515              :                  errhint("It looks like you need to initdb.")));
    4516         1133 :     if (ControlFile->blcksz != BLCKSZ)
    4517            0 :         ereport(FATAL,
    4518              :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4519              :                  errmsg("database files are incompatible with server"),
    4520              :         /* translator: %s is a variable name and %d is its value */
    4521              :                  errdetail("The database cluster was initialized with %s %d,"
    4522              :                            " but the server was compiled with %s %d.",
    4523              :                            "BLCKSZ", ControlFile->blcksz,
    4524              :                            "BLCKSZ", BLCKSZ),
    4525              :                  errhint("It looks like you need to recompile or initdb.")));
    4526         1133 :     if (ControlFile->relseg_size != RELSEG_SIZE)
    4527            0 :         ereport(FATAL,
    4528              :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4529              :                  errmsg("database files are incompatible with server"),
    4530              :         /* translator: %s is a variable name and %d is its value */
    4531              :                  errdetail("The database cluster was initialized with %s %d,"
    4532              :                            " but the server was compiled with %s %d.",
    4533              :                            "RELSEG_SIZE", ControlFile->relseg_size,
    4534              :                            "RELSEG_SIZE", RELSEG_SIZE),
    4535              :                  errhint("It looks like you need to recompile or initdb.")));
    4536         1133 :     if (ControlFile->slru_pages_per_segment != SLRU_PAGES_PER_SEGMENT)
    4537            0 :         ereport(FATAL,
    4538              :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4539              :                  errmsg("database files are incompatible with server"),
    4540              :         /* translator: %s is a variable name and %d is its value */
    4541              :                  errdetail("The database cluster was initialized with %s %d,"
    4542              :                            " but the server was compiled with %s %d.",
    4543              :                            "SLRU_PAGES_PER_SEGMENT", ControlFile->slru_pages_per_segment,
    4544              :                            "SLRU_PAGES_PER_SEGMENT", SLRU_PAGES_PER_SEGMENT),
    4545              :                  errhint("It looks like you need to recompile or initdb.")));
    4546         1133 :     if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
    4547            0 :         ereport(FATAL,
    4548              :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4549              :                  errmsg("database files are incompatible with server"),
    4550              :         /* translator: %s is a variable name and %d is its value */
    4551              :                  errdetail("The database cluster was initialized with %s %d,"
    4552              :                            " but the server was compiled with %s %d.",
    4553              :                            "XLOG_BLCKSZ", ControlFile->xlog_blcksz,
    4554              :                            "XLOG_BLCKSZ", XLOG_BLCKSZ),
    4555              :                  errhint("It looks like you need to recompile or initdb.")));
    4556         1133 :     if (ControlFile->nameDataLen != NAMEDATALEN)
    4557            0 :         ereport(FATAL,
    4558              :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4559              :                  errmsg("database files are incompatible with server"),
    4560              :         /* translator: %s is a variable name and %d is its value */
    4561              :                  errdetail("The database cluster was initialized with %s %d,"
    4562              :                            " but the server was compiled with %s %d.",
    4563              :                            "NAMEDATALEN", ControlFile->nameDataLen,
    4564              :                            "NAMEDATALEN", NAMEDATALEN),
    4565              :                  errhint("It looks like you need to recompile or initdb.")));
    4566         1133 :     if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
    4567            0 :         ereport(FATAL,
    4568              :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4569              :                  errmsg("database files are incompatible with server"),
    4570              :         /* translator: %s is a variable name and %d is its value */
    4571              :                  errdetail("The database cluster was initialized with %s %d,"
    4572              :                            " but the server was compiled with %s %d.",
    4573              :                            "INDEX_MAX_KEYS", ControlFile->indexMaxKeys,
    4574              :                            "INDEX_MAX_KEYS", INDEX_MAX_KEYS),
    4575              :                  errhint("It looks like you need to recompile or initdb.")));
    4576         1133 :     if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
    4577            0 :         ereport(FATAL,
    4578              :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4579              :                  errmsg("database files are incompatible with server"),
    4580              :         /* translator: %s is a variable name and %d is its value */
    4581              :                  errdetail("The database cluster was initialized with %s %d,"
    4582              :                            " but the server was compiled with %s %d.",
    4583              :                            "TOAST_MAX_CHUNK_SIZE", ControlFile->toast_max_chunk_size,
    4584              :                            "TOAST_MAX_CHUNK_SIZE", (int) TOAST_MAX_CHUNK_SIZE),
    4585              :                  errhint("It looks like you need to recompile or initdb.")));
    4586         1133 :     if (ControlFile->loblksize != LOBLKSIZE)
    4587            0 :         ereport(FATAL,
    4588              :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4589              :                  errmsg("database files are incompatible with server"),
    4590              :         /* translator: %s is a variable name and %d is its value */
    4591              :                  errdetail("The database cluster was initialized with %s %d,"
    4592              :                            " but the server was compiled with %s %d.",
    4593              :                            "LOBLKSIZE", ControlFile->loblksize,
    4594              :                            "LOBLKSIZE", (int) LOBLKSIZE),
    4595              :                  errhint("It looks like you need to recompile or initdb.")));
    4596              : 
    4597              :     Assert(ControlFile->float8ByVal);    /* vestigial, not worth an error msg */
    4598              : 
    4599         1133 :     wal_segment_size = ControlFile->xlog_seg_size;
    4600              : 
    4601         1133 :     if (!IsValidWalSegSize(wal_segment_size))
    4602            0 :         ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4603              :                         errmsg_plural("invalid WAL segment size in control file (%d byte)",
    4604              :                                       "invalid WAL segment size in control file (%d bytes)",
    4605              :                                       wal_segment_size,
    4606              :                                       wal_segment_size),
    4607              :                         errdetail("The WAL segment size must be a power of two between 1 MB and 1 GB.")));
    4608              : 
    4609         1133 :     snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
    4610         1133 :     SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
    4611              :                     PGC_S_DYNAMIC_DEFAULT);
    4612              : 
    4613              :     /* check and update variables dependent on wal_segment_size */
    4614         1133 :     if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2)
    4615            0 :         ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4616              :         /* translator: both %s are GUC names */
    4617              :                         errmsg("\"%s\" must be at least twice \"%s\"",
    4618              :                                "min_wal_size", "wal_segment_size")));
    4619              : 
    4620         1133 :     if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2)
    4621            0 :         ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4622              :         /* translator: both %s are GUC names */
    4623              :                         errmsg("\"%s\" must be at least twice \"%s\"",
    4624              :                                "max_wal_size", "wal_segment_size")));
    4625              : 
    4626         1133 :     UsableBytesInSegment =
    4627         1133 :         (wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) -
    4628              :         (SizeOfXLogLongPHD - SizeOfXLogShortPHD);
    4629              : 
    4630         1133 :     CalculateCheckpointSegments();
    4631         1133 : }
    4632              : 
    4633              : /*
    4634              :  * Utility wrapper to update the control file.  Note that the control
    4635              :  * file gets flushed.
    4636              :  */
    4637              : static void
    4638        10163 : UpdateControlFile(void)
    4639              : {
    4640        10163 :     update_controlfile(DataDir, ControlFile, true);
    4641        10163 : }
    4642              : 
    4643              : /*
    4644              :  * Returns the unique system identifier from control file.
    4645              :  */
    4646              : uint64
    4647         1564 : GetSystemIdentifier(void)
    4648              : {
    4649              :     Assert(ControlFile != NULL);
    4650         1564 :     return ControlFile->system_identifier;
    4651              : }
    4652              : 
    4653              : /*
    4654              :  * Returns the random nonce from control file.
    4655              :  */
    4656              : char *
    4657            2 : GetMockAuthenticationNonce(void)
    4658              : {
    4659              :     Assert(ControlFile != NULL);
    4660            2 :     return ControlFile->mock_authentication_nonce;
    4661              : }
    4662              : 
    4663              : /*
    4664              :  * DataChecksumsNeedWrite
    4665              :  *      Returns whether data checksums must be written or not
    4666              :  *
    4667              :  * Returns true if data checksums are enabled, or are in the process of being
    4668              :  * enabled. During "inprogress-on" and "inprogress-off" states checksums must
    4669              :  * be written even though they are not verified (see datachecksum_state.c for
    4670              :  * a longer discussion).
    4671              :  *
    4672              :  * This function is intended for callsites which are about to write a data page
    4673              :  * to storage, and need to know whether to re-calculate the checksum for the
    4674              :  * page header. Calling this function must be performed as close to the write
    4675              :  * operation as possible to keep the critical section short.
    4676              :  */
    4677              : bool
    4678       852354 : DataChecksumsNeedWrite(void)
    4679              : {
    4680       942820 :     return (LocalDataChecksumState == PG_DATA_CHECKSUM_VERSION ||
    4681       907664 :             LocalDataChecksumState == PG_DATA_CHECKSUM_INPROGRESS_ON ||
    4682        55310 :             LocalDataChecksumState == PG_DATA_CHECKSUM_INPROGRESS_OFF);
    4683              : }
    4684              : 
    4685              : bool
    4686          167 : DataChecksumsInProgressOn(void)
    4687              : {
    4688          167 :     return LocalDataChecksumState == PG_DATA_CHECKSUM_INPROGRESS_ON;
    4689              : }
    4690              : 
    4691              : /*
    4692              :  * DataChecksumsNeedVerify
    4693              :  *      Returns whether data checksums must be verified or not
    4694              :  *
    4695              :  * Data checksums are only verified if they are fully enabled in the cluster.
    4696              :  * During the "inprogress-on" and "inprogress-off" states they are only
    4697              :  * updated, not verified (see datachecksum_state.c for a longer discussion).
    4698              :  *
    4699              :  * This function is intended for callsites which have read data and are about
    4700              :  * to perform checksum validation based on the result of this.  Calling this
    4701              :  * function must be performed as close to the validation call as possible to
    4702              :  * keep the critical section short. This is in order to protect against time of
    4703              :  * check/time of use situations around data checksum validation.
    4704              :  */
    4705              : bool
    4706      2641443 : DataChecksumsNeedVerify(void)
    4707              : {
    4708      2641443 :     return (LocalDataChecksumState == PG_DATA_CHECKSUM_VERSION);
    4709              : }
    4710              : 
    4711              : /*
    4712              :  * SetDataChecksumsOnInProgress
    4713              :  *      Sets the data checksum state to "inprogress-on" to enable checksums
    4714              :  *
    4715              :  * To start the process of enabling data checksums in a running cluster the
    4716              :  * data_checksum_version state must be changed to "inprogress-on". See
    4717              :  * SetDataChecksumsOn below for a description on how this state change works.
    4718              :  * This function blocks until all backends in the cluster have acknowledged the
    4719              :  * state transition.
    4720              :  */
    4721              : void
    4722            8 : SetDataChecksumsOnInProgress(void)
    4723              : {
    4724              :     uint64      barrier;
    4725              : 
    4726              :     Assert(ControlFile != NULL);
    4727              : 
    4728              :     /*
    4729              :      * The state transition is performed in a critical section with
    4730              :      * checkpoints held off to provide crash safety.
    4731              :      */
    4732            8 :     START_CRIT_SECTION();
    4733            8 :     MyProc->delayChkptFlags |= DELAY_CHKPT_START;
    4734              : 
    4735            8 :     XLogChecksums(PG_DATA_CHECKSUM_INPROGRESS_ON);
    4736              : 
    4737            8 :     SpinLockAcquire(&XLogCtl->info_lck);
    4738            8 :     XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_ON;
    4739            8 :     SpinLockRelease(&XLogCtl->info_lck);
    4740              : 
    4741            8 :     barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON);
    4742              : 
    4743            8 :     MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
    4744            8 :     END_CRIT_SECTION();
    4745              : 
    4746              :     /*
    4747              :      * Update the controlfile before waiting since if we have an immediate
    4748              :      * shutdown while waiting we want to come back up with checksums enabled.
    4749              :      */
    4750            8 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    4751            8 :     ControlFile->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_ON;
    4752            8 :     UpdateControlFile();
    4753            8 :     LWLockRelease(ControlFileLock);
    4754              : 
    4755              :     /*
    4756              :      * Await state change in all backends to ensure that all backends are in
    4757              :      * "inprogress-on". Once done we know that all backends are writing data
    4758              :      * checksums.
    4759              :      */
    4760            8 :     WaitForProcSignalBarrier(barrier);
    4761            8 : }
    4762              : 
    4763              : /*
    4764              :  * SetDataChecksumsOn
    4765              :  *      Set data checksums state to 'on' cluster-wide
    4766              :  *
    4767              :  * Enabling data checksums is performed using two barriers, the first one to
    4768              :  * set the state to "inprogress-on" (done by SetDataChecksumsOnInProgress())
    4769              :  * and the second one to set the state to "on" (done here). Below is a short
    4770              :  * description of the processing, a more detailed write-up can be found in
    4771              :  * datachecksum_state.c.
    4772              :  *
    4773              :  * To start the process of enabling data checksums in a running cluster the
    4774              :  * data_checksum_version state must be changed to "inprogress-on".  This state
    4775              :  * requires data checksums to be written but not verified. This ensures that
    4776              :  * all data pages can be checksummed without the risk of false negatives in
    4777              :  * validation during the process.  When all existing pages are guaranteed to
    4778              :  * have checksums, and all new pages will be initialized with checksums, the
    4779              :  * state can be changed to "on". Once the state is "on" checksums will be both
    4780              :  * written and verified.
    4781              :  *
    4782              :  * This function blocks until all backends in the cluster have acknowledged the
    4783              :  * state transition.
    4784              :  */
    4785              : void
    4786            6 : SetDataChecksumsOn(void)
    4787              : {
    4788              :     uint64      barrier;
    4789              : 
    4790              :     Assert(ControlFile != NULL);
    4791              : 
    4792            6 :     SpinLockAcquire(&XLogCtl->info_lck);
    4793              : 
    4794              :     /*
    4795              :      * The only allowed state transition to "on" is from "inprogress-on" since
    4796              :      * that state ensures that all pages will have data checksums written.  No
    4797              :      * other transition to "on" exists, so arriving here in any other state is
    4798              :      * most likely a programmer error; warn and fall back to disabling checksums.
    4799              :      */
    4800            6 :     if (XLogCtl->data_checksum_version != PG_DATA_CHECKSUM_INPROGRESS_ON)
    4801              :     {
    4802            0 :         SpinLockRelease(&XLogCtl->info_lck);
    4803            0 :         elog(WARNING,
    4804              :              "cannot set data checksums to \"on\", current state is not \"inprogress-on\", disabling");
    4805            0 :         SetDataChecksumsOff();
    4806            0 :         return;
    4807              :     }
    4808              : 
    4809            6 :     SpinLockRelease(&XLogCtl->info_lck);
    4810              : 
    4811            6 :     INJECTION_POINT("datachecksums-enable-checksums-delay", NULL);
    4812            6 :     START_CRIT_SECTION();
    4813            6 :     MyProc->delayChkptFlags |= DELAY_CHKPT_START;   /* cleared again just before END_CRIT_SECTION() */
    4814              : 
    4815            6 :     XLogChecksums(PG_DATA_CHECKSUM_VERSION);
    4816              : 
    4817            6 :     SpinLockAcquire(&XLogCtl->info_lck);
    4818            6 :     XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_VERSION;
    4819            6 :     SpinLockRelease(&XLogCtl->info_lck);
    4820              : 
    4821            6 :     barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON);
    4822              : 
    4823            6 :     MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
    4824            6 :     END_CRIT_SECTION();
    4825              : 
    4826              :     /*
    4827              :      * Update the controlfile before waiting since if we have an immediate
    4828              :      * shutdown while waiting we want to come back up with checksums enabled.
    4829              :      */
    4830            6 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    4831            6 :     ControlFile->data_checksum_version = PG_DATA_CHECKSUM_VERSION;
    4832            6 :     UpdateControlFile();
    4833            6 :     LWLockRelease(ControlFileLock);
    4834              : 
    4835            6 :     RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST);   /* flags ask for a forced, fast checkpoint and to wait for it */
    4836              : 
    4837              :     /*
    4838              :      * Await state transition to "on" in all backends. When done we know that
    4839              :      * data checksums are both written and verified in all backends.
    4840              :      */
    4841            6 :     WaitForProcSignalBarrier(barrier);
    4842              : }
    4843              : 
    4844              : /*
    4845              :  * SetDataChecksumsOff
    4846              :  *      Disables data checksums cluster-wide
    4847              :  *
    4848              :  * Disabling data checksums must be performed with two sets of barriers, each
    4849              :  * carrying a different state. The state is first set to "inprogress-off"
    4850              :  * during which checksums are still written but not verified. This ensures that
    4851              :  * backends which have yet to observe the state change from "on" won't get
    4852              :  * validation errors on concurrently modified pages. Once all backends have
    4853              :  * changed to "inprogress-off", the barrier for moving to "off" can be emitted.
    4854              :  * This function blocks until all backends in the cluster have acknowledged the
    4855              :  * state transition.
    4856              :  */
    4857              : void
    4858            8 : SetDataChecksumsOff(void)
    4859              : {
    4860              :     uint64      barrier;
    4861              : 
    4862              :     Assert(ControlFile != NULL);
    4863              : 
    4864            8 :     SpinLockAcquire(&XLogCtl->info_lck);
    4865              : 
    4866              :     /* If data checksums are already disabled there is nothing to do */
    4867            8 :     if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_OFF)
    4868              :     {
    4869            2 :         SpinLockRelease(&XLogCtl->info_lck);
    4870            2 :         return;
    4871              :     }
    4872              : 
    4873              :     /*
    4874              :      * If data checksums are currently enabled we first transition to the
    4875              :      * "inprogress-off" state during which backends continue to write
    4876              :      * checksums without verifying them. When all backends are in
    4877              :      * "inprogress-off" the next transition to "off" can be performed, after
    4878              :      * which all data checksum processing is disabled.
    4879              :      */
    4880            6 :     if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_VERSION)
    4881              :     {
    4882            4 :         SpinLockRelease(&XLogCtl->info_lck);
    4883              : 
    4884            4 :         START_CRIT_SECTION();
    4885            4 :         MyProc->delayChkptFlags |= DELAY_CHKPT_START;
    4886              : 
    4887            4 :         XLogChecksums(PG_DATA_CHECKSUM_INPROGRESS_OFF);
    4888              : 
    4889            4 :         SpinLockAcquire(&XLogCtl->info_lck);
    4890            4 :         XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_OFF;
    4891            4 :         SpinLockRelease(&XLogCtl->info_lck);
    4892              : 
    4893            4 :         barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF);
    4894              : 
    4895            4 :         MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
    4896            4 :         END_CRIT_SECTION();
    4897              : 
    4898            4 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    4899            4 :         ControlFile->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_OFF;  /* persist the intermediate state just published in XLogCtl, not "off" yet */
    4900            4 :         UpdateControlFile();
    4901            4 :         LWLockRelease(ControlFileLock);
    4902              : 
    4903            4 :         RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST);
    4904              : 
    4905              :         /*
    4906              :          * Update local state in all backends to ensure that any backend in
    4907              :          * "on" state is changed to "inprogress-off".
    4908              :          */
    4909            4 :         WaitForProcSignalBarrier(barrier);
    4910              : 
    4911              :         /*
    4912              :          * At this point we know that no backends are verifying data checksums
    4913              :          * during reading. Next, we can safely move to state "off" to also
    4914              :          * stop writing checksums.
    4915              :          */
    4916              :     }
    4917              :     else
    4918              :     {
    4919              :         /*
    4920              :          * Ending up here implies that the checksums state is "inprogress-on"
    4921              :          * or "inprogress-off" and we can transition directly to "off" from
    4922              :          * there.
    4923              :          */
    4924            2 :         SpinLockRelease(&XLogCtl->info_lck);
    4925              :     }
    4926              : 
    4927            6 :     START_CRIT_SECTION();
    4928              :     /* Ensure that we don't incur a checkpoint during disabling checksums */
    4929            6 :     MyProc->delayChkptFlags |= DELAY_CHKPT_START;
    4930              : 
    4931            6 :     XLogChecksums(PG_DATA_CHECKSUM_OFF);
    4932              : 
    4933            6 :     SpinLockAcquire(&XLogCtl->info_lck);
    4934            6 :     XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_OFF;
    4935            6 :     SpinLockRelease(&XLogCtl->info_lck);
    4936              : 
    4937            6 :     barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF);
    4938              : 
    4939            6 :     MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
    4940            6 :     END_CRIT_SECTION();
    4941              : 
    4942            6 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    4943            6 :     ControlFile->data_checksum_version = PG_DATA_CHECKSUM_OFF;  /* final state; matches the value stored in XLogCtl above */
    4944            6 :     UpdateControlFile();
    4945            6 :     LWLockRelease(ControlFileLock);
    4946              : 
    4947            6 :     RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST);
    4948              : 
    4949            6 :     WaitForProcSignalBarrier(barrier);   /* block until every backend has acknowledged "off" */
    4950              : }
    4951              : 
    4952              : /*
    4953              :  * InitLocalDataChecksumState
    4954              :  *
    4955              :  * Set up backend local caches of controldata variables which may change at
    4956              :  * any point during runtime and thus require special-cased locking. So far
    4957              :  * this only applies to data_checksum_version (read here under info_lck), but
    4958              :  * it's intended to be general purpose enough to handle future cases.
    4959              :  */
    4960              : void
    4961        24658 : InitLocalDataChecksumState(void)
    4962              : {
    4963        24658 :     SpinLockAcquire(&XLogCtl->info_lck);
    4964        24658 :     SetLocalDataChecksumState(XLogCtl->data_checksum_version);   /* copy the shared value into this backend's local state */
    4965        24658 :     SpinLockRelease(&XLogCtl->info_lck);
    4966        24658 : }
    4967              : 
    4968              : void
    4969        27253 : SetLocalDataChecksumState(uint32 data_checksum_version)
    4970              : {
    4971        27253 :     LocalDataChecksumState = data_checksum_version;   /* backend-local copy, reported by show_data_checksums() */
    4972              : 
    4973        27253 :     data_checksums = data_checksum_version;   /* NOTE(review): presumably the variable backing the data_checksums GUC - confirm */
    4974        27253 : }
    4975              : 
    4976              : /* GUC show hook: report this backend's local data checksum state as a string */
    4977              : const char *
    4978            0 : show_data_checksums(void)
    4979              : {
    4980            0 :     return get_checksum_state_string(LocalDataChecksumState);
    4981              : }
    4982              : 
    4983              : /*
    4984              :  * Return true if the cluster was initialized on a platform where the
    4985              :  * default signedness of char is "signed". This function exists for code
    4986              :  * that deals with pre-v18 data files that store data sorted by the 'char'
    4987              :  * type on disk (e.g., GIN and GiST indexes). See the comments in
    4988              :  * WriteControlFile() for details.
    4989              :  */
    4990              : bool
    4991        89903 : GetDefaultCharSignedness(void)
    4992              : {
    4993        89903 :     return ControlFile->default_char_signedness;   /* value is taken from the control file data */
    4994              : }
    4995              : 
    4996              : /*
    4997              :  * Returns a fake LSN for unlogged relations.
    4998              :  *
    4999              :  * Each call generates an LSN that is greater than any previous value
    5000              :  * returned. The current counter value is saved and restored across clean
    5001              :  * shutdowns, but like unlogged relations, does not survive a crash. This can
    5002              :  * be used in lieu of real LSN values returned by XLogInsert, if you need an
    5003              :  * LSN-like increasing sequence of numbers without writing any WAL.
    5004              :  */
    5005              : XLogRecPtr
    5006       201460 : GetFakeLSNForUnloggedRel(void)
    5007              : {
    5008       201460 :     return pg_atomic_fetch_add_u64(&XLogCtl->unloggedLSN, 1);   /* atomic fetch-and-add on the shared counter, step 1 */
    5009              : }
    5010              : 
    5011              : /*
    5012              :  * Auto-tune the number of XLOG buffers.
    5013              :  *
    5014              :  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
    5015              :  * a maximum of one XLOG segment (there is little reason to think that more
    5016              :  * is helpful, at least so long as we force an fsync when switching log files)
    5017              :  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
    5018              :  * 9.1, when auto-tuning was added).
    5019              :  *
    5020              :  * This should not be called until NBuffers has received its final value.
    5021              :  */
    5022              : static int
    5023         1229 : XLOGChooseNumBuffers(void)
    5024              : {
    5025              :     int         xbuffers;
    5026              : 
    5027         1229 :     xbuffers = NBuffers / 32;   /* 1/32, i.e. roughly 3% */
    5028         1229 :     if (xbuffers > (wal_segment_size / XLOG_BLCKSZ))
    5029           24 :         xbuffers = (wal_segment_size / XLOG_BLCKSZ);   /* cap at one segment's worth of blocks */
    5030         1229 :     if (xbuffers < 8)
    5031          457 :         xbuffers = 8;   /* floor of 8 blocks */
    5032         1229 :     return xbuffers;
    5033              : }
    5034              : 
    5035              : /*
    5036              :  * GUC check_hook for wal_buffers: resolves -1 (auto-tune), clamps to >= 4
    5037              :  */
    5038              : bool
    5039         2504 : check_wal_buffers(int *newval, void **extra, GucSource source)
    5040              : {
    5041              :     /*
    5042              :      * -1 indicates a request for auto-tune.
    5043              :      */
    5044         2504 :     if (*newval == -1)
    5045              :     {
    5046              :         /*
    5047              :          * If we haven't yet changed the boot_val default of -1, just let it
    5048              :          * be.  We'll fix it when XLOGShmemSize is called.
    5049              :          */
    5050         1275 :         if (XLOGbuffers == -1)
    5051         1275 :             return true;
    5052              : 
    5053              :         /* Otherwise, substitute the auto-tune value */
    5054            0 :         *newval = XLOGChooseNumBuffers();
    5055              :     }
    5056              : 
    5057              :     /*
    5058              :      * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
    5059              :      * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
    5060              :      * the case, we just silently treat such values as a request for the
    5061              :      * minimum.  (We could throw an error instead, but that doesn't seem very
    5062              :      * helpful.)
    5063              :      */
    5064         1229 :     if (*newval < 4)
    5065            0 :         *newval = 4;
    5066              : 
    5067         1229 :     return true;   /* this hook never rejects a value */
    5068              : }
    5069              : 
    5070              : /*
    5071              :  * GUC check_hook for wal_consistency_checking: parses the list into *extra
    5072              :  */
    5073              : bool
    5074         2263 : check_wal_consistency_checking(char **newval, void **extra, GucSource source)
    5075              : {
    5076              :     char       *rawstring;
    5077              :     List       *elemlist;
    5078              :     ListCell   *l;
    5079              :     bool        newwalconsistency[RM_MAX_ID + 1];   /* one flag per resource manager ID */
    5080              : 
    5081              :     /* Initialize the array */
    5082        74679 :     MemSet(newwalconsistency, 0, (RM_MAX_ID + 1) * sizeof(bool));
    5083              : 
    5084              :     /* Need a modifiable copy of string */
    5085         2263 :     rawstring = pstrdup(*newval);
    5086              : 
    5087              :     /* Parse string into list of identifiers */
    5088         2263 :     if (!SplitIdentifierString(rawstring, ',', &elemlist))
    5089              :     {
    5090              :         /* syntax error in list */
    5091            0 :         GUC_check_errdetail("List syntax is invalid.");
    5092            0 :         pfree(rawstring);
    5093            0 :         list_free(elemlist);
    5094            0 :         return false;
    5095              :     }
    5096              : 
    5097         2760 :     foreach(l, elemlist)
    5098              :     {
    5099          497 :         char       *tok = (char *) lfirst(l);
    5100              :         int         rmid;
    5101              : 
    5102              :         /* Check for 'all'. */
    5103          497 :         if (pg_strcasecmp(tok, "all") == 0)
    5104              :         {
    5105       127215 :             for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
    5106       126720 :                 if (RmgrIdExists(rmid) && GetRmgr(rmid).rm_mask != NULL)   /* only rmgrs that supply an rm_mask callback are flagged */
    5107         4950 :                     newwalconsistency[rmid] = true;
    5108              :         }
    5109              :         else
    5110              :         {
    5111              :             /* Check if the token matches any known resource manager. */
    5112            2 :             bool        found = false;
    5113              : 
    5114           36 :             for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
    5115              :             {
    5116           54 :                 if (RmgrIdExists(rmid) && GetRmgr(rmid).rm_mask != NULL &&
    5117           18 :                     pg_strcasecmp(tok, GetRmgr(rmid).rm_name) == 0)
    5118              :                 {
    5119            2 :                     newwalconsistency[rmid] = true;
    5120            2 :                     found = true;
    5121            2 :                     break;
    5122              :                 }
    5123              :             }
    5124            2 :             if (!found)
    5125              :             {
    5126              :                 /*
    5127              :                  * During startup, it might be a not-yet-loaded custom
    5128              :                  * resource manager.  Defer checking until
    5129              :                  * InitializeWalConsistencyChecking().
    5130              :                  */
    5131            0 :                 if (!process_shared_preload_libraries_done)
    5132              :                 {
    5133            0 :                     check_wal_consistency_checking_deferred = true;
    5134              :                 }
    5135              :                 else
    5136              :                 {
    5137            0 :                     GUC_check_errdetail("Unrecognized key word: \"%s\".", tok);
    5138            0 :                     pfree(rawstring);
    5139            0 :                     list_free(elemlist);
    5140            0 :                     return false;
    5141              :                 }
    5142              :             }
    5143              :         }
    5144              :     }
    5145              : 
    5146         2263 :     pfree(rawstring);
    5147         2263 :     list_free(elemlist);
    5148              : 
    5149              :     /* assign new value */
    5150         2263 :     *extra = guc_malloc(LOG, (RM_MAX_ID + 1) * sizeof(bool));   /* a NULL result is handled just below */
    5151         2263 :     if (!*extra)
    5152            0 :         return false;
    5153         2263 :     memcpy(*extra, newwalconsistency, (RM_MAX_ID + 1) * sizeof(bool));
    5154         2263 :     return true;
    5155              : }
    5156              : 
    5157              : /*
    5158              :  * GUC assign_hook for wal_consistency_checking
    5159              :  */
    5160              : void
    5161         2262 : assign_wal_consistency_checking(const char *newval, void *extra)
    5162              : {
    5163              :     /*
    5164              :      * If some checks were deferred, it's possible that the checks will fail
    5165              :      * later during InitializeWalConsistencyChecking(). But in that case, the
    5166              :      * postmaster will exit anyway, so it's safe to proceed with the
    5167              :      * assignment.
    5168              :      *
    5169              :      * Any built-in resource managers specified are assigned immediately,
    5170              :      * which affects WAL created before shared_preload_libraries are
    5171              :      * processed. Any custom resource managers specified won't be assigned
    5172              :      * until after shared_preload_libraries are processed, but that's OK
    5173              :      * because WAL for a custom resource manager can't be written before the
    5174              :      * module is loaded anyway.
    5175              :      */
    5176         2262 :     wal_consistency_checking = extra;   /* the per-rmgr bool array built by check_wal_consistency_checking() */
    5177         2262 : }
    5178              : 
    5179              : /*
    5180              :  * InitializeWalConsistencyChecking: run after loading custom resource managers
    5181              :  *
    5182              :  * If any unknown resource managers were specified in the
    5183              :  * wal_consistency_checking GUC, processing was deferred.  Now that
    5184              :  * shared_preload_libraries have been loaded, process wal_consistency_checking
    5185              :  * again.
    5186              :  */
    5187              : void
    5188         1058 : InitializeWalConsistencyChecking(void)
    5189              : {
    5190              :     Assert(process_shared_preload_libraries_done);
    5191              : 
    5192         1058 :     if (check_wal_consistency_checking_deferred)
    5193              :     {
    5194              :         struct config_generic *guc;
    5195              : 
    5196            0 :         guc = find_option("wal_consistency_checking", false, false, ERROR);
    5197              : 
    5198            0 :         check_wal_consistency_checking_deferred = false;   /* clear first so the Assert below can catch a repeated deferral */
    5199              : 
    5200            0 :         set_config_option_ext("wal_consistency_checking",   /* re-apply the current string with the GUC's existing context/source/role */
    5201              :                               wal_consistency_checking_string,
    5202              :                               guc->scontext, guc->source, guc->srole,
    5203              :                               GUC_ACTION_SET, true, ERROR, false);
    5204              : 
    5205              :         /* checking should not be deferred again */
    5206              :         Assert(!check_wal_consistency_checking_deferred);
    5207              :     }
    5208         1058 : }
    5209              : 
    5210              : /*
    5211              :  * GUC show_hook for archive_command: shows "(disabled)" unless archiving is active
    5212              :  */
    5213              : const char *
    5214         1951 : show_archive_command(void)
    5215              : {
    5216         1951 :     if (XLogArchivingActive())
    5217          100 :         return XLogArchiveCommand;
    5218              :     else
    5219         1851 :         return "(disabled)";   /* the configured command string is not exposed in this case */
    5220              : }
    5221              : 
    5222              : /*
    5223              :  * GUC show_hook for in_hot_standby: "on" while RecoveryInProgress(), else "off"
    5224              :  */
    5225              : const char *
    5226        17136 : show_in_hot_standby(void)
    5227              : {
    5228              :     /*
    5229              :      * We display the actual state based on shared memory, so that this GUC
    5230              :      * reports up-to-date state if examined intra-query.  The underlying
    5231              :      * variable (in_hot_standby_guc) changes only when we transmit a new value
    5232              :      * to the client.
    5233              :      */
    5234        17136 :     return RecoveryInProgress() ? "on" : "off";
    5235              : }
    5236              : 
    5237              : /*
    5238              :  * GUC show_hook for effective_wal_level: "minimal", "replica" or "logical"
    5239              :  */
    5240              : const char *
    5241         1984 : show_effective_wal_level(void)
    5242              : {
    5243         1984 :     if (wal_level == WAL_LEVEL_MINIMAL)
    5244          306 :         return "minimal";   /* checked first, before the recovery state is consulted */
    5245              : 
    5246              :     /*
    5247              :      * During recovery, effective_wal_level reflects the primary's
    5248              :      * configuration rather than the local wal_level value.
    5249              :      */
    5250         1678 :     if (RecoveryInProgress())
    5251           27 :         return IsXLogLogicalInfoEnabled() ? "logical" : "replica";
    5252              : 
    5253         1651 :     return XLogLogicalInfoActive() ? "logical" : "replica";   /* not in recovery: decided by XLogLogicalInfoActive() */
    5254              : }
    5255              : 
    5256              : /*
    5257              :  * Read the control file, set respective GUCs.
    5258              :  *
    5259              :  * This is to be called during startup, including a crash recovery cycle,
    5260              :  * unless in bootstrap mode, where no control file yet exists.  As there's no
    5261              :  * usable shared memory yet (its sizing can depend on the contents of the
    5262              :  * control file!), first store the contents in local memory. XLOGShmemInit()
    5263              :  * will then copy it to shared memory later.
    5264              :  *
    5265              :  * reset just controls whether previous contents are to be expected (in the
    5266              :  * reset case, there's a dangling pointer into old shared memory), or not.
    5267              :  */
    5268              : void
    5269         1076 : LocalProcessControlFile(bool reset)
    5270              : {
    5271              :     Assert(reset || ControlFile == NULL);
    5272         1076 :     LocalControlFile = palloc_object(ControlFileData);   /* process-local copy; shared memory is not usable yet */
    5273         1076 :     ControlFile = LocalControlFile;   /* point ControlFile at the local copy for now */
    5274         1076 :     ReadControlFile();
    5275         1076 :     SetLocalDataChecksumState(ControlFile->data_checksum_version);   /* seed local checksum state from the data just read */
    5276         1076 : }
    5277              : 
    5278              : /*
    5279              :  * Get the wal_level from the control file. For a standby, this value should be
    5280              :  * considered as its active wal_level, because it may be different from what
    5281              :  * was originally configured on standby.
    5282              :  */
    5283              : WalLevel
    5284            0 : GetActiveWalLevelOnStandby(void)
    5285              : {
    5286            0 :     return ControlFile->wal_level;   /* read directly, without taking any lock here */
    5287              : }
    5288              : 
    5289              : /*
    5290              :  * Register shared memory for XLOG: the "XLOG Ctl" and "Control File" structs.
    5291              :  */
    5292              : static void
    5293         1234 : XLOGShmemRequest(void *arg)
    5294              : {
    5295              :     Size        size;
    5296              : 
    5297              :     /*
    5298              :      * If the value of wal_buffers is -1, use the preferred auto-tune value.
    5299              :      * This isn't an amazingly clean place to do this, but we must wait till
    5300              :      * NBuffers has received its final value, and must do it before using the
    5301              :      * value of XLOGbuffers to do anything important.
    5302              :      *
    5303              :      * We prefer to report this value's source as PGC_S_DYNAMIC_DEFAULT.
    5304              :      * However, if the DBA explicitly set wal_buffers = -1 in the config file,
    5305              :      * then PGC_S_DYNAMIC_DEFAULT will fail to override that and we must force
    5306              :      * the matter with PGC_S_OVERRIDE.
    5307              :      */
    5308         1234 :     if (XLOGbuffers == -1)
    5309              :     {
    5310              :         char        buf[32];
    5311              : 
    5312         1229 :         snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
    5313         1229 :         SetConfigOption("wal_buffers", buf, PGC_POSTMASTER,
    5314              :                         PGC_S_DYNAMIC_DEFAULT);
    5315         1229 :         if (XLOGbuffers == -1)  /* failed to apply it? */
    5316            0 :             SetConfigOption("wal_buffers", buf, PGC_POSTMASTER,
    5317              :                             PGC_S_OVERRIDE);
    5318              :     }
    5319              :     Assert(XLOGbuffers > 0);
    5320              : 
    5321              :     /* XLogCtl */
    5322         1234 :     size = sizeof(XLogCtlData);
    5323              : 
    5324              :     /* WAL insertion locks, plus alignment */
    5325         1234 :     size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));   /* the +1 entry is the alignment slack */
    5326              :     /* xlblocks array */
    5327         1234 :     size = add_size(size, mul_size(sizeof(pg_atomic_uint64), XLOGbuffers));
    5328              :     /* extra alignment padding for XLOG I/O buffers */
    5329         1234 :     size = add_size(size, Max(XLOG_BLCKSZ, PG_IO_ALIGN_SIZE));
    5330              :     /* and the buffers themselves */
    5331         1234 :     size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
    5332              : 
    5333         1234 :     ShmemRequestStruct(.name = "XLOG Ctl",
    5334              :                        .size = size,
    5335              :                        .ptr = (void **) &XLogCtl,
    5336              :         );
    5337         1234 :     ShmemRequestStruct(.name = "Control File",
    5338              :                        .size = sizeof(ControlFileData),
    5339              :                        .ptr = (void **) &ControlFile,
    5340              :         );
    5341         1234 : }
    5342              : 
    5343              : /*
    5344              :  * XLOGShmemInit - initialize the XLogCtl shared memory area.
                      :  *
                      :  * Zeroes the XLogCtlData header, moves any locally-read control file
                      :  * contents into the shared ControlFile struct, and then carves the space
                      :  * following the header into three regions, in this order: the xlblocks
                      :  * array, the WAL insertion locks, and the WAL page buffers.  Each region
                      :  * is initialized as it is laid out.  Finally, the remaining XLogCtl
                      :  * fields, spinlocks and atomics are given their starting values.
                      :  *
                      :  * The 'arg' parameter is not used by this function.
    5345              :  */
    5346              : static void
    5347         1231 : XLOGShmemInit(void *arg)
    5348              : {
    5349              :     char       *allocptr;
    5350              :     int         i;
    5351              : 
    5352              : #ifdef WAL_DEBUG
    5353              : 
    5354              :     /*
    5355              :      * Create a memory context for WAL debugging that's exempt from the normal
    5356              :      * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
    5357              :      * an allocation fails, but wal_debug is not for production use anyway.
    5358              :      */
    5359              :     if (walDebugCxt == NULL)
    5360              :     {
    5361              :         walDebugCxt = AllocSetContextCreate(TopMemoryContext,
    5362              :                                             "WAL Debug",
    5363              :                                             ALLOCSET_DEFAULT_SIZES);
    5364              :         MemoryContextAllowInCriticalSection(walDebugCxt, true);
    5365              :     }
    5366              : #endif
    5367              : 
    5368         1231 :     memset(XLogCtl, 0, sizeof(XLogCtlData));
    5369              : 
    5370              :     /*
    5371              :      * Already have read control file locally, unless in bootstrap mode. Move
    5372              :      * contents into shared memory.
                      :      *
                      :      * The local copy is freed and its pointer cleared once the contents
                      :      * have been copied.
    5373              :      */
    5374         1231 :     if (LocalControlFile)
    5375              :     {
    5376         1060 :         memcpy(ControlFile, LocalControlFile, sizeof(ControlFileData));
    5377         1060 :         pfree(LocalControlFile);
    5378         1060 :         LocalControlFile = NULL;
    5379              :     }
    5380              : 
    5381              :     /*
    5382              :      * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
    5383              :      * multiple of the alignment for same, so no extra alignment padding is
    5384              :      * needed here.
                      :      *
                      :      * The xlblocks array (one pg_atomic_uint64 per WAL buffer) starts
                      :      * immediately after the XLogCtlData header; every entry is initialized
                      :      * to InvalidXLogRecPtr.
    5385              :      */
    5386         1231 :     allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
    5387         1231 :     XLogCtl->xlblocks = (pg_atomic_uint64 *) allocptr;
    5388         1231 :     allocptr += sizeof(pg_atomic_uint64) * XLOGbuffers;
    5389              : 
    5390       363030 :     for (i = 0; i < XLOGbuffers; i++)
    5391              :     {
    5392       361799 :         pg_atomic_init_u64(&XLogCtl->xlblocks[i], InvalidXLogRecPtr);
    5393              :     }
    5394              : 
    5395              :     /*
                      :      * WAL insertion locks. Ensure they're aligned to the full padded size.
                      :      *
                      :      * Note that the adjustment below always advances allocptr, by at least
                      :      * one byte and by up to a whole WALInsertLockPadded when it was already
                      :      * aligned.  The shared memory size request reserves
                      :      * NUM_XLOGINSERT_LOCKS + 1 padded slots to leave room for this.
                      :      *
                      :      * The resulting address is stored both in XLogCtl->Insert and in the
                      :      * WALInsertLocks variable.
                      :      */
    5396         1231 :     allocptr += sizeof(WALInsertLockPadded) -
    5397         1231 :         ((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
    5398         1231 :     WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
    5399              :         (WALInsertLockPadded *) allocptr;
    5400         1231 :     allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;
    5401              : 
    5402        11079 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    5403              :     {
    5404         9848 :         LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
    5405         9848 :         pg_atomic_init_u64(&WALInsertLocks[i].l.insertingAt, InvalidXLogRecPtr);
    5406         9848 :         WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
    5407              :     }
    5408              : 
    5409              :     /*
    5410              :      * Align the start of the page buffers to a full xlog block size boundary.
    5411              :      * This simplifies some calculations in XLOG insertion. It is also
    5412              :      * required for O_DIRECT.
                      :      *
                      :      * All XLOGbuffers pages are zero-filled here.
    5413              :      */
    5414         1231 :     allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
    5415         1231 :     XLogCtl->pages = allocptr;
    5416         1231 :     memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
    5417              : 
    5418              :     /*
    5419              :      * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
    5420              :      * in additional info.)
    5421              :      */
    5422         1231 :     XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
    5423         1231 :     XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
    5424         1231 :     XLogCtl->InstallXLogFileSegmentActive = false;
    5425         1231 :     XLogCtl->WalWriterSleeping = false;
    5426              : 
    5427              :     /*
                      :      * Use the checksum info from control file: copy it into XLogCtl and
                      :      * pass the same value to SetLocalDataChecksumState().
                      :      */
    5428         1231 :     XLogCtl->data_checksum_version = ControlFile->data_checksum_version;
    5429              : 
    5430         1231 :     SetLocalDataChecksumState(XLogCtl->data_checksum_version);
    5431              : 
    5432         1231 :     SpinLockInit(&XLogCtl->Insert.insertpos_lck);
    5433         1231 :     SpinLockInit(&XLogCtl->info_lck);
    5434         1231 :     pg_atomic_init_u64(&XLogCtl->logInsertResult, InvalidXLogRecPtr);
    5435         1231 :     pg_atomic_init_u64(&XLogCtl->logWriteResult, InvalidXLogRecPtr);
    5436         1231 :     pg_atomic_init_u64(&XLogCtl->logFlushResult, InvalidXLogRecPtr);
    5437         1231 :     pg_atomic_init_u64(&XLogCtl->unloggedLSN, InvalidXLogRecPtr);
    5438         1231 : }
    5439              : 
    5440              : /*
    5441              :  * XLOGShmemAttach - re-establish WALInsertLocks pointer after attaching.
                      :  *
                      :  * Copies the lock array address saved in XLogCtl->Insert.WALInsertLocks
                      :  * into the WALInsertLocks variable.  Nothing in shared memory is modified.
                      :  *
                      :  * The 'arg' parameter is not used by this function.
    5442              :  */
    5443              : static void
    5444            0 : XLOGShmemAttach(void *arg)
    5445              : {
    5446            0 :     WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
    5447            0 : }
    5448              : 
    5449              : /*
    5450              :  * This function must be called ONCE on system install.  It creates
    5451              :  * pg_control and the initial XLOG segment.
                      :  *
                      :  * In order, it:
                      :  *  - derives a system identifier from the current time and PID;
                      :  *  - builds the initial shutdown checkpoint record in a single
                      :  *    XLOG_BLCKSZ page and seeds the in-memory transaction, OID, multixact
                      :  *    and commit-timestamp state from the same values;
                      :  *  - writes that page to segment 1 on BootstrapTimeLineID, then fsyncs
                      :  *    and closes the file;
                      :  *  - initializes and writes the control file;
                      :  *  - bootstraps CLOG, commit timestamps, SUBTRANS and MultiXact;
                      :  *  - reads the control file back in.
                      :  *
                      :  * data_checksum_version is stored in the initial checkpoint record and is
                      :  * passed on to InitControlFile().
                      :  *
                      :  * A failure to write, fsync or close the segment file is reported at
                      :  * PANIC level.
    5452              :  */
    5453              : void
    5454           57 : BootStrapXLOG(uint32 data_checksum_version)
    5455              : {
    5456              :     CheckPoint  checkPoint;
    5457              :     PGAlignedXLogBlock buffer;
    5458              :     XLogPageHeader page;
    5459              :     XLogLongPageHeader longpage;
    5460              :     XLogRecord *record;
    5461              :     char       *recptr;
    5462              :     uint64      sysidentifier;
    5463              :     struct timeval tv;
    5464              :     pg_crc32c   crc;
    5465              : 
    5466              :     /* allow ordinary WAL segment creation, like StartupXLOG() would */
    5467           57 :     SetInstallXLogFileSegmentActive();
    5468              : 
    5469              :     /*
    5470              :      * Select a hopefully-unique system identifier code for this installation.
    5471              :      * We use the result of gettimeofday(), including the fractional seconds
    5472              :      * field, as being about as unique as we can easily get.  (Think not to
    5473              :      * use random(), since it hasn't been seeded and there's no portable way
    5474              :      * to seed it other than the system clock value...)  The upper half of the
    5475              :      * uint64 value is just the tv_sec part, while the lower half contains the
    5476              :      * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
    5477              :      * PID for a little extra uniqueness.  A person knowing this encoding can
    5478              :      * determine the initialization time of the installation, which could
    5479              :      * perhaps be useful sometimes.
    5480              :      */
    5481           57 :     gettimeofday(&tv, NULL);
    5482           57 :     sysidentifier = ((uint64) tv.tv_sec) << 32;
    5483           57 :     sysidentifier |= ((uint64) tv.tv_usec) << 12;
    5484           57 :     sysidentifier |= getpid() & 0xFFF;
    5485              : 
    5486           57 :     memset(&buffer, 0, sizeof buffer);
    5487           57 :     page = (XLogPageHeader) &buffer;
    5488              : 
    5489              :     /*
    5490              :      * Set up information for the initial checkpoint record
    5491              :      *
    5492              :      * The initial checkpoint record is written to the beginning of the WAL
    5493              :      * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
    5494              :      * used, so that we can use 0/0 to mean "before any valid WAL segment".
                      :      *
                      :      * Hence the redo pointer is one full segment plus the size of the long
                      :      * page header that starts that segment's first page.
    5495              :      */
    5496           57 :     checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD;
    5497           57 :     checkPoint.ThisTimeLineID = BootstrapTimeLineID;
    5498           57 :     checkPoint.PrevTimeLineID = BootstrapTimeLineID;
    5499           57 :     checkPoint.fullPageWrites = fullPageWrites;
    5500           57 :     checkPoint.logicalDecodingEnabled = (wal_level == WAL_LEVEL_LOGICAL);
    5501           57 :     checkPoint.wal_level = wal_level;
    5502              :     checkPoint.nextXid =
    5503           57 :         FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
    5504           57 :     checkPoint.nextOid = FirstGenbkiObjectId;
    5505           57 :     checkPoint.nextMulti = FirstMultiXactId;
    5506           57 :     checkPoint.nextMultiOffset = 1;
    5507           57 :     checkPoint.oldestXid = FirstNormalTransactionId;
    5508           57 :     checkPoint.oldestXidDB = Template1DbOid;
    5509           57 :     checkPoint.oldestMulti = FirstMultiXactId;
    5510           57 :     checkPoint.oldestMultiDB = Template1DbOid;
    5511           57 :     checkPoint.oldestCommitTsXid = InvalidTransactionId;
    5512           57 :     checkPoint.newestCommitTsXid = InvalidTransactionId;
    5513           57 :     checkPoint.time = (pg_time_t) time(NULL);
    5514           57 :     checkPoint.oldestActiveXid = InvalidTransactionId;
    5515           57 :     checkPoint.dataChecksumState = data_checksum_version;
    5516              :     /* Seed the in-memory counters and limits from the checkpoint values */
    5517           57 :     TransamVariables->nextXid = checkPoint.nextXid;
    5518           57 :     TransamVariables->nextOid = checkPoint.nextOid;
    5519           57 :     TransamVariables->oidCount = 0;
    5520           57 :     MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
    5521           57 :     AdvanceOldestClogXid(checkPoint.oldestXid);
    5522           57 :     SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
    5523           57 :     SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
    5524           57 :     SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
    5525              : 
    5526              :     /*
                      :      * Set up the XLOG page header.  The page carries a long header, so the
                      :      * same memory is also addressed as an XLogLongPageHeader to fill in
                      :      * the system identifier, segment size and block size.
                      :      */
    5527           57 :     page->xlp_magic = XLOG_PAGE_MAGIC;
    5528           57 :     page->xlp_info = XLP_LONG_HEADER;
    5529           57 :     page->xlp_tli = BootstrapTimeLineID;
    5530           57 :     page->xlp_pageaddr = wal_segment_size;
    5531           57 :     longpage = (XLogLongPageHeader) page;
    5532           57 :     longpage->xlp_sysid = sysidentifier;
    5533           57 :     longpage->xlp_seg_size = wal_segment_size;
    5534           57 :     longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
    5535              : 
    5536              :     /*
                      :      * Insert the initial checkpoint record, directly after the long page
                      :      * header.  Its total length is the record header, plus the short data
                      :      * header, plus the CheckPoint struct itself.
                      :      */
    5537           57 :     recptr = ((char *) page + SizeOfXLogLongPHD);
    5538           57 :     record = (XLogRecord *) recptr;
    5539           57 :     record->xl_prev = InvalidXLogRecPtr;
    5540           57 :     record->xl_xid = InvalidTransactionId;
    5541           57 :     record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
    5542           57 :     record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
    5543           57 :     record->xl_rmid = RM_XLOG_ID;
    5544           57 :     recptr += SizeOfXLogRecord;
    5545              :     /*
                      :      * fill the XLogRecordDataHeaderShort struct: one byte holding the
                      :      * block ID, then one byte holding the payload length
                      :      */
    5546           57 :     *(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
    5547           57 :     *(recptr++) = sizeof(checkPoint);
    5548           57 :     memcpy(recptr, &checkPoint, sizeof(checkPoint));
    5549           57 :     recptr += sizeof(checkPoint);
    5550              :     Assert(recptr - (char *) record == record->xl_tot_len);
    5551              :     /* CRC covers the bytes after the record header first, then the header up to xl_crc */
    5552           57 :     INIT_CRC32C(crc);
    5553           57 :     COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
    5554           57 :     COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
    5555           57 :     FIN_CRC32C(crc);
    5556           57 :     record->xl_crc = crc;
    5557              : 
    5558              :     /* Create first XLOG segment file (segment number 1) */
    5559           57 :     openLogTLI = BootstrapTimeLineID;
    5560           57 :     openLogFile = XLogFileInit(1, BootstrapTimeLineID);
    5561              : 
    5562              :     /*
    5563              :      * We needn't bother with Reserve/ReleaseExternalFD here, since we'll
    5564              :      * close the file again in a moment.
    5565              :      */
    5566              : 
    5567              :     /*
                      :      * Write the first page with the initial record.  errno is cleared
                      :      * beforehand so that a short write which leaves errno untouched can be
                      :      * told apart from one that reported an error.
                      :      */
    5568           57 :     errno = 0;
    5569           57 :     pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
    5570           57 :     if (write(openLogFile, &buffer, XLOG_BLCKSZ) != XLOG_BLCKSZ)
    5571              :     {
    5572              :         /* if write didn't set errno, assume problem is no disk space */
    5573            0 :         if (errno == 0)
    5574            0 :             errno = ENOSPC;
    5575            0 :         ereport(PANIC,
    5576              :                 (errcode_for_file_access(),
    5577              :                  errmsg("could not write bootstrap write-ahead log file: %m")));
    5578              :     }
    5579           57 :     pgstat_report_wait_end();
    5580              :     /* Flush the page to disk before going on to create pg_control */
    5581           57 :     pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
    5582           57 :     if (pg_fsync(openLogFile) != 0)
    5583            0 :         ereport(PANIC,
    5584              :                 (errcode_for_file_access(),
    5585              :                  errmsg("could not fsync bootstrap write-ahead log file: %m")));
    5586           57 :     pgstat_report_wait_end();
    5587              : 
    5588           57 :     if (close(openLogFile) != 0)
    5589            0 :         ereport(PANIC,
    5590              :                 (errcode_for_file_access(),
    5591              :                  errmsg("could not close bootstrap write-ahead log file: %m")));
    5592              : 
    5593           57 :     openLogFile = -1;
    5594              : 
    5595              :     /*
                      :      * Now create pg_control.  Its checkpoint pointer is set to the redo
                      :      * location computed above, and it keeps a full copy of the checkpoint.
                      :      */
    5596           57 :     InitControlFile(sysidentifier, data_checksum_version);
    5597           57 :     ControlFile->time = checkPoint.time;
    5598           57 :     ControlFile->checkPoint = checkPoint.redo;
    5599           57 :     ControlFile->checkPointCopy = checkPoint;
    5600              : 
    5601              :     /* some additional ControlFile fields are set in WriteControlFile() */
    5602           57 :     WriteControlFile();
    5603              : 
    5604              :     /* Bootstrap the commit log, too (plus commit timestamps, subtrans and multixact) */
    5605           57 :     BootStrapCLOG();
    5606           57 :     BootStrapCommitTs();
    5607           57 :     BootStrapSUBTRANS();
    5608           57 :     BootStrapMultiXact();
    5609              : 
    5610              :     /*
    5611              :      * Force control file to be read - in contrast to normal processing we'd
    5612              :      * otherwise never run the checks and GUC related initializations therein.
    5613              :      */
    5614           57 :     ReadControlFile();
    5615           57 : }
    5616              : /*
                      :  * str_time - format a timestamp as text.
                      :  *
                      :  * Converts tnow to broken-down time in log_timezone and formats it with
                      :  * the pattern "%Y-%m-%d %H:%M:%S %Z" into buf, which the caller supplies
                      :  * along with its size in bufsize.
                      :  *
                      :  * Returns buf.
                      :  */
    5617              : static char *
    5618          950 : str_time(pg_time_t tnow, char *buf, size_t bufsize)
    5619              : {
    5620          950 :     pg_strftime(buf, bufsize,
    5621              :                 "%Y-%m-%d %H:%M:%S %Z",
    5622          950 :                 pg_localtime(&tnow, log_timezone));
    5623              : 
    5624          950 :     return buf;
    5625              : }
    5626              : 
    5627              : /*
    5628              :  * Initialize the first WAL segment on new timeline.
                      :  *
                      :  * endTLI and endOfLog give the timeline and the WAL position at which the
                      :  * old timeline ends; newTLI is the timeline the new segment is created on.
                      :  * The two timelines must differ.
                      :  *
                      :  * If endOfLog falls inside a segment, that segment is copied from endTLI
                      :  * to newTLI up to the switch point.  If it falls exactly on a segment
                      :  * boundary, a fresh segment is created on newTLI instead.  In both cases
                      :  * any archive status files for the new segment are removed afterwards.
                      :  *
                      :  * A failure to close the freshly created segment is reported at ERROR
                      :  * level.
    5629              :  */
    5630              : static void
    5631           55 : XLogInitNewTimeline(TimeLineID endTLI, XLogRecPtr endOfLog, TimeLineID newTLI)
    5632              : {
    5633              :     char        xlogfname[MAXFNAMELEN];
    5634              :     XLogSegNo   endLogSegNo;
    5635              :     XLogSegNo   startLogSegNo;
    5636              : 
    5637              :     /* we always switch to a new timeline after archive recovery */
    5638              :     Assert(endTLI != newTLI);
    5639              : 
    5640              :     /*
    5641              :      * Update min recovery point one last time.
    5642              :      */
    5643           55 :     UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
    5644              : 
    5645              :     /*
    5646              :      * Calculate the last segment on the old timeline, and the first segment
    5647              :      * on the new timeline. If the switch happens in the middle of a segment,
    5648              :      * they are the same, but if the switch happens exactly at a segment
    5649              :      * boundary, startLogSegNo will be endLogSegNo + 1.
    5650              :      */
    5651           55 :     XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size);
    5652           55 :     XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size);
    5653              : 
    5654              :     /*
    5655              :      * Initialize the starting WAL segment for the new timeline. If the switch
    5656              :      * happens in the middle of a segment, copy data from the last WAL segment
    5657              :      * of the old timeline up to the switch point, to the starting WAL segment
    5658              :      * on the new timeline.
    5659              :      */
    5660           55 :     if (endLogSegNo == startLogSegNo)
    5661              :     {
    5662              :         /*
    5663              :          * Make a copy of the file on the new timeline.
    5664              :          *
    5665              :          * Writing WAL isn't allowed yet, so there are no locking
    5666              :          * considerations. But we should be just as tense as XLogFileInit to
    5667              :          * avoid emplacing a bogus file.
                      :          *
                      :          * The last argument is the offset of endOfLog within its segment,
                      :          * i.e. how far into the source segment the switch point lies.
    5668              :          */
    5669           43 :         XLogFileCopy(newTLI, endLogSegNo, endTLI, endLogSegNo,
    5670           43 :                      XLogSegmentOffset(endOfLog, wal_segment_size));
    5671              :     }
    5672              :     else
    5673              :     {
    5674              :         /*
    5675              :          * The switch happened at a segment boundary, so just create the next
    5676              :          * segment on the new timeline.
                      :          *
                      :          * The descriptor returned by XLogFileInit is not needed here, so it
                      :          * is closed right away.  errno is saved across XLogFileName() so
                      :          * that %m in the error message reports the close() failure.
    5677              :          */
    5678              :         int         fd;
    5679              : 
    5680           12 :         fd = XLogFileInit(startLogSegNo, newTLI);
    5681              : 
    5682           12 :         if (close(fd) != 0)
    5683              :         {
    5684            0 :             int         save_errno = errno;
    5685              : 
    5686            0 :             XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size);
    5687            0 :             errno = save_errno;
    5688            0 :             ereport(ERROR,
    5689              :                     (errcode_for_file_access(),
    5690              :                      errmsg("could not close file \"%s\": %m", xlogfname)));
    5691              :         }
    5692              :     }
    5693              : 
    5694              :     /*
    5695              :      * Let's just make real sure there are no .ready or .done flags posted
    5696              :      * for the new segment.
    5697              :      */
    5698           55 :     XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size);
    5699           55 :     XLogArchiveCleanup(xlogfname);
    5700           55 : }
    5701              : 
    5702              : /*
    5703              :  * Perform cleanup actions at the conclusion of archive recovery.
                      :  *
                      :  * EndOfLogTLI and EndOfLog identify the timeline and WAL position at which
                      :  * the old timeline ends; newTLI is the timeline that was switched to.
                      :  *
                      :  * Three things happen here, in order:
                      :  *  - recovery_end_command is executed, if one is configured;
                      :  *  - old-timeline segments that are not part of the new timeline's
                      :  *    history are removed;
                      :  *  - if WAL archiving is active and the switch happened in the middle of
                      :  *    a segment, the last old-timeline segment is renamed with a .partial
                      :  *    suffix and handed to the archiver, unless it already has a .ready or
                      :  *    .done status file.
    5704              :  */
    5705              : static void
    5706           55 : CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog,
    5707              :                             TimeLineID newTLI)
    5708              : {
    5709              :     /*
    5710              :      * Execute the recovery_end_command, if any.
    5711              :      */
    5712           55 :     if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0)
    5713            2 :         ExecuteRecoveryCommand(recoveryEndCommand,
    5714              :                                "recovery_end_command",
    5715              :                                true,
    5716              :                                WAIT_EVENT_RECOVERY_END_COMMAND);
    5717              : 
    5718              :     /*
    5719              :      * We switched to a new timeline. Clean up segments on the old timeline.
    5720              :      *
    5721              :      * If there are any higher-numbered segments on the old timeline, remove
    5722              :      * them. They might contain valid WAL, but they might also be
    5723              :      * pre-allocated files containing garbage. In any case, they are not part
    5724              :      * of the new timeline's history so we don't need them.
    5725              :      */
    5726           55 :     RemoveNonParentXlogFiles(EndOfLog, newTLI);
    5727              : 
    5728              :     /*
    5729              :      * If the switch happened in the middle of a segment, what to do with the
    5730              :      * last, partial segment on the old timeline? If we don't archive it, and
    5731              :      * the server that created the WAL never archives it either (e.g. because
    5732              :      * it was hit by a meteor), it will never make it to the archive. That's
    5733              :      * OK from our point of view, because the new segment that we created with
    5734              :      * the new TLI contains all the WAL from the old timeline up to the switch
    5735              :      * point. But if you later try to do PITR to the "missing" WAL on the old
    5736              :      * timeline, recovery won't find it in the archive. It's physically
    5737              :      * present in the new file with new TLI, but recovery won't look there
    5738              :      * when it's recovering to the older timeline. On the other hand, if we
    5739              :      * archive the partial segment, and the original server on that timeline
    5740              :      * is still running and archives the completed version of the same segment
    5741              :      * later, it will fail. (We used to do that in 9.4 and below, and it
    5742              :      * caused such problems).
    5743              :      *
    5744              :      * As a compromise, we rename the last segment with the .partial suffix,
    5745              :      * and archive it. Archive recovery will never try to read .partial
    5746              :      * segments, so they will normally go unused. But in the odd PITR case,
    5747              :      * the administrator can copy them manually to the pg_wal directory
    5748              :      * (removing the suffix). They can be useful in debugging, too.
    5749              :      *
    5750              :      * If a .done or .ready file already exists for the old timeline, however,
    5751              :      * we had already determined that the segment is complete, so we can let
    5752              :      * it be archived normally. (In particular, if it was restored from the
    5753              :      * archive to begin with, it's expected to have a .done file).
    5754              :      */
    5755           55 :     if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
    5756              :         XLogArchivingActive())
    5757              :     {
    5758              :         char        origfname[MAXFNAMELEN];
    5759              :         XLogSegNo   endLogSegNo;
    5760              :         /* Work out the name of the last segment on the old timeline */
    5761            9 :         XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
    5762            9 :         XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
    5763              : 
    5764            9 :         if (!XLogArchiveIsReadyOrDone(origfname))
    5765              :         {
    5766              :             char        origpath[MAXPGPATH];
    5767              :             char        partialfname[MAXFNAMELEN];
    5768              :             char        partialpath[MAXPGPATH];
    5769              : 
    5770              :             /*
    5771              :              * If we're summarizing WAL, we can't rename the partial file
    5772              :              * until the summarizer finishes with it, else it will fail.
    5773              :              */
    5774            5 :             if (summarize_wal)
    5775            1 :                 WaitForWalSummarization(EndOfLog);
    5776              :             /* Build the original path, and the .partial file name and path */
    5777            5 :             XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
    5778            5 :             snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
    5779            5 :             snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
    5780              : 
    5781              :             /*
    5782              :              * Make sure there's no .done or .ready file for the .partial
    5783              :              * file.
    5784              :              */
    5785            5 :             XLogArchiveCleanup(partialfname);
    5786              :             /* Rename durably (ERROR on failure), then notify the archiver */
    5787            5 :             durable_rename(origpath, partialpath, ERROR);
    5788            5 :             XLogArchiveNotify(partialfname);
    5789              :         }
    5790              :     }
    5791           55 : }
    5792              : 
    5793              : /*
    5794              :  * Check to see if required parameters are set high enough on this server
    5795              :  * for various aspects of recovery operation.
    5796              :  *
    5797              :  * Note that all the parameters which this function tests need to be
    5798              :  * listed in the Administrator's Overview section in high-availability.sgml.
    5799              :  * If you change them, don't forget to update the list.
                      :  *
                      :  * Both checks apply only when archive recovery was requested.  A control
                      :  * file wal_level of "minimal" is reported at FATAL level.  When hot
                      :  * standby is also enabled, each listed setting is handed to
                      :  * RecoveryRequiresIntParameter() together with the corresponding value
                      :  * recorded in the control file.
    5800              :  */
    5801              : static void
    5802          265 : CheckRequiredParameterValues(void)
    5803              : {
    5804              :     /*
    5805              :      * For archive recovery, the WAL must be generated with at least 'replica'
    5806              :      * wal_level.
    5807              :      */
    5808          265 :     if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
    5809              :     {
    5810            2 :         ereport(FATAL,
    5811              :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    5812              :                  errmsg("WAL was generated with \"wal_level=minimal\", cannot continue recovering"),
    5813              :                  errdetail("This happens if you temporarily set \"wal_level=minimal\" on the server."),
    5814              :                  errhint("Use a backup taken after setting \"wal_level\" to higher than \"minimal\".")));
    5815              :     }
    5816              : 
    5817              :     /*
    5818              :      * For Hot Standby, the WAL must be generated with 'replica' mode, and we
    5819              :      * must have at least as many backend slots as the primary.
                      :      *
                      :      * Each call below passes the parameter name, this server's current
                      :      * setting, and the value stored in the control file.
    5820              :      */
    5821          263 :     if (ArchiveRecoveryRequested && EnableHotStandby)
    5822              :     {
    5823              :         /* We ignore autovacuum_worker_slots when we make this test. */
    5824          139 :         RecoveryRequiresIntParameter("max_connections",
    5825              :                                      MaxConnections,
    5826          139 :                                      ControlFile->MaxConnections);
    5827          139 :         RecoveryRequiresIntParameter("max_worker_processes",
    5828              :                                      max_worker_processes,
    5829          139 :                                      ControlFile->max_worker_processes);
    5830          139 :         RecoveryRequiresIntParameter("max_wal_senders",
    5831              :                                      max_wal_senders,
    5832          139 :                                      ControlFile->max_wal_senders);
    5833          139 :         RecoveryRequiresIntParameter("max_prepared_transactions",
    5834              :                                      max_prepared_xacts,
    5835          139 :                                      ControlFile->max_prepared_xacts);
    5836          139 :         RecoveryRequiresIntParameter("max_locks_per_transaction",
    5837              :                                      max_locks_per_xact,
    5838          139 :                                      ControlFile->max_locks_per_xact);
    5839              :     }
    5840          263 : }
    5841              : 
    5842              : /*
    5843              :  * This must be called ONCE during postmaster or standalone-backend startup
    5844              :  */
    5845              : void
    5846         1074 : StartupXLOG(void)
    5847              : {
    5848              :     XLogCtlInsert *Insert;
    5849              :     CheckPoint  checkPoint;
    5850              :     bool        wasShutdown;
    5851              :     bool        didCrash;
    5852              :     bool        haveTblspcMap;
    5853              :     bool        haveBackupLabel;
    5854              :     XLogRecPtr  EndOfLog;
    5855              :     TimeLineID  EndOfLogTLI;
    5856              :     TimeLineID  newTLI;
    5857              :     bool        performedWalRecovery;
    5858              :     EndOfWalRecoveryInfo *endOfRecoveryInfo;
    5859              :     XLogRecPtr  abortedRecPtr;
    5860              :     XLogRecPtr  missingContrecPtr;
    5861              :     TransactionId oldestActiveXID;
    5862         1074 :     bool        promoted = false;
    5863              :     char        timebuf[128];
    5864              : 
    5865              :     /*
    5866              :      * We should have an aux process resource owner to use, and we should not
    5867              :      * be in a transaction that's installed some other resowner.
    5868              :      */
    5869              :     Assert(AuxProcessResourceOwner != NULL);
    5870              :     Assert(CurrentResourceOwner == NULL ||
    5871              :            CurrentResourceOwner == AuxProcessResourceOwner);
    5872         1074 :     CurrentResourceOwner = AuxProcessResourceOwner;
    5873              : 
    5874              :     /*
    5875              :      * Check that contents look valid.
    5876              :      */
    5877         1074 :     if (!XRecOffIsValid(ControlFile->checkPoint))
    5878            0 :         ereport(FATAL,
    5879              :                 (errcode(ERRCODE_DATA_CORRUPTED),
    5880              :                  errmsg("control file contains invalid checkpoint location")));
    5881              : 
    5882         1074 :     switch (ControlFile->state)
    5883              :     {
    5884          853 :         case DB_SHUTDOWNED:
    5885              : 
    5886              :             /*
    5887              :              * This is the expected case, so don't be chatty in standalone
    5888              :              * mode
    5889              :              */
    5890          853 :             ereport(IsPostmasterEnvironment ? LOG : NOTICE,
    5891              :                     (errmsg("database system was shut down at %s",
    5892              :                             str_time(ControlFile->time,
    5893              :                                      timebuf, sizeof(timebuf)))));
    5894          853 :             break;
    5895              : 
    5896           33 :         case DB_SHUTDOWNED_IN_RECOVERY:
    5897           33 :             ereport(LOG,
    5898              :                     (errmsg("database system was shut down in recovery at %s",
    5899              :                             str_time(ControlFile->time,
    5900              :                                      timebuf, sizeof(timebuf)))));
    5901           33 :             break;
    5902              : 
    5903            0 :         case DB_SHUTDOWNING:
    5904            0 :             ereport(LOG,
    5905              :                     (errmsg("database system shutdown was interrupted; last known up at %s",
    5906              :                             str_time(ControlFile->time,
    5907              :                                      timebuf, sizeof(timebuf)))));
    5908            0 :             break;
    5909              : 
    5910            0 :         case DB_IN_CRASH_RECOVERY:
    5911            0 :             ereport(LOG,
    5912              :                     (errmsg("database system was interrupted while in recovery at %s",
    5913              :                             str_time(ControlFile->time,
    5914              :                                      timebuf, sizeof(timebuf))),
    5915              :                      errhint("This probably means that some data is corrupted and"
    5916              :                              " you will have to use the last backup for recovery.")));
    5917            0 :             break;
    5918              : 
    5919            7 :         case DB_IN_ARCHIVE_RECOVERY:
    5920            7 :             ereport(LOG,
    5921              :                     (errmsg("database system was interrupted while in recovery at log time %s",
    5922              :                             str_time(ControlFile->checkPointCopy.time,
    5923              :                                      timebuf, sizeof(timebuf))),
    5924              :                      errhint("If this has occurred more than once some data might be corrupted"
    5925              :                              " and you might need to choose an earlier recovery target.")));
    5926            7 :             break;
    5927              : 
    5928          181 :         case DB_IN_PRODUCTION:
    5929          181 :             ereport(LOG,
    5930              :                     (errmsg("database system was interrupted; last known up at %s",
    5931              :                             str_time(ControlFile->time,
    5932              :                                      timebuf, sizeof(timebuf)))));
    5933          181 :             break;
    5934              : 
    5935            0 :         default:
    5936            0 :             ereport(FATAL,
    5937              :                     (errcode(ERRCODE_DATA_CORRUPTED),
    5938              :                      errmsg("control file contains invalid database cluster state")));
    5939              :     }
    5940              : 
    5941              :     /* This is just to allow attaching to startup process with a debugger */
    5942              : #ifdef XLOG_REPLAY_DELAY
    5943              :     if (ControlFile->state != DB_SHUTDOWNED)
    5944              :         pg_usleep(60000000L);
    5945              : #endif
    5946              : 
    5947              :     /*
    5948              :      * Verify that pg_wal, pg_wal/archive_status, and pg_wal/summaries exist.
    5949              :      * In cases where someone has performed a copy for PITR, these directories
    5950              :      * may have been excluded and need to be re-created.
    5951              :      */
    5952         1074 :     ValidateXLOGDirectoryStructure();
    5953              : 
    5954              :     /* Set up timeout handler needed to report startup progress. */
    5955         1074 :     if (!IsBootstrapProcessingMode())
    5956         1017 :         RegisterTimeout(STARTUP_PROGRESS_TIMEOUT,
    5957              :                         startup_progress_timeout_handler);
    5958              : 
    5959              :     /*----------
    5960              :      * If we previously crashed, perform a couple of actions:
    5961              :      *
    5962              :      * - The pg_wal directory may still include some temporary WAL segments
    5963              :      *   used when creating a new segment, so perform some clean up to not
    5964              :      *   bloat this path.  This is done first as there is no point to sync
    5965              :      *   this temporary data.
    5966              :      *
    5967              :      * - There might be data which we had written, intending to fsync it, but
    5968              :      *   which we had not actually fsync'd yet.  Therefore, a power failure in
    5969              :      *   the near future might cause earlier unflushed writes to be lost, even
    5970              :      *   though more recent data written to disk from here on would be
    5971              :      *   persisted.  To avoid that, fsync the entire data directory.
    5972              :      */
    5973         1074 :     if (ControlFile->state != DB_SHUTDOWNED &&
    5974          221 :         ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
    5975              :     {
    5976          188 :         RemoveTempXlogFiles();
    5977          188 :         SyncDataDirectory();
    5978          188 :         didCrash = true;
    5979              :     }
    5980              :     else
    5981          886 :         didCrash = false;
    5982              : 
    5983              :     /*
    5984              :      * Prepare for WAL recovery if needed.
    5985              :      *
    5986              :      * InitWalRecovery analyzes the control file and the backup label file, if
    5987              :      * any.  It updates the in-memory ControlFile buffer according to the
    5988              :      * starting checkpoint, and sets InRecovery and ArchiveRecoveryRequested.
    5989              :      * It also applies the tablespace map file, if any.
    5990              :      */
    5991         1074 :     InitWalRecovery(ControlFile, &wasShutdown,
    5992              :                     &haveBackupLabel, &haveTblspcMap);
    5993         1072 :     checkPoint = ControlFile->checkPointCopy;
    5994              : 
    5995              :     /* initialize shared memory variables from the checkpoint record */
    5996         1072 :     TransamVariables->nextXid = checkPoint.nextXid;
    5997         1072 :     TransamVariables->nextOid = checkPoint.nextOid;
    5998         1072 :     TransamVariables->oidCount = 0;
    5999         1072 :     MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
    6000         1072 :     AdvanceOldestClogXid(checkPoint.oldestXid);
    6001         1072 :     SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
    6002         1072 :     SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
    6003         1072 :     SetCommitTsLimit(checkPoint.oldestCommitTsXid,
    6004              :                      checkPoint.newestCommitTsXid);
    6005              : 
    6006              :     /*
    6007              :      * Clear out any old relcache cache files.  This is *necessary* if we do
    6008              :      * any WAL replay, since that would probably result in the cache files
    6009              :      * being out of sync with database reality.  In theory we could leave them
    6010              :      * in place if the database had been cleanly shut down, but it seems
    6011              :      * safest to just remove them always and let them be rebuilt during the
    6012              :      * first backend startup.  These files need to be removed from all
    6013              :      * directories including pg_tblspc; however, the symlinks are created
    6014              :      * only after reading the tablespace_map file in case of archive recovery
    6015              :      * from backup, so we need to clear old relcache files here, after
    6016              :      * creating the symlinks.
    6017              :      */
    6018         1072 :     RelationCacheInitFileRemove();
    6019              : 
    6020              :     /*
    6021              :      * Initialize replication slots, before there's a chance to remove
    6022              :      * required resources.
    6023              :      */
    6024         1072 :     StartupReplicationSlots();
    6025              : 
    6026              :     /*
    6027              :      * Startup the logical decoding status with the last status stored in the
    6028              :      * checkpoint record.
    6029              :      */
    6030         1070 :     StartupLogicalDecodingStatus(checkPoint.logicalDecodingEnabled);
    6031              : 
    6032              :     /*
    6033              :      * Startup logical state, needs to be setup now so we have proper data
    6034              :      * during crash recovery.
    6035              :      */
    6036         1070 :     StartupReorderBuffer();
    6037              : 
    6038              :     /*
    6039              :      * Startup CLOG. This must be done after TransamVariables->nextXid has
    6040              :      * been initialized and before we accept connections or begin WAL replay.
    6041              :      */
    6042         1070 :     StartupCLOG();
    6043              : 
    6044              :     /*
    6045              :      * Startup MultiXact. We need to do this early to be able to replay
    6046              :      * truncations.
    6047              :      */
    6048         1070 :     StartupMultiXact();
    6049              : 
    6050              :     /*
    6051              :      * Ditto for commit timestamps.  Activate the facility if the setting is
    6052              :      * enabled in the control file, as there should be no tracking of commit
    6053              :      * timestamps done when the setting was disabled.  This facility can be
    6054              :      * started or stopped when replaying a XLOG_PARAMETER_CHANGE record.
    6055              :      */
    6056         1070 :     if (ControlFile->track_commit_timestamp)
    6057           14 :         StartupCommitTs();
    6058              : 
    6059              :     /*
    6060              :      * Recover knowledge about replay progress of known replication partners.
    6061              :      */
    6062         1070 :     StartupReplicationOrigin();
    6063              : 
    6064              :     /*
    6065              :      * Initialize unlogged LSN. On a clean shutdown, it's restored from the
    6066              :      * control file. On recovery, all unlogged relations are blown away, so
    6067              :      * the unlogged LSN counter can be reset too.
    6068              :      */
    6069         1070 :     if (ControlFile->state == DB_SHUTDOWNED)
    6070          844 :         pg_atomic_write_membarrier_u64(&XLogCtl->unloggedLSN,
    6071          844 :                                        ControlFile->unloggedLSN);
    6072              :     else
    6073          226 :         pg_atomic_write_membarrier_u64(&XLogCtl->unloggedLSN,
    6074              :                                        FirstNormalUnloggedLSN);
    6075              : 
    6076              :     /*
    6077              :      * Copy any missing timeline history files between 'now' and the recovery
    6078              :      * target timeline from archive to pg_wal. While we don't need those files
    6079              :      * ourselves - the history file of the recovery target timeline covers all
    6080              :      * the previous timelines in the history too - a cascading standby server
    6081              :      * might be interested in them. Or, if you archive the WAL from this
    6082              :      * server to a different archive than the primary, it'd be good for all
    6083              :      * the history files to get archived there after failover, so that you can
    6084              :      * use one of the old timelines as a PITR target. Timeline history files
    6085              :      * are small, so it's better to copy them unnecessarily than not copy them
    6086              :      * and regret later.
    6087              :      */
    6088         1070 :     restoreTimeLineHistoryFiles(checkPoint.ThisTimeLineID, recoveryTargetTLI);
    6089              : 
    6090              :     /*
    6091              :      * Before running in recovery, scan pg_twophase and fill in its status to
    6092              :      * be able to work on entries generated by redo.  Doing a scan before
    6093              :      * taking any recovery action has the merit of discarding any 2PC files
    6094              :      * that are newer than the first record to replay, avoiding any conflicts
    6095              :      * at replay.  This also avoids any subsequent scans when doing recovery
    6096              :      * of the on-disk two-phase data.
    6097              :      */
    6098         1070 :     restoreTwoPhaseData();
    6099              : 
    6100              :     /*
    6101              :      * When starting with crash recovery, reset pgstat data - it might not be
    6102              :      * valid. Otherwise restore pgstat data. It's safe to do this here,
    6103              :      * because postmaster will not yet have started any other processes.
    6104              :      *
    6105              :      * NB: Restoring replication slot stats relies on slot state to have
    6106              :      * already been restored from disk.
    6107              :      *
    6108              :      * TODO: With a bit of extra work we could just start with a pgstat file
    6109              :      * associated with the checkpoint redo location we're starting from.
    6110              :      */
    6111         1070 :     if (didCrash)
    6112          186 :         pgstat_discard_stats();
    6113              :     else
    6114          884 :         pgstat_restore_stats();
    6115              : 
    6116         1070 :     lastFullPageWrites = checkPoint.fullPageWrites;
    6117              : 
    6118         1070 :     RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
    6119         1070 :     doPageWrites = lastFullPageWrites;
    6120              : 
    6121              :     /* REDO */
    6122         1070 :     if (InRecovery)
    6123              :     {
    6124              :         /* Initialize state for RecoveryInProgress() */
    6125          226 :         SpinLockAcquire(&XLogCtl->info_lck);
    6126          226 :         if (InArchiveRecovery)
    6127          124 :             XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
    6128              :         else
    6129          102 :             XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
    6130          226 :         SpinLockRelease(&XLogCtl->info_lck);
    6131              : 
    6132              :         /*
    6133              :          * Update pg_control to show that we are recovering and to show the
    6134              :          * selected checkpoint as the place we are starting from. We also mark
    6135              :          * pg_control with any minimum recovery stop point obtained from a
    6136              :          * backup history file.
    6137              :          *
    6138              :          * No need to hold ControlFileLock yet, we aren't up far enough.
    6139              :          */
    6140          226 :         UpdateControlFile();
    6141              : 
    6142              :         /*
    6143              :          * If there was a backup label file, it's done its job and the info
    6144              :          * has now been propagated into pg_control.  We must get rid of the
    6145              :          * label file so that if we crash during recovery, we'll pick up at
    6146              :          * the latest recovery restartpoint instead of going all the way back
    6147              :          * to the backup start point.  It seems prudent though to just rename
    6148              :          * the file out of the way rather than delete it completely.
    6149              :          */
    6150          226 :         if (haveBackupLabel)
    6151              :         {
    6152           82 :             unlink(BACKUP_LABEL_OLD);
    6153           82 :             durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
    6154              :         }
    6155              : 
    6156              :         /*
    6157              :          * If there was a tablespace_map file, it's done its job and the
    6158              :          * symlinks have been created.  We must get rid of the map file so
    6159              :          * that if we crash during recovery, we don't create symlinks again.
    6160              :          * It seems prudent though to just rename the file out of the way
    6161              :          * rather than delete it completely.
    6162              :          */
    6163          226 :         if (haveTblspcMap)
    6164              :         {
    6165            2 :             unlink(TABLESPACE_MAP_OLD);
    6166            2 :             durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
    6167              :         }
    6168              : 
    6169              :         /*
    6170              :          * Initialize our local copy of minRecoveryPoint.  When doing crash
    6171              :          * recovery we want to replay up to the end of WAL.  Particularly, in
    6172              :          * the case of a promoted standby, the minRecoveryPoint value in the
    6173              :          * control file is only updated after the first checkpoint.  However,
    6174              :          * if the instance crashes before the first post-recovery checkpoint
    6175              :          * is completed, then recovery will use a stale location, causing the
    6176              :          * startup process to think that there are still invalid page
    6177              :          * references when checking for data consistency.
    6178              :          */
    6179          226 :         if (InArchiveRecovery)
    6180              :         {
    6181          124 :             LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    6182          124 :             LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    6183              :         }
    6184              :         else
    6185              :         {
    6186          102 :             LocalMinRecoveryPoint = InvalidXLogRecPtr;
    6187          102 :             LocalMinRecoveryPointTLI = 0;
    6188              :         }
    6189              : 
    6190              :         /* Check that the GUCs used to generate the WAL allow recovery */
    6191          226 :         CheckRequiredParameterValues();
    6192              : 
    6193              :         /*
    6194              :          * We're in recovery, so unlogged relations may be trashed and must be
    6195              :          * reset.  This should be done BEFORE allowing Hot Standby
    6196              :          * connections, so that read-only backends don't try to read whatever
    6197              :          * garbage is left over from before.
    6198              :          */
    6199          226 :         ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
    6200              : 
    6201              :         /*
    6202              :          * Likewise, delete any saved transaction snapshot files that got left
    6203              :          * behind by crashed backends.
    6204              :          */
    6205          226 :         DeleteAllExportedSnapshotFiles();
    6206              : 
    6207              :         /*
    6208              :          * Initialize for Hot Standby, if enabled. We won't let backends in
    6209              :          * yet, not until we've reached the min recovery point specified in
    6210              :          * control file and we've established a recovery snapshot from a
    6211              :          * running-xacts WAL record.
    6212              :          */
    6213          226 :         if (ArchiveRecoveryRequested && EnableHotStandby)
    6214              :         {
    6215              :             TransactionId *xids;
    6216              :             int         nxids;
    6217              : 
    6218          116 :             ereport(DEBUG1,
    6219              :                     (errmsg_internal("initializing for hot standby")));
    6220              : 
    6221          116 :             InitRecoveryTransactionEnvironment();
    6222              : 
    6223          116 :             if (wasShutdown)
    6224           26 :                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
    6225              :             else
    6226           90 :                 oldestActiveXID = checkPoint.oldestActiveXid;
    6227              :             Assert(TransactionIdIsValid(oldestActiveXID));
    6228              : 
    6229              :             /* Tell procarray about the range of xids it has to deal with */
    6230          116 :             ProcArrayInitRecovery(XidFromFullTransactionId(TransamVariables->nextXid));
    6231              : 
    6232              :             /*
    6233              :              * Startup subtrans only.  CLOG, MultiXact and commit timestamp
    6234              :              * have already been started up and other SLRUs are not maintained
    6235              :              * during recovery and need not be started yet.
    6236              :              */
    6237          116 :             StartupSUBTRANS(oldestActiveXID);
    6238              : 
    6239              :             /*
    6240              :              * If we're beginning at a shutdown checkpoint, we know that
    6241              :              * nothing was running on the primary at this point. So fake-up an
    6242              :              * empty running-xacts record and use that here and now. Recover
    6243              :              * additional standby state for prepared transactions.
    6244              :              */
    6245          116 :             if (wasShutdown)
    6246              :             {
    6247              :                 RunningTransactionsData running;
    6248              :                 TransactionId latestCompletedXid;
    6249              : 
    6250              :                 /* Update pg_subtrans entries for any prepared transactions */
    6251           26 :                 StandbyRecoverPreparedTransactions();
    6252              : 
    6253              :                 /*
    6254              :                  * Construct a RunningTransactions snapshot representing a
    6255              :                  * shut down server, with only prepared transactions still
    6256              :                  * alive. We're never overflowed at this point because all
    6257              :                  * subxids are listed with their parent prepared transactions.
    6258              :                  */
    6259           26 :                 running.xcnt = nxids;
    6260           26 :                 running.subxcnt = 0;
    6261           26 :                 running.subxid_status = SUBXIDS_IN_SUBTRANS;
    6262           26 :                 running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
    6263           26 :                 running.oldestRunningXid = oldestActiveXID;
    6264           26 :                 latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
    6265           26 :                 TransactionIdRetreat(latestCompletedXid);
    6266              :                 Assert(TransactionIdIsNormal(latestCompletedXid));
    6267           26 :                 running.latestCompletedXid = latestCompletedXid;
    6268           26 :                 running.xids = xids;
    6269              : 
    6270           26 :                 ProcArrayApplyRecoveryInfo(&running);
    6271              :             }
    6272              :         }
    6273              : 
    6274              :         /*
    6275              :          * We're all set for replaying the WAL now. Do it.
    6276              :          */
    6277          226 :         PerformWalRecovery();
    6278          163 :         performedWalRecovery = true;
    6279              :     }
    6280              :     else
    6281          844 :         performedWalRecovery = false;
    6282              : 
    6283              :     /*
    6284              :      * Finish WAL recovery.
    6285              :      */
    6286         1007 :     endOfRecoveryInfo = FinishWalRecovery();
    6287         1007 :     EndOfLog = endOfRecoveryInfo->endOfLog;
    6288         1007 :     EndOfLogTLI = endOfRecoveryInfo->endOfLogTLI;
    6289         1007 :     abortedRecPtr = endOfRecoveryInfo->abortedRecPtr;
    6290         1007 :     missingContrecPtr = endOfRecoveryInfo->missingContrecPtr;
    6291              : 
    6292              :     /*
    6293              :      * Reset ps status display, so that no information related to recovery
    6294              :      * shows up.
    6295              :      */
    6296         1007 :     set_ps_display("");
    6297              : 
    6298              :     /*
    6299              :      * When recovering from a backup (we are in recovery, and archive recovery
    6300              :      * was requested), complain if we did not roll forward far enough to reach
    6301              :      * the point where the database is consistent.  For regular online
    6302              :      * backup-from-primary, that means reaching the end-of-backup WAL record
    6303              :      * (at which point we reset backupStartPoint to be Invalid), for
    6304              :      * backup-from-replica (which can't inject records into the WAL stream),
    6305              :      * that point is when we reach the minRecoveryPoint in pg_control (which
    6306              :      * we purposefully copy last when backing up from a replica).  For
    6307              :      * pg_rewind (which creates a backup_label with a method of "pg_rewind")
    6308              :      * or snapshot-style backups (which don't), backupEndRequired will be set
    6309              :      * to false.
    6310              :      *
    6311              :      * Note: it is indeed okay to look at the local variable
    6312              :      * LocalMinRecoveryPoint here, even though ControlFile->minRecoveryPoint
    6313              :      * might be further ahead --- ControlFile->minRecoveryPoint cannot have
    6314              :      * been advanced beyond the WAL we processed.
    6315              :      */
    6316         1007 :     if (InRecovery &&
    6317          163 :         (EndOfLog < LocalMinRecoveryPoint ||
    6318          163 :          XLogRecPtrIsValid(ControlFile->backupStartPoint)))
    6319              :     {
    6320              :         /*
    6321              :          * Ran off end of WAL before reaching end-of-backup WAL record, or
    6322              :          * minRecoveryPoint. That's a bad sign, indicating that you tried to
    6323              :          * recover from an online backup but never called pg_backup_stop(), or
    6324              :          * you didn't archive all the WAL needed.
    6325              :          */
    6326            0 :         if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
    6327              :         {
    6328            0 :             if (XLogRecPtrIsValid(ControlFile->backupStartPoint) || ControlFile->backupEndRequired)
    6329            0 :                 ereport(FATAL,
    6330              :                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    6331              :                          errmsg("WAL ends before end of online backup"),
    6332              :                          errhint("All WAL generated while online backup was taken must be available at recovery.")));
    6333              :             else
    6334            0 :                 ereport(FATAL,
    6335              :                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    6336              :                          errmsg("WAL ends before consistent recovery point")));
    6337              :         }
    6338              :     }
    6339              : 
    6340              :     /*
    6341              :      * Reset unlogged relations to the contents of their INIT fork. This is
    6342              :      * done AFTER recovery is complete so as to include any unlogged relations
    6343              :      * created during recovery, but BEFORE recovery is marked as having
    6344              :      * completed successfully. Otherwise we'd not retry if any of the post
    6345              :      * end-of-recovery steps fail.
    6346              :      */
    6347         1007 :     if (InRecovery)
    6348          163 :         ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
    6349              : 
    6350              :     /*
    6351              :      * Pre-scan prepared transactions to find out the range of XIDs present.
    6352              :      * This information is not quite needed yet, but it is positioned here so
    6353              :      * that potential problems are detected before any on-disk change is done.
    6354              :      */
    6355         1007 :     oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
    6356              : 
    6357              :     /*
    6358              :      * Allow ordinary WAL segment creation before possibly switching to a new
    6359              :      * timeline, which creates a new segment, and after the last ReadRecord().
    6360              :      */
    6361         1007 :     SetInstallXLogFileSegmentActive();
    6362              : 
    6363              :     /*
    6364              :      * Consider whether we need to assign a new timeline ID.
    6365              :      *
    6366              :      * If we did archive recovery, we always assign a new ID.  This handles a
    6367              :      * couple of issues.  If we stopped short of the end of WAL during
    6368              :      * recovery, then we are clearly generating a new timeline and must assign
    6369              :      * it a unique new ID.  Even if we ran to the end, modifying the current
    6370              :      * last segment is problematic because it may result in trying to
    6371              :      * overwrite an already-archived copy of that segment, and we encourage
    6372              :      * DBAs to make their archive_commands reject that.  We can dodge the
    6373              :      * problem by making the new active segment have a new timeline ID.
    6374              :      *
    6375              :      * In a normal crash recovery, we can just extend the timeline we were in.
    6376              :      */
    6377         1007 :     newTLI = endOfRecoveryInfo->lastRecTLI;
    6378         1007 :     if (ArchiveRecoveryRequested)
    6379              :     {
    6380           55 :         newTLI = findNewestTimeLine(recoveryTargetTLI) + 1;
    6381           55 :         ereport(LOG,
    6382              :                 (errmsg("selected new timeline ID: %u", newTLI)));
    6383              : 
    6384              :         /*
    6385              :          * Make a writable copy of the last WAL segment.  (Note that we also
    6386              :          * have a copy of the last block of the old WAL in
    6387              :          * endOfRecoveryInfo->lastPage; we will use that below.)
    6388              :          */
    6389           55 :         XLogInitNewTimeline(EndOfLogTLI, EndOfLog, newTLI);
    6390              : 
    6391              :         /*
    6392              :          * Remove the signal files out of the way, so that we don't
    6393              :          * accidentally re-enter archive recovery mode in a subsequent crash.
    6394              :          */
    6395           55 :         if (endOfRecoveryInfo->standby_signal_file_found)
    6396           52 :             durable_unlink(STANDBY_SIGNAL_FILE, FATAL);
    6397              : 
    6398           55 :         if (endOfRecoveryInfo->recovery_signal_file_found)
    6399            4 :             durable_unlink(RECOVERY_SIGNAL_FILE, FATAL);
    6400              : 
    6401              :         /*
    6402              :          * Write the timeline history file, and have it archived. After this
    6403              :          * point (or rather, as soon as the file is archived), the timeline
    6404              :          * will appear as "taken" in the WAL archive and to any standby
    6405              :          * servers.  If we crash before actually switching to the new
    6406              :          * timeline, standby servers will nevertheless think that we switched
    6407              :          * to the new timeline, and will try to connect to the new timeline.
    6408              :          * To minimize the window for that, try to do as little as possible
    6409              :          * between here and writing the end-of-recovery record.
    6410              :          */
    6411           55 :         writeTimeLineHistory(newTLI, recoveryTargetTLI,
    6412              :                              EndOfLog, endOfRecoveryInfo->recoveryStopReason);
    6413              : 
    6414           55 :         ereport(LOG,
    6415              :                 (errmsg("archive recovery complete")));
    6416              :     }
    6417              : 
    6418              :     /* Save the selected TimeLineID in shared memory, too */
    6419         1007 :     SpinLockAcquire(&XLogCtl->info_lck);
    6420         1007 :     XLogCtl->InsertTimeLineID = newTLI;
    6421         1007 :     XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI;
    6422         1007 :     SpinLockRelease(&XLogCtl->info_lck);
    6423              : 
    6424              :     /*
    6425              :      * Actually, if WAL ended in an incomplete record, skip the parts that
    6426              :      * made it through and start writing after the portion that persisted.
    6427              :      * (It's critical to first write an OVERWRITE_CONTRECORD message, which
    6428              :      * we'll do as soon as we're open for writing new WAL.)
    6429              :      */
    6430         1007 :     if (XLogRecPtrIsValid(missingContrecPtr))
    6431              :     {
    6432              :         /*
    6433              :          * We should only have a missingContrecPtr if we're not switching to a
    6434              :          * new timeline. When a timeline switch occurs, WAL is copied from the
    6435              :          * old timeline to the new only up to the end of the last complete
    6436              :          * record, so there can't be an incomplete WAL record that we need to
    6437              :          * disregard.
    6438              :          */
    6439              :         Assert(newTLI == endOfRecoveryInfo->lastRecTLI);
    6440              :         Assert(XLogRecPtrIsValid(abortedRecPtr));
    6441           10 :         EndOfLog = missingContrecPtr;
    6442              :     }
    6443              : 
    6444              :     /*
    6445              :      * Prepare to write WAL starting at EndOfLog location, and init xlog
    6446              :      * buffer cache using the block containing the last record from the
    6447              :      * previous incarnation.
    6448              :      */
    6449         1007 :     Insert = &XLogCtl->Insert;
    6450         1007 :     Insert->PrevBytePos = XLogRecPtrToBytePos(endOfRecoveryInfo->lastRec);
    6451         1007 :     Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
    6452              : 
    6453              :     /*
    6454              :      * Tricky point here: lastPage contains the *last* block that the LastRec
    6455              :      * record spans, not the one it starts in.  The last block is indeed the
    6456              :      * one we want to use.
    6457              :      */
    6458         1007 :     if (EndOfLog % XLOG_BLCKSZ != 0)
    6459              :     {
    6460              :         char       *page;
    6461              :         int         len;
    6462              :         int         firstIdx;
    6463              : 
    6464          975 :         firstIdx = XLogRecPtrToBufIdx(EndOfLog);
    6465          975 :         len = EndOfLog - endOfRecoveryInfo->lastPageBeginPtr;
    6466              :         Assert(len < XLOG_BLCKSZ);
    6467              : 
    6468              :         /* Copy the valid part of the last block, and zero the rest */
    6469          975 :         page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
    6470          975 :         memcpy(page, endOfRecoveryInfo->lastPage, len);
    6471          975 :         memset(page + len, 0, XLOG_BLCKSZ - len);
    6472              : 
    6473          975 :         pg_atomic_write_u64(&XLogCtl->xlblocks[firstIdx], endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ);
    6474          975 :         XLogCtl->InitializedUpTo = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ;
    6475              :     }
    6476              :     else
    6477              :     {
    6478              :         /*
    6479              :          * There is no partial block to copy. Just set InitializedUpTo, and
    6480              :          * let the first attempt to insert a log record to initialize the next
    6481              :          * buffer.
    6482              :          */
    6483           32 :         XLogCtl->InitializedUpTo = EndOfLog;
    6484              :     }
    6485              : 
    6486              :     /*
    6487              :      * Update local and shared status.  This is OK to do without any locks
    6488              :      * because no other process can be reading or writing WAL yet.
    6489              :      */
    6490         1007 :     LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
    6491         1007 :     pg_atomic_write_u64(&XLogCtl->logInsertResult, EndOfLog);
    6492         1007 :     pg_atomic_write_u64(&XLogCtl->logWriteResult, EndOfLog);
    6493         1007 :     pg_atomic_write_u64(&XLogCtl->logFlushResult, EndOfLog);
    6494         1007 :     XLogCtl->LogwrtRqst.Write = EndOfLog;
    6495         1007 :     XLogCtl->LogwrtRqst.Flush = EndOfLog;
    6496              : 
    6497              :     /*
    6498              :      * Preallocate additional log files, if wanted.
    6499              :      */
    6500         1007 :     PreallocXlogFiles(EndOfLog, newTLI);
    6501              : 
    6502              :     /*
    6503              :      * Okay, we're officially UP.
    6504              :      */
    6505         1007 :     InRecovery = false;
    6506              : 
    6507              :     /* start the archive_timeout timer and LSN running */
    6508         1007 :     XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
    6509         1007 :     XLogCtl->lastSegSwitchLSN = EndOfLog;
    6510              : 
    6511              :     /* also initialize latestCompletedXid, to nextXid - 1 */
    6512         1007 :     LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
    6513         1007 :     TransamVariables->latestCompletedXid = TransamVariables->nextXid;
    6514         1007 :     FullTransactionIdRetreat(&TransamVariables->latestCompletedXid);
    6515         1007 :     LWLockRelease(ProcArrayLock);
    6516              : 
    6517              :     /*
    6518              :      * Start up subtrans, if not already done for hot standby.  (commit
    6519              :      * timestamps are started below, if necessary.)
    6520              :      */
    6521         1007 :     if (standbyState == STANDBY_DISABLED)
    6522          952 :         StartupSUBTRANS(oldestActiveXID);
    6523              : 
    6524              :     /*
    6525              :      * Perform end of recovery actions for any SLRUs that need it.
    6526              :      */
    6527         1007 :     TrimCLOG();
    6528         1007 :     TrimMultiXact();
    6529              : 
    6530              :     /*
    6531              :      * Reload shared-memory state for prepared transactions.  This needs to
    6532              :      * happen before renaming the last partial segment of the old timeline as
    6533              :      * it may be possible that we have to recover some transactions from it.
    6534              :      */
    6535         1007 :     RecoverPreparedTransactions();
    6536              : 
    6537              :     /* Shut down xlogreader */
    6538         1007 :     ShutdownWalRecovery();
    6539              : 
    6540              :     /* Enable WAL writes for this backend only. */
    6541         1007 :     LocalSetXLogInsertAllowed();
    6542              : 
    6543              :     /* If necessary, write overwrite-contrecord before doing anything else */
    6544         1007 :     if (XLogRecPtrIsValid(abortedRecPtr))
    6545              :     {
    6546              :         Assert(XLogRecPtrIsValid(missingContrecPtr));
    6547           10 :         CreateOverwriteContrecordRecord(abortedRecPtr, missingContrecPtr, newTLI);
    6548              :     }
    6549              : 
    6550              :     /*
    6551              :      * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
    6552              :      * record before resource manager writes cleanup WAL records or checkpoint
    6553              :      * record is written.
    6554              :      */
    6555         1007 :     Insert->fullPageWrites = lastFullPageWrites;
    6556         1007 :     UpdateFullPageWrites();
    6557              : 
    6558              :     /*
    6559              :      * Emit checkpoint or end-of-recovery record in XLOG, if required.
    6560              :      */
    6561         1007 :     if (performedWalRecovery)
    6562          163 :         promoted = PerformRecoveryXLogAction();
    6563              : 
    6564              :     /*
    6565              :      * If any of the critical GUCs have changed, log them before we allow
    6566              :      * backends to write WAL.
    6567              :      */
    6568         1007 :     XLogReportParameters();
    6569              : 
    6570              :     /* If this is archive recovery, perform post-recovery cleanup actions. */
    6571         1007 :     if (ArchiveRecoveryRequested)
    6572           55 :         CleanupAfterArchiveRecovery(EndOfLogTLI, EndOfLog, newTLI);
    6573              : 
    6574              :     /*
    6575              :      * Local WAL inserts enabled, so it's time to finish initialization of
    6576              :      * commit timestamp.
    6577              :      */
    6578         1007 :     CompleteCommitTsInitialization();
    6579              : 
    6580              :     /*
    6581              :      * Update logical decoding status in shared memory and write an
    6582              :      * XLOG_LOGICAL_DECODING_STATUS_CHANGE, if necessary.
    6583              :      */
    6584         1007 :     UpdateLogicalDecodingStatusEndOfRecovery();
    6585              : 
    6586              :     /* Clean up EndOfWalRecoveryInfo data to appease Valgrind leak checking */
    6587         1007 :     if (endOfRecoveryInfo->lastPage)
    6588          985 :         pfree(endOfRecoveryInfo->lastPage);
    6589         1007 :     pfree(endOfRecoveryInfo->recoveryStopReason);
    6590         1007 :     pfree(endOfRecoveryInfo);
    6591              : 
    6592              :     /*
    6593              :      * If we reach this point with checksums in the state inprogress-on, it
    6594              :      * means that data checksums were in the process of being enabled when the
    6595              :      * cluster shut down. Since processing didn't finish, the operation will
    6596              :      * have to be restarted from scratch since there is no capability to
    6597              :      * continue where it was when the cluster shut down. Thus, revert the
    6598              :      * state back to off, and inform the user with a warning message. Being
    6599              :      * able to restart processing is a TODO, but it wouldn't be possible to
    6600              :      * restart here since we cannot launch a dynamic background worker
    6601              :      * directly from here (it has to be from a regular backend).
    6602              :      */
    6603         1007 :     if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_ON)
    6604              :     {
    6605            0 :         XLogChecksums(PG_DATA_CHECKSUM_OFF);
    6606              : 
    6607            0 :         SpinLockAcquire(&XLogCtl->info_lck);
    6608            0 :         XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_OFF;
    6609            0 :         SetLocalDataChecksumState(XLogCtl->data_checksum_version);
    6610            0 :         SpinLockRelease(&XLogCtl->info_lck);
    6611              : 
    6612            0 :         ereport(WARNING,
    6613              :                 errmsg("enabling data checksums was interrupted"),
    6614              :                 errhint("Data checksum processing must be manually restarted for checksums to be enabled"));
    6615              :     }
    6616              : 
    6617              :     /*
    6618              :      * If data checksums were being disabled when the cluster was shut down,
    6619              :      * we know that we have a state where all backends have stopped validating
    6620              :      * checksums and we can move to off instead of prompting the user to
    6621              :      * perform any action.
    6622              :      */
    6623         1007 :     if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_OFF)
    6624              :     {
    6625            0 :         XLogChecksums(PG_DATA_CHECKSUM_OFF);
    6626              : 
    6627            0 :         SpinLockAcquire(&XLogCtl->info_lck);
    6628            0 :         XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_OFF;
    6629            0 :         SetLocalDataChecksumState(XLogCtl->data_checksum_version);
    6630            0 :         SpinLockRelease(&XLogCtl->info_lck);
    6631              :     }
    6632              : 
    6633              :     /*
    6634              :      * All done with end-of-recovery actions.
    6635              :      *
    6636              :      * Now allow backends to write WAL and update the control file status in
    6637              :      * consequence.  SharedRecoveryState, that controls if backends can write
    6638              :      * WAL, is updated while holding ControlFileLock to prevent other backends
    6639              :      * from looking at an inconsistent state of the control file in shared memory.
    6640              :      * There is still a small window during which backends can write WAL and
    6641              :      * the control file is still referring to a system not in DB_IN_PRODUCTION
    6642              :      * state while looking at the on-disk control file.
    6643              :      *
    6644              :      * Also, we use info_lck to update SharedRecoveryState to ensure that
    6645              :      * there are no race conditions concerning visibility of other recent
    6646              :      * updates to shared memory.
    6647              :      */
    6648         1007 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    6649         1007 :     ControlFile->state = DB_IN_PRODUCTION;
    6650              : 
    6651         1007 :     SpinLockAcquire(&XLogCtl->info_lck);
    6652         1007 :     XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
    6653         1007 :     SpinLockRelease(&XLogCtl->info_lck);
    6654              : 
    6655         1007 :     UpdateControlFile();
    6656         1007 :     LWLockRelease(ControlFileLock);
    6657              : 
    6658              :     /*
    6659              :      * Wake up the checkpointer process as there might be a request to disable
    6660              :      * logical decoding by concurrent slot drop.
    6661              :      */
    6662         1007 :     WakeupCheckpointer();
    6663              : 
    6664              :     /*
    6665              :      * Wake up all waiters.  They need to report an error that recovery was
    6666              :      * ended before reaching the target LSN.
    6667              :      */
    6668         1007 :     WaitLSNWakeup(WAIT_LSN_TYPE_STANDBY_REPLAY, InvalidXLogRecPtr);
    6669         1007 :     WaitLSNWakeup(WAIT_LSN_TYPE_STANDBY_WRITE, InvalidXLogRecPtr);
    6670         1007 :     WaitLSNWakeup(WAIT_LSN_TYPE_STANDBY_FLUSH, InvalidXLogRecPtr);
    6671              : 
    6672              :     /*
    6673              :      * Shutdown the recovery environment.  This must occur after
    6674              :      * RecoverPreparedTransactions() (see notes in lock_twophase_recover())
    6675              :      * and after switching SharedRecoveryState to RECOVERY_STATE_DONE so that
    6676              :      * any session building a snapshot will not rely on KnownAssignedXids as
    6677              :      * RecoveryInProgress() would return false at this stage.  This is
    6678              :      * particularly critical for prepared 2PC transactions, that would still
    6679              :      * need to be included in snapshots once recovery has ended.
    6680              :      */
    6681         1007 :     if (standbyState != STANDBY_DISABLED)
    6682           55 :         ShutdownRecoveryTransactionEnvironment();
    6683              : 
    6684              :     /*
    6685              :      * If there were cascading standby servers connected to us, nudge any wal
    6686              :      * sender processes to notice that we've been promoted.
    6687              :      */
    6688         1007 :     WalSndWakeup(true, true);
    6689              : 
    6690              :     /*
    6691              :      * If this was a promotion, request an (online) checkpoint now. This isn't
    6692              :      * required for consistency, but the last restartpoint might be far back,
    6693              :      * and in case of a crash, recovering from it might take longer than is
    6694              :      * appropriate now that we're not in standby mode anymore.
    6695              :      */
    6696         1007 :     if (promoted)
    6697           48 :         RequestCheckpoint(CHECKPOINT_FORCE);
    6698         1007 : }
    6699              : 
    6700              : /*
    6701              :  * Callback from PerformWalRecovery(), called when we switch from crash
    6702              :  * recovery to archive recovery mode.  Updates the control file accordingly.
                      :  *
                      :  * EndRecPtr is the WAL position that ControlFile->minRecoveryPoint is
                      :  * raised to when its current value is lower; in that case replayTLI is
                      :  * stored as the matching minRecoveryPointTLI.
                      :  *
                      :  * While holding ControlFileLock exclusively, this sets the control file
                      :  * state to DB_IN_ARCHIVE_RECOVERY, refreshes the local copies of the minimum
                      :  * recovery point, calls UpdateControlFile(), and finally sets
                      :  * SharedRecoveryState to RECOVERY_STATE_ARCHIVE under info_lck.
    6703              :  */
    6704              : void
    6705            1 : SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI)
    6706              : {
    6707              :     /* initialize minRecoveryPoint to this record */
    6708            1 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    6709            1 :     ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
    6710            1 :     if (ControlFile->minRecoveryPoint < EndRecPtr)
    6711              :     {
    6712            1 :         ControlFile->minRecoveryPoint = EndRecPtr;
    6713            1 :         ControlFile->minRecoveryPointTLI = replayTLI;
    6714              :     }
    6715              :     /* update local copy */
    6716            1 :     LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    6717            1 :     LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    6718              : 
    6719              :     /*
    6720              :      * The startup process can update its local copy of minRecoveryPoint from
    6721              :      * this point.
    6722              :      */
    6723            1 :     updateMinRecoveryPoint = true;
    6724              : 
    6725            1 :     UpdateControlFile();
    6726              : 
    6727              :     /*
    6728              :      * We update SharedRecoveryState while holding the lock on ControlFileLock
    6729              :      * so both states are consistent in shared memory.
    6730              :      */
    6731            1 :     SpinLockAcquire(&XLogCtl->info_lck);
    6732            1 :     XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
    6733            1 :     SpinLockRelease(&XLogCtl->info_lck);
    6734              : 
    6735            1 :     LWLockRelease(ControlFileLock);
    6736            1 : }
    6737              : 
    6738              : /*
    6739              :  * Callback from PerformWalRecovery(), called when we reach the end of backup.
    6740              :  * Updates the control file accordingly.
                      :  *
                      :  * EndRecPtr is the WAL position that ControlFile->minRecoveryPoint is
                      :  * raised to when its current value is lower; in that case tli is stored as
                      :  * the matching minRecoveryPointTLI.
                      :  *
                      :  * backupStartPoint, backupEndPoint and backupEndRequired are always reset,
                      :  * and UpdateControlFile() is called, all under an exclusive ControlFileLock.
    6741              :  */
    6742              : void
    6743           82 : ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli)
    6744              : {
    6745              :     /*
    6746              :      * We have reached the end of base backup, as indicated by pg_control. The
    6747              :      * data on disk is now consistent (unless minRecoveryPoint is further
    6748              :      * ahead, which can happen if we crashed during previous recovery).  Reset
    6749              :      * backupStartPoint and backupEndPoint, and update minRecoveryPoint to
    6750              :      * make sure we don't allow starting up at an earlier point even if
    6751              :      * recovery is stopped and restarted soon after this.
    6752              :      */
    6753           82 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    6754              : 
    6755           82 :     if (ControlFile->minRecoveryPoint < EndRecPtr)
    6756              :     {
    6757           77 :         ControlFile->minRecoveryPoint = EndRecPtr;
    6758           77 :         ControlFile->minRecoveryPointTLI = tli;
    6759              :     }
    6760              : 
    6761           82 :     ControlFile->backupStartPoint = InvalidXLogRecPtr;
    6762           82 :     ControlFile->backupEndPoint = InvalidXLogRecPtr;
    6763           82 :     ControlFile->backupEndRequired = false;
    6764           82 :     UpdateControlFile();
    6765              : 
    6766           82 :     LWLockRelease(ControlFileLock);
    6767           82 : }
    6768              : 
    6769              : /*
    6770              :  * Perform whatever XLOG actions are necessary at end of REDO.
    6771              :  *
    6772              :  * The goal here is to make sure that we'll be able to recover properly if
    6773              :  * we crash again. If we choose to write a checkpoint, we'll write a shutdown
    6774              :  * checkpoint rather than an on-line one. This is not particularly critical,
    6775              :  * but since we may be assigning a new TLI, using a shutdown checkpoint allows
    6776              :  * us to have the rule that TLI only changes in shutdown checkpoints, which
    6777              :  * allows some extra error checking in xlog_redo.
                      :  *
                      :  * Returns true if this was handled as a promotion, i.e. archive recovery was
                      :  * requested, we are running under the postmaster and PromoteIsTriggered()
                      :  * reported true; in that case only an end-of-recovery record is created.
                      :  * Returns false otherwise, after requesting an end-of-recovery checkpoint
                      :  * with the CHECKPOINT_FAST and CHECKPOINT_WAIT flags.
    6778              :  */
    6779              : static bool
    6780          163 : PerformRecoveryXLogAction(void)
    6781              : {
    6782          163 :     bool        promoted = false;
    6783              : 
    6784              :     /*
    6785              :      * Perform a checkpoint to update all our recovery activity to disk.
    6786              :      *
    6787              :      * Note that we write a shutdown checkpoint rather than an on-line one.
    6788              :      * This is not particularly critical, but since we may be assigning a new
    6789              :      * TLI, using a shutdown checkpoint allows us to have the rule that TLI
    6790              :      * only changes in shutdown checkpoints, which allows some extra error
    6791              :      * checking in xlog_redo.
    6792              :      *
    6793              :      * In promotion, only create a lightweight end-of-recovery record instead
    6794              :      * of a full checkpoint. A checkpoint is requested later, after we're
    6795              :      * fully out of recovery mode and already accepting queries.
    6796              :      */
    6797          218 :     if (ArchiveRecoveryRequested && IsUnderPostmaster &&
    6798           55 :         PromoteIsTriggered())
    6799              :     {
    6800           48 :         promoted = true;
    6801              : 
    6802              :         /*
    6803              :          * Insert a special WAL record to mark the end of recovery, since we
    6804              :          * aren't doing a checkpoint. That means that the checkpointer process
    6805              :          * may likely be in the middle of a time-smoothed restartpoint and
    6806              :          * could continue to be for minutes after this.  That sounds strange,
    6807              :          * but the effect is roughly the same and it would be stranger to try
    6808              :          * to come out of the restartpoint and then checkpoint. We request a
    6809              :          * checkpoint later anyway, just for safety.
    6810              :          */
    6811           48 :         CreateEndOfRecoveryRecord();
    6812              :     }
    6813              :     else
    6814              :     {
    6815          115 :         RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
    6816              :                           CHECKPOINT_FAST |
    6817              :                           CHECKPOINT_WAIT);
    6818              :     }
    6819              : 
    6820          163 :     return promoted;
    6821              : }
    6822              : 
    6823              : /*
    6824              :  * Is the system still in recovery?
    6825              :  *
    6826              :  * Unlike testing InRecovery, this works in any process that's connected to
    6827              :  * shared memory.
                      :  *
                      :  * Returns true while XLogCtl->SharedRecoveryState is anything other than
                      :  * RECOVERY_STATE_DONE.  The answer is cached in LocalRecoveryInProgress:
                      :  * once that has been seen false, shared memory is not consulted again and
                      :  * false is returned directly.
    6828              :  */
    6829              : bool
    6830     96982476 : RecoveryInProgress(void)
    6831              : {
    6832              :     /*
    6833              :      * We check shared state each time only until we leave recovery mode. We
    6834              :      * can't re-enter recovery, so there's no need to keep checking after the
    6835              :      * shared variable has once been seen false.
    6836              :      */
    6837     96982476 :     if (!LocalRecoveryInProgress)
    6838     94930848 :         return false;
    6839              :     else
    6840              :     {
    6841              :         /*
    6842              :          * Use a volatile pointer to make sure we make a fresh read of the
    6843              :          * shared variable.
    6844              :          */
    6845      2051628 :         volatile XLogCtlData *xlogctl = XLogCtl;
    6846              : 
    6847      2051628 :         LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);
    6848              : 
    6849              :         /*
    6850              :          * Note: We don't need a memory barrier when we're still in recovery.
    6851              :          * We might exit recovery immediately after return, so the caller
    6852              :          * can't rely on 'true' meaning that we're still in recovery anyway.
    6853              :          */
    6854              : 
    6855      2051628 :         return LocalRecoveryInProgress;
    6856              :     }
    6857              : }
    6858              : 
    6859              : /*
    6860              :  * Returns current recovery state from shared memory.
    6861              :  *
    6862              :  * This returned state is kept consistent with the contents of the control
    6863              :  * file.  See details about the possible values of RecoveryState in xlog.h.
                      :  *
                      :  * XLogCtl->SharedRecoveryState is read while holding info_lck, and the value
                      :  * read is returned unchanged.
    6864              :  */
    6865              : RecoveryState
    6866        10164 : GetRecoveryState(void)
    6867              : {
    6868              :     RecoveryState retval;
    6869              : 
    6870        10164 :     SpinLockAcquire(&XLogCtl->info_lck);
    6871        10164 :     retval = XLogCtl->SharedRecoveryState;
    6872        10164 :     SpinLockRelease(&XLogCtl->info_lck);
    6873              : 
    6874        10164 :     return retval;
    6875              : }
    6876              : 
    6877              : /*
    6878              :  * Is this process allowed to insert new WAL records?
    6879              :  *
    6880              :  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
    6881              :  * But we also have provisions for forcing the result "true" or "false"
    6882              :  * within specific processes regardless of the global state.
    6883              :  */
    6884              : bool
    6885     65985211 : XLogInsertAllowed(void)
    6886              : {
    6887              :     /*
    6888              :      * If value is "unconditionally true" or "unconditionally false", just
    6889              :      * return it.  This provides the normal fast path once recovery is known
    6890              :      * done.
    6891              :      */
    6892     65985211 :     if (LocalXLogInsertAllowed >= 0)
    6893     65326632 :         return (bool) LocalXLogInsertAllowed;
    6894              : 
    6895              :     /*
    6896              :      * Else, must check to see if we're still in recovery.
    6897              :      */
    6898       658579 :     if (RecoveryInProgress())
    6899       648112 :         return false;
    6900              : 
    6901              :     /*
    6902              :      * On exit from recovery, reset to "unconditionally true", since there is
    6903              :      * no need to keep checking.
    6904              :      */
    6905        10467 :     LocalXLogInsertAllowed = 1;
    6906        10467 :     return true;
    6907              : }
    6908              : 
    6909              : /*
    6910              :  * Make XLogInsertAllowed() return true in the current process only.
    6911              :  *
    6912              :  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
    6913              :  * and even call LocalSetXLogInsertAllowed() again after that.
    6914              :  *
    6915              :  * Returns the previous value of LocalXLogInsertAllowed.
    6916              :  */
    6917              : static int
    6918         1037 : LocalSetXLogInsertAllowed(void)
    6919              : {
    6920         1037 :     int         oldXLogAllowed = LocalXLogInsertAllowed;
    6921              : 
    6922         1037 :     LocalXLogInsertAllowed = 1;
    6923              : 
    6924         1037 :     return oldXLogAllowed;
    6925              : }
    6926              : 
    6927              : /*
    6928              :  * Return the current Redo pointer from shared memory.
    6929              :  *
    6930              :  * As a side-effect, the local RedoRecPtr copy is updated.
    6931              :  */
    6932              : XLogRecPtr
    6933       383179 : GetRedoRecPtr(void)
    6934              : {
    6935              :     XLogRecPtr  ptr;
    6936              : 
    6937              :     /*
    6938              :      * The possibly not up-to-date copy in XLogCtl is enough. Even if we
    6939              :      * grabbed a WAL insertion lock to read the authoritative value in
    6940              :      * Insert->RedoRecPtr, someone might update it just after we've released
    6941              :      * the lock.
    6942              :      */
    6943       383179 :     SpinLockAcquire(&XLogCtl->info_lck);
    6944       383179 :     ptr = XLogCtl->RedoRecPtr;
    6945       383179 :     SpinLockRelease(&XLogCtl->info_lck);
    6946              : 
    6947       383179 :     if (RedoRecPtr < ptr)
    6948         1745 :         RedoRecPtr = ptr;
    6949              : 
    6950       383179 :     return RedoRecPtr;
    6951              : }
    6952              : 
    6953              : /*
    6954              :  * Return information needed to decide whether a modified block needs a
    6955              :  * full-page image to be included in the WAL record.
    6956              :  *
    6957              :  * The returned values are cached copies from backend-private memory, and
    6958              :  * possibly out-of-date or, indeed, uninitialized, in which case they will
    6959              :  * be InvalidXLogRecPtr and false, respectively.  XLogInsertRecord will
    6960              :  * re-check them against up-to-date values, while holding the WAL insert lock.
    6961              :  */
    6962              : void
    6963     24314649 : GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
    6964              : {
    6965     24314649 :     *RedoRecPtr_p = RedoRecPtr;
    6966     24314649 :     *doPageWrites_p = doPageWrites;
    6967     24314649 : }
    6968              : 
    6969              : /*
    6970              :  * GetInsertRecPtr -- Returns the current insert position.
    6971              :  *
    6972              :  * NOTE: The value *actually* returned is the position of the last full
    6973              :  * xlog page. It lags behind the real insert position by at most 1 page.
    6974              :  * For that, we don't need to scan through WAL insertion locks, and an
    6975              :  * approximation is enough for the current usage of this function.
    6976              :  */
    6977              : XLogRecPtr
    6978         7263 : GetInsertRecPtr(void)
    6979              : {
    6980              :     XLogRecPtr  recptr;
    6981              : 
    6982         7263 :     SpinLockAcquire(&XLogCtl->info_lck);
    6983         7263 :     recptr = XLogCtl->LogwrtRqst.Write;
    6984         7263 :     SpinLockRelease(&XLogCtl->info_lck);
    6985              : 
    6986         7263 :     return recptr;
    6987              : }
    6988              : 
    6989              : /*
    6990              :  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
    6991              :  * position known to be fsync'd to disk. This should only be used on a
    6992              :  * system that is known not to be in recovery.
    6993              :  */
    6994              : XLogRecPtr
    6995       215850 : GetFlushRecPtr(TimeLineID *insertTLI)
    6996              : {
    6997              :     Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE);
    6998              : 
    6999       215850 :     RefreshXLogWriteResult(LogwrtResult);
    7000              : 
    7001              :     /*
    7002              :      * If we're writing and flushing WAL, the time line can't be changing, so
    7003              :      * no lock is required.
    7004              :      */
    7005       215850 :     if (insertTLI)
    7006        24561 :         *insertTLI = XLogCtl->InsertTimeLineID;
    7007              : 
    7008       215850 :     return LogwrtResult.Flush;
    7009              : }
    7010              : 
    7011              : /*
    7012              :  * GetWALInsertionTimeLine -- Returns the current timeline of a system that
    7013              :  * is not in recovery.
    7014              :  */
    7015              : TimeLineID
    7016       118516 : GetWALInsertionTimeLine(void)
    7017              : {
    7018              :     Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE);
    7019              : 
    7020              :     /* Since the value can't be changing, no lock is required. */
    7021       118516 :     return XLogCtl->InsertTimeLineID;
    7022              : }
    7023              : 
    7024              : /*
    7025              :  * GetWALInsertionTimeLineIfSet -- If the system is not in recovery, returns
    7026              :  * the WAL insertion timeline; else, returns 0. Wherever possible, use
    7027              :  * GetWALInsertionTimeLine() instead, since it's cheaper. Note that this
    7028              :  * function decides recovery has ended as soon as the insert TLI is set, which
    7029              :  * happens before we set XLogCtl->SharedRecoveryState to RECOVERY_STATE_DONE.
    7030              :  */
    7031              : TimeLineID
    7032            1 : GetWALInsertionTimeLineIfSet(void)
    7033              : {
    7034              :     TimeLineID  insertTLI;
    7035              : 
    7036            1 :     SpinLockAcquire(&XLogCtl->info_lck);
    7037            1 :     insertTLI = XLogCtl->InsertTimeLineID;
    7038            1 :     SpinLockRelease(&XLogCtl->info_lck);
    7039              : 
    7040            1 :     return insertTLI;
    7041              : }
    7042              : 
    7043              : /*
    7044              :  * GetLastImportantRecPtr -- Returns the LSN of the last important record
    7045              :  * inserted. All records not explicitly marked as unimportant are considered
    7046              :  * important.
    7047              :  *
    7048              :  * The LSN is determined by computing the maximum of
    7049              :  * WALInsertLocks[i].lastImportantAt.
    7050              :  */
    7051              : XLogRecPtr
    7052         1774 : GetLastImportantRecPtr(void)
    7053              : {
    7054         1774 :     XLogRecPtr  res = InvalidXLogRecPtr;
    7055              :     int         i;
    7056              : 
    7057        15966 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    7058              :     {
    7059              :         XLogRecPtr  last_important;
    7060              : 
    7061              :         /*
    7062              :          * Need to take a lock to prevent torn reads of the LSN, which are
    7063              :          * possible on some of the supported platforms. WAL insert locks only
    7064              :          * support exclusive mode, so we have to use that.
    7065              :          */
    7066        14192 :         LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
    7067        14192 :         last_important = WALInsertLocks[i].l.lastImportantAt;
    7068        14192 :         LWLockRelease(&WALInsertLocks[i].l.lock);
    7069              : 
    7070        14192 :         if (res < last_important)
    7071         2994 :             res = last_important;
    7072              :     }
    7073              : 
    7074         1774 :     return res;
    7075              : }
    7076              : 
    7077              : /*
    7078              :  * Get the time and LSN of the last xlog segment switch
    7079              :  */
    7080              : pg_time_t
    7081            0 : GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
    7082              : {
    7083              :     pg_time_t   result;
    7084              : 
    7085              :     /* Need WALWriteLock, but shared lock is sufficient */
    7086            0 :     LWLockAcquire(WALWriteLock, LW_SHARED);
    7087            0 :     result = XLogCtl->lastSegSwitchTime;
    7088            0 :     *lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
    7089            0 :     LWLockRelease(WALWriteLock);
    7090              : 
    7091            0 :     return result;
    7092              : }
    7093              : 
    7094              : /*
    7095              :  * This must be called ONCE during postmaster or standalone-backend shutdown
    7096              :  */
    7097              : void
    7098          759 : ShutdownXLOG(int code, Datum arg)
    7099              : {
    7100              :     /*
    7101              :      * We should have an aux process resource owner to use, and we should not
    7102              :      * be in a transaction that's installed some other resowner.
    7103              :      */
    7104              :     Assert(AuxProcessResourceOwner != NULL);
    7105              :     Assert(CurrentResourceOwner == NULL ||
    7106              :            CurrentResourceOwner == AuxProcessResourceOwner);
    7107          759 :     CurrentResourceOwner = AuxProcessResourceOwner;
    7108              : 
    7109              :     /* Don't be chatty in standalone mode */
    7110          759 :     ereport(IsPostmasterEnvironment ? LOG : NOTICE,
    7111              :             (errmsg("shutting down")));
    7112              : 
    7113              :     /*
    7114              :      * Signal walsenders to move to stopping state.
    7115              :      */
    7116          759 :     WalSndInitStopping();
    7117              : 
    7118              :     /*
    7119              :      * Wait for WAL senders to be in stopping state.  This prevents commands
    7120              :      * from writing new WAL.
    7121              :      */
    7122          759 :     WalSndWaitStopping();
    7123              : 
    7124          759 :     if (RecoveryInProgress())
    7125           60 :         CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FAST);
    7126              :     else
    7127              :     {
    7128              :         /*
    7129              :          * If archiving is enabled, rotate the last XLOG file so that all the
    7130              :          * remaining records are archived (postmaster wakes up the archiver
    7131              :          * process one more time at the end of shutdown). The checkpoint
    7132              :          * record will go to the next XLOG file and won't be archived (yet).
    7133              :          */
    7134          699 :         if (XLogArchivingActive())
    7135           16 :             RequestXLogSwitch(false);
    7136              : 
    7137          699 :         CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FAST);
    7138              :     }
    7139          759 : }
    7140              : 
    7141              : /*
    7142              :  * Format checkpoint request flags as a space-separated string for
    7143              :  * log messages.
    7144              :  */
    7145              : static const char *
    7146         3194 : CheckpointFlagsString(int flags)
    7147              : {
    7148              :     static char buf[128];
    7149              : 
    7150        25552 :     snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s",
    7151         3194 :              (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
    7152         3194 :              (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
    7153         3194 :              (flags & CHECKPOINT_FAST) ? " fast" : "",
    7154         3194 :              (flags & CHECKPOINT_FORCE) ? " force" : "",
    7155         3194 :              (flags & CHECKPOINT_WAIT) ? " wait" : "",
    7156         3194 :              (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
    7157         3194 :              (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
    7158         3194 :              (flags & CHECKPOINT_FLUSH_UNLOGGED) ? " flush-unlogged" : "");
    7159              : 
    7160         3194 :     return buf;
    7161              : }
    7162              : 
    7163              : /*
    7164              :  * Log start of a checkpoint.
    7165              :  */
    7166              : static void
    7167         1597 : LogCheckpointStart(int flags, bool restartpoint)
    7168              : {
    7169         1597 :     if (restartpoint)
    7170          210 :         ereport(LOG,
    7171              :         /* translator: the placeholder shows checkpoint options */
    7172              :                 (errmsg("restartpoint starting:%s",
    7173              :                         CheckpointFlagsString(flags))));
    7174              :     else
    7175         1387 :         ereport(LOG,
    7176              :         /* translator: the placeholder shows checkpoint options */
    7177              :                 (errmsg("checkpoint starting:%s",
    7178              :                         CheckpointFlagsString(flags))));
    7179         1597 : }
    7180              : 
    7181              : /*
    7182              :  * Log end of a checkpoint.
    7183              :  */
    7184              : static void
    7185         1931 : LogCheckpointEnd(bool restartpoint, int flags)
    7186              : {
    7187              :     long        write_msecs,
    7188              :                 sync_msecs,
    7189              :                 total_msecs,
    7190              :                 longest_msecs,
    7191              :                 average_msecs;
    7192              :     uint64      average_sync_time;
    7193              : 
    7194         1931 :     CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
    7195              : 
    7196         1931 :     write_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_write_t,
    7197              :                                                   CheckpointStats.ckpt_sync_t);
    7198              : 
    7199         1931 :     sync_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_sync_t,
    7200              :                                                  CheckpointStats.ckpt_sync_end_t);
    7201              : 
    7202              :     /* Accumulate checkpoint timing summary data, in milliseconds. */
    7203         1931 :     PendingCheckpointerStats.write_time += write_msecs;
    7204         1931 :     PendingCheckpointerStats.sync_time += sync_msecs;
    7205              : 
    7206              :     /*
    7207              :      * All of the published timing statistics are accounted for.  Only
    7208              :      * continue if a log message is to be written.
    7209              :      */
    7210         1931 :     if (!log_checkpoints)
    7211          334 :         return;
    7212              : 
    7213         1597 :     total_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_start_t,
    7214              :                                                   CheckpointStats.ckpt_end_t);
    7215              : 
    7216              :     /*
    7217              :      * Timing values returned from CheckpointStats are in microseconds.
    7218              :      * Convert to milliseconds for consistent printing.
    7219              :      */
    7220         1597 :     longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000);
    7221              : 
    7222         1597 :     average_sync_time = 0;
    7223         1597 :     if (CheckpointStats.ckpt_sync_rels > 0)
    7224            0 :         average_sync_time = CheckpointStats.ckpt_agg_sync_time /
    7225            0 :             CheckpointStats.ckpt_sync_rels;
    7226         1597 :     average_msecs = (long) ((average_sync_time + 999) / 1000);
    7227              : 
    7228              :     /*
    7229              :      * ControlFileLock is not required to see ControlFile->checkPoint and
    7230              :      * ->checkPointCopy here as we are the only updator of those variables at
    7231              :      * this moment.
    7232              :      */
    7233         1597 :     if (restartpoint)
    7234          210 :         ereport(LOG,
    7235              :                 (errmsg("restartpoint complete:%s: wrote %d buffers (%.1f%%), "
    7236              :                         "wrote %d SLRU buffers; %d WAL file(s) added, "
    7237              :                         "%d removed, %d recycled; write=%ld.%03d s, "
    7238              :                         "sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, "
    7239              :                         "longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, "
    7240              :                         "estimate=%d kB; lsn=%X/%08X, redo lsn=%X/%08X",
    7241              :                         CheckpointFlagsString(flags),
    7242              :                         CheckpointStats.ckpt_bufs_written,
    7243              :                         (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
    7244              :                         CheckpointStats.ckpt_slru_written,
    7245              :                         CheckpointStats.ckpt_segs_added,
    7246              :                         CheckpointStats.ckpt_segs_removed,
    7247              :                         CheckpointStats.ckpt_segs_recycled,
    7248              :                         write_msecs / 1000, (int) (write_msecs % 1000),
    7249              :                         sync_msecs / 1000, (int) (sync_msecs % 1000),
    7250              :                         total_msecs / 1000, (int) (total_msecs % 1000),
    7251              :                         CheckpointStats.ckpt_sync_rels,
    7252              :                         longest_msecs / 1000, (int) (longest_msecs % 1000),
    7253              :                         average_msecs / 1000, (int) (average_msecs % 1000),
    7254              :                         (int) (PrevCheckPointDistance / 1024.0),
    7255              :                         (int) (CheckPointDistanceEstimate / 1024.0),
    7256              :                         LSN_FORMAT_ARGS(ControlFile->checkPoint),
    7257              :                         LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo))));
    7258              :     else
    7259         1387 :         ereport(LOG,
    7260              :                 (errmsg("checkpoint complete:%s: wrote %d buffers (%.1f%%), "
    7261              :                         "wrote %d SLRU buffers; %d WAL file(s) added, "
    7262              :                         "%d removed, %d recycled; write=%ld.%03d s, "
    7263              :                         "sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, "
    7264              :                         "longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, "
    7265              :                         "estimate=%d kB; lsn=%X/%08X, redo lsn=%X/%08X",
    7266              :                         CheckpointFlagsString(flags),
    7267              :                         CheckpointStats.ckpt_bufs_written,
    7268              :                         (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
    7269              :                         CheckpointStats.ckpt_slru_written,
    7270              :                         CheckpointStats.ckpt_segs_added,
    7271              :                         CheckpointStats.ckpt_segs_removed,
    7272              :                         CheckpointStats.ckpt_segs_recycled,
    7273              :                         write_msecs / 1000, (int) (write_msecs % 1000),
    7274              :                         sync_msecs / 1000, (int) (sync_msecs % 1000),
    7275              :                         total_msecs / 1000, (int) (total_msecs % 1000),
    7276              :                         CheckpointStats.ckpt_sync_rels,
    7277              :                         longest_msecs / 1000, (int) (longest_msecs % 1000),
    7278              :                         average_msecs / 1000, (int) (average_msecs % 1000),
    7279              :                         (int) (PrevCheckPointDistance / 1024.0),
    7280              :                         (int) (CheckPointDistanceEstimate / 1024.0),
    7281              :                         LSN_FORMAT_ARGS(ControlFile->checkPoint),
    7282              :                         LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo))));
    7283              : }
    7284              : 
    7285              : /*
    7286              :  * Update the estimate of distance between checkpoints.
    7287              :  *
    7288              :  * The estimate is used to calculate the number of WAL segments to keep
    7289              :  * preallocated, see XLOGfileslop().
    7290              :  */
    7291              : static void
    7292         1931 : UpdateCheckPointDistanceEstimate(uint64 nbytes)
    7293              : {
    7294              :     /*
    7295              :      * To estimate the number of segments consumed between checkpoints, keep a
    7296              :      * moving average of the amount of WAL generated in previous checkpoint
    7297              :      * cycles. However, if the load is bursty, with quiet periods and busy
    7298              :      * periods, we want to cater for the peak load. So instead of a plain
    7299              :      * moving average, let the average decline slowly if the previous cycle
    7300              :      * used less WAL than estimated, but bump it up immediately if it used
    7301              :      * more.
    7302              :      *
    7303              :      * When checkpoints are triggered by max_wal_size, this should converge to
    7304              :      * CheckpointSegments * wal_segment_size,
    7305              :      *
    7306              :      * Note: This doesn't pay any attention to what caused the checkpoint.
    7307              :      * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
    7308              :      * starting a base backup, are counted the same as those created
    7309              :      * automatically. The slow-decline will largely mask them out, if they are
    7310              :      * not frequent. If they are frequent, it seems reasonable to count them
    7311              :      * in as any others; if you issue a manual checkpoint every 5 minutes and
    7312              :      * never let a timed checkpoint happen, it makes sense to base the
    7313              :      * preallocation on that 5 minute interval rather than whatever
    7314              :      * checkpoint_timeout is set to.
    7315              :      */
    7316         1931 :     PrevCheckPointDistance = nbytes;
    7317         1931 :     if (CheckPointDistanceEstimate < nbytes)
    7318          855 :         CheckPointDistanceEstimate = nbytes;
    7319              :     else
    7320         1076 :         CheckPointDistanceEstimate =
    7321         1076 :             (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
    7322         1931 : }
    7323              : 
    7324              : /*
    7325              :  * Update the ps display for a process running a checkpoint.  Note that
    7326              :  * this routine should not do any allocations so as it can be called
    7327              :  * from a critical section.
    7328              :  */
    7329              : static void
    7330         3862 : update_checkpoint_display(int flags, bool restartpoint, bool reset)
    7331              : {
    7332              :     /*
    7333              :      * The status is reported only for end-of-recovery and shutdown
    7334              :      * checkpoints or shutdown restartpoints.  Updating the ps display is
    7335              :      * useful in those situations as it may not be possible to rely on
    7336              :      * pg_stat_activity to see the status of the checkpointer or the startup
    7337              :      * process.
    7338              :      */
    7339         3862 :     if ((flags & (CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IS_SHUTDOWN)) == 0)
    7340         2356 :         return;
    7341              : 
    7342         1506 :     if (reset)
    7343          753 :         set_ps_display("");
    7344              :     else
    7345              :     {
    7346              :         char        activitymsg[128];
    7347              : 
    7348         2259 :         snprintf(activitymsg, sizeof(activitymsg), "performing %s%s%s",
    7349          753 :                  (flags & CHECKPOINT_END_OF_RECOVERY) ? "end-of-recovery " : "",
    7350          753 :                  (flags & CHECKPOINT_IS_SHUTDOWN) ? "shutdown " : "",
    7351              :                  restartpoint ? "restartpoint" : "checkpoint");
    7352          753 :         set_ps_display(activitymsg);
    7353              :     }
    7354              : }
    7355              : 
    7356              : 
    7357              : /*
    7358              :  * Perform a checkpoint --- either during shutdown, or on-the-fly
    7359              :  *
    7360              :  * flags is a bitwise OR of the following:
    7361              :  *  CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
    7362              :  *  CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
    7363              :  *  CHECKPOINT_FAST: finish the checkpoint ASAP, ignoring
    7364              :  *      checkpoint_completion_target parameter.
    7365              :  *  CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
    7366              :  *      since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
    7367              :  *      CHECKPOINT_END_OF_RECOVERY).
    7368              :  *  CHECKPOINT_FLUSH_UNLOGGED: also flush buffers of unlogged tables.
    7369              :  *
    7370              :  * Note: flags contains other bits, of interest here only for logging purposes.
    7371              :  * In particular note that this routine is synchronous and does not pay
    7372              :  * attention to CHECKPOINT_WAIT.
    7373              :  *
    7374              :  * If !shutdown then we are writing an online checkpoint. An XLOG_CHECKPOINT_REDO
    7375              :  * record is inserted into WAL at the logical location of the checkpoint, before
    7376              :  * flushing anything to disk, and when the checkpoint is eventually completed,
    7377              :  * and it is from this point that WAL replay will begin in the case of a recovery
    7378              :  * from this checkpoint. Once everything is written to disk, an
    7379              :  * XLOG_CHECKPOINT_ONLINE record is written to complete the checkpoint, and
    7380              :  * points back to the earlier XLOG_CHECKPOINT_REDO record. This mechanism allows
    7381              :  * other write-ahead log records to be written while the checkpoint is in
    7382              :  * progress, but we must be very careful about order of operations. This function
    7383              :  * may take many minutes to execute on a busy system.
    7384              :  *
    7385              :  * On the other hand, when shutdown is true, concurrent insertion into the
    7386              :  * write-ahead log is impossible, so there is no need for two separate records.
    7387              :  * In this case, we only insert an XLOG_CHECKPOINT_SHUTDOWN record, and it's
    7388              :  * both the record marking the completion of the checkpoint and the location
    7389              :  * from which WAL replay would begin if needed.
    7390              :  *
    7391              :  * Returns true if a new checkpoint was performed, or false if it was skipped
    7392              :  * because the system was idle.
    7393              :  */
    7394              : bool
    7395         1725 : CreateCheckPoint(int flags)
    7396              : {
    7397              :     bool        shutdown;
    7398              :     CheckPoint  checkPoint;
    7399              :     XLogRecPtr  recptr;
    7400              :     XLogSegNo   _logSegNo;
    7401         1725 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    7402              :     uint32      freespace;
    7403              :     XLogRecPtr  PriorRedoPtr;
    7404              :     XLogRecPtr  last_important_lsn;
    7405              :     VirtualTransactionId *vxids;
    7406              :     int         nvxids;
    7407         1725 :     int         oldXLogAllowed = 0; /* saved by LocalSetXLogInsertAllowed(), restored after an end-of-recovery checkpoint */
    7408              : 
    7409              :     /*
    7410              :      * An end-of-recovery checkpoint is really a shutdown checkpoint, just
    7411              :      * issued at a different time.
    7412              :      */
    7413         1725 :     if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
    7414          729 :         shutdown = true;
    7415              :     else
    7416          996 :         shutdown = false;
    7417              : 
    7418              :     /* sanity check: only an end-of-recovery checkpoint may run during recovery */
    7419         1725 :     if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
    7420            0 :         elog(ERROR, "can't create a checkpoint during recovery");
    7421              : 
    7422              :     /*
    7423              :      * Prepare to accumulate statistics.
    7424              :      *
    7425              :      * Note: because it is possible for log_checkpoints to change while a
    7426              :      * checkpoint proceeds, we always accumulate stats, even if
    7427              :      * log_checkpoints is currently off.
    7428              :      */
    7429        18975 :     MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
    7430         1725 :     CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
    7431              : 
    7432              :     /*
    7433              :      * Let smgr prepare for checkpoint; this has to happen outside the
    7434              :      * critical section and before we determine the REDO pointer.  Note that
    7435              :      * smgr must not do anything that'd have to be undone if we decide no
    7436              :      * checkpoint is needed.
    7437              :      */
    7438         1725 :     SyncPreCheckpoint();
    7439              : 
    7440              :     /* Run these points outside the critical section. */
    7441         1725 :     INJECTION_POINT("create-checkpoint-initial", NULL);
    7442         1725 :     INJECTION_POINT_LOAD("create-checkpoint-run");
    7443              : 
    7444              :     /*
    7445              :      * Use a critical section to force system panic if we have trouble.
    7446              :      */
    7447         1725 :     START_CRIT_SECTION();
    7448              : 
    7449         1725 :     if (shutdown)
    7450              :     {
    7451          729 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    7452          729 :         ControlFile->state = DB_SHUTDOWNING;
    7453          729 :         UpdateControlFile();
    7454          729 :         LWLockRelease(ControlFileLock);
    7455              :     }
    7456              : 
    7457              :     /* Begin filling in the checkpoint WAL record */
    7458        22425 :     MemSet(&checkPoint, 0, sizeof(checkPoint));
    7459         1725 :     checkPoint.time = (pg_time_t) time(NULL);
    7460              : 
    7461              :     /*
    7462              :      * For Hot Standby, derive the oldestActiveXid before we fix the redo
    7463              :      * pointer. This allows us to begin accumulating changes to assemble our
    7464              :      * starting snapshot of locks and transactions.
    7465              :      */
    7466         1725 :     if (!shutdown && XLogStandbyInfoActive())
    7467          956 :         checkPoint.oldestActiveXid = GetOldestActiveTransactionId(false, true);
    7468              :     else
    7469          769 :         checkPoint.oldestActiveXid = InvalidTransactionId;
    7470              : 
    7471              :     /*
    7472              :      * Get location of last important record before acquiring insert locks (as
    7473              :      * GetLastImportantRecPtr() also locks WAL locks).
    7474              :      */
    7475         1725 :     last_important_lsn = GetLastImportantRecPtr();
    7476              : 
    7477              :     /*
    7478              :      * If this isn't a shutdown or forced checkpoint, and if there has been no
    7479              :      * WAL activity requiring a checkpoint, skip it.  The idea here is to
    7480              :      * avoid inserting duplicate checkpoints when the system is idle.
    7481              :      */
    7482         1725 :     if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
    7483              :                   CHECKPOINT_FORCE)) == 0)
    7484              :     {
    7485          207 :         if (last_important_lsn == ControlFile->checkPoint)
    7486              :         {
    7487            4 :             END_CRIT_SECTION();
    7488            4 :             ereport(DEBUG1,
    7489              :                     (errmsg_internal("checkpoint skipped because system is idle")));
    7490            4 :             return false;
    7491              :         }
    7492              :     }
    7493              : 
    7494              :     /*
    7495              :      * An end-of-recovery checkpoint is created before anyone is allowed to
    7496              :      * write WAL. To allow us to write the checkpoint record, temporarily
    7497              :      * enable XLogInsertAllowed.
    7498              :      */
    7499         1721 :     if (flags & CHECKPOINT_END_OF_RECOVERY)
    7500           30 :         oldXLogAllowed = LocalSetXLogInsertAllowed();
    7501              : 
    7502         1721 :     checkPoint.ThisTimeLineID = XLogCtl->InsertTimeLineID;
    7503         1721 :     if (flags & CHECKPOINT_END_OF_RECOVERY)
    7504           30 :         checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
    7505              :     else
    7506         1691 :         checkPoint.PrevTimeLineID = checkPoint.ThisTimeLineID;
    7507              : 
    7508              :     /*
    7509              :      * We must block concurrent insertions while examining insert state.
    7510              :      */
    7511         1721 :     WALInsertLockAcquireExclusive();
    7512              : 
    7513         1721 :     checkPoint.fullPageWrites = Insert->fullPageWrites;
    7514         1721 :     checkPoint.wal_level = wal_level;
    7515              : 
    7516              :     /*
    7517              :      * Get the current data_checksum_version value from xlogctl, valid at the
    7518              :      * time of the checkpoint.  NOTE(review): read without info_lck here and reassigned under info_lck below; confirm.
    7519              :      */
    7520         1721 :     checkPoint.dataChecksumState = XLogCtl->data_checksum_version;
    7521              : 
    7522         1721 :     if (shutdown)
    7523              :     {
    7524          729 :         XLogRecPtr  curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
    7525              : 
    7526              :         /*
    7527              :          * Compute new REDO record ptr = location of next XLOG record.
    7528              :          *
    7529              :          * Since this is a shutdown checkpoint, there can't be any concurrent
    7530              :          * WAL insertion.
    7531              :          */
    7532          729 :         freespace = INSERT_FREESPACE(curInsert);
    7533          729 :         if (freespace == 0)
    7534              :         {
    7535            0 :             if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
    7536            0 :                 curInsert += SizeOfXLogLongPHD;
    7537              :             else
    7538            0 :                 curInsert += SizeOfXLogShortPHD;
    7539              :         }
    7540          729 :         checkPoint.redo = curInsert;
    7541              : 
    7542              :         /*
    7543              :          * Here we update the shared RedoRecPtr for future XLogInsert calls;
    7544              :          * this must be done while holding all the insertion locks.
    7545              :          *
    7546              :          * Note: if we fail to complete the checkpoint, RedoRecPtr will be
    7547              :          * left pointing past where it really needs to point.  This is okay;
    7548              :          * the only consequence is that XLogInsert might back up whole buffers
    7549              :          * that it didn't really need to.  We can't postpone advancing
    7550              :          * RedoRecPtr because XLogInserts that happen while we are dumping
    7551              :          * buffers must assume that their buffer changes are not included in
    7552              :          * the checkpoint.
    7553              :          */
    7554          729 :         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
    7555              :     }
    7556              : 
    7557              :     /*
    7558              :      * Now we can release the WAL insertion locks, allowing other xacts to
    7559              :      * proceed while we are flushing disk buffers.
    7560              :      */
    7561         1721 :     WALInsertLockRelease();
    7562              : 
    7563              :     /*
    7564              :      * If this is an online checkpoint, we have not yet determined the redo
    7565              :      * point. We do so now by inserting the special XLOG_CHECKPOINT_REDO
    7566              :      * record; the LSN at which it starts becomes the new redo pointer. We
    7567              :      * don't do this for a shutdown checkpoint, because in that case no WAL
    7568              :      * can be written between the redo point and the insertion of the
    7569              :      * checkpoint record itself, so the checkpoint record itself serves to
    7570              :      * mark the redo point.
    7571              :      */
    7572         1721 :     if (!shutdown)
    7573              :     {
    7574              :         xl_checkpoint_redo redo_rec;
    7575              : 
    7576          992 :         WALInsertLockAcquire();
    7577          992 :         redo_rec.wal_level = wal_level;
    7578          992 :         SpinLockAcquire(&XLogCtl->info_lck);
    7579          992 :         redo_rec.data_checksum_version = XLogCtl->data_checksum_version;
    7580          992 :         SpinLockRelease(&XLogCtl->info_lck);
    7581          992 :         WALInsertLockRelease();
    7582              : 
    7583              :         /* Include WAL level in record for WAL summarizer's benefit. */
    7584          992 :         XLogBeginInsert();
    7585          992 :         XLogRegisterData(&redo_rec, sizeof(xl_checkpoint_redo));
    7586          992 :         (void) XLogInsert(RM_XLOG_ID, XLOG_CHECKPOINT_REDO);
    7587              : 
    7588              :         /*
    7589              :          * XLogInsertRecord will have updated XLogCtl->Insert.RedoRecPtr in
    7590              :          * shared memory and RedoRecPtr in backend-local memory, but we need
    7591              :          * to copy that into the record that will be inserted when the
    7592              :          * checkpoint is complete.
    7593              :          */
    7594          992 :         checkPoint.redo = RedoRecPtr;
    7595              :     }
    7596              : 
    7597              :     /* Update the info_lck-protected copy of RedoRecPtr as well */
    7598         1721 :     SpinLockAcquire(&XLogCtl->info_lck);
    7599         1721 :     XLogCtl->RedoRecPtr = checkPoint.redo;
    7600         1721 :     SpinLockRelease(&XLogCtl->info_lck);
    7601              : 
    7602              :     /*
    7603              :      * If enabled, log checkpoint start.  We postpone this until now so as not
    7604              :      * to log anything if we decided to skip the checkpoint.
    7605              :      */
    7606         1721 :     if (log_checkpoints)
    7607         1387 :         LogCheckpointStart(flags, false);
    7608              : 
    7609         1721 :     INJECTION_POINT_CACHED("create-checkpoint-run", NULL);
    7610              : 
    7611              :     /* Update the process title */
    7612         1721 :     update_checkpoint_display(flags, false, false);
    7613              : 
    7614              :     TRACE_POSTGRESQL_CHECKPOINT_START(flags);
    7615              : 
    7616              :     /*
    7617              :      * Get the other info we need for the checkpoint record.
    7618              :      *
    7619              :      * We don't need to save oldestClogXid in the checkpoint, it only matters
    7620              :      * for the short period in which clog is being truncated, and if we crash
    7621              :      * during that we'll redo the clog truncation and fix up oldestClogXid
    7622              :      * there.
    7623              :      */
    7624         1721 :     LWLockAcquire(XidGenLock, LW_SHARED);
    7625         1721 :     checkPoint.nextXid = TransamVariables->nextXid;
    7626         1721 :     checkPoint.oldestXid = TransamVariables->oldestXid;
    7627         1721 :     checkPoint.oldestXidDB = TransamVariables->oldestXidDB;
    7628         1721 :     LWLockRelease(XidGenLock);
    7629              : 
    7630         1721 :     LWLockAcquire(CommitTsLock, LW_SHARED);
    7631         1721 :     checkPoint.oldestCommitTsXid = TransamVariables->oldestCommitTsXid;
    7632         1721 :     checkPoint.newestCommitTsXid = TransamVariables->newestCommitTsXid;
    7633         1721 :     LWLockRelease(CommitTsLock);
    7634              : 
    7635         1721 :     LWLockAcquire(OidGenLock, LW_SHARED);
    7636         1721 :     checkPoint.nextOid = TransamVariables->nextOid;
    7637         1721 :     if (!shutdown)
    7638          992 :         checkPoint.nextOid += TransamVariables->oidCount;
    7639         1721 :     LWLockRelease(OidGenLock);
    7640              : 
    7641         1721 :     SpinLockAcquire(&XLogCtl->info_lck);
    7642         1721 :     checkPoint.dataChecksumState = XLogCtl->data_checksum_version;
    7643         1721 :     SpinLockRelease(&XLogCtl->info_lck);
    7644              : 
    7645         1721 :     checkPoint.logicalDecodingEnabled = IsLogicalDecodingEnabled();
    7646              : 
    7647         1721 :     MultiXactGetCheckptMulti(shutdown,
    7648              :                              &checkPoint.nextMulti,
    7649              :                              &checkPoint.nextMultiOffset,
    7650              :                              &checkPoint.oldestMulti,
    7651              :                              &checkPoint.oldestMultiDB);
    7652              : 
    7653              :     /*
    7654              :      * Having constructed the checkpoint record, ensure all shmem disk buffers
    7655              :      * and commit-log buffers are flushed to disk.
    7656              :      *
    7657              :      * This I/O could fail for various reasons.  If so, we will fail to
    7658              :      * complete the checkpoint, but there is no reason to force a system
    7659              :      * panic. Accordingly, exit critical section while doing it.
    7660              :      */
    7661         1721 :     END_CRIT_SECTION();
    7662              : 
    7663              :     /*
    7664              :      * In some cases there are groups of actions that must all occur on one
    7665              :      * side or the other of a checkpoint record. Before flushing the
    7666              :      * checkpoint record we must explicitly wait for any backend currently
    7667              :      * performing those groups of actions.
    7668              :      *
    7669              :      * One example is end of transaction, so we must wait for any transactions
    7670              :      * that are currently in commit critical sections.  If an xact inserted
    7671              :      * its commit record into XLOG just before the REDO point, then a crash
    7672              :      * restart from the REDO point would not replay that record, which means
    7673              :      * that our flushing had better include the xact's update of pg_xact.  So
    7674              :      * we wait till it's out of its commit critical section before proceeding.
    7675              :      * See notes in RecordTransactionCommit().
    7676              :      *
    7677              :      * Because we've already released the insertion locks, this test is a bit
    7678              :      * fuzzy: it is possible that we will wait for xacts we didn't really need
    7679              :      * to wait for.  But the delay should be short and it seems better to make
    7680              :      * checkpoint take a bit longer than to hold off insertions longer than
    7681              :      * necessary. (In fact, the whole reason we have this issue is that xact.c
    7682              :      * does commit record XLOG insertion and clog update as two separate steps
    7683              :      * protected by different locks, but again that seems best on grounds of
    7684              :      * minimizing lock contention.)
    7685              :      *
    7686              :      * A transaction that has not yet set delayChkptFlags when we look cannot
    7687              :      * be at risk, since it has not inserted its commit record yet; and one
    7688              :      * that's already cleared it is not at risk either, since it's done fixing
    7689              :      * clog and we will correctly flush the update below.  So we cannot miss
    7690              :      * any xacts we need to wait for.
    7691              :      */
    7692         1721 :     vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);
    7693         1721 :     if (nvxids > 0)
    7694              :     {
    7695              :         do
    7696              :         {
    7697              :             /*
    7698              :              * Keep absorbing fsync requests while we wait. There could even
    7699              :              * be a deadlock if we don't, if the process that prevents the
    7700              :              * checkpoint is trying to add a request to the queue.
    7701              :              */
    7702           26 :             AbsorbSyncRequests();
    7703              : 
    7704           26 :             pgstat_report_wait_start(WAIT_EVENT_CHECKPOINT_DELAY_START);
    7705           26 :             pg_usleep(10000L);  /* wait for 10 msec */
    7706           26 :             pgstat_report_wait_end();
    7707           26 :         } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
    7708              :                                               DELAY_CHKPT_START));
    7709              :     }
    7710         1721 :     pfree(vxids);
    7711              : 
    7712         1721 :     CheckPointGuts(checkPoint.redo, flags);
    7713              : 
    7714         1721 :     vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE); /* same wait loop as above, now for DELAY_CHKPT_COMPLETE */
    7715         1721 :     if (nvxids > 0)
    7716              :     {
    7717              :         do
    7718              :         {
    7719            0 :             AbsorbSyncRequests();
    7720              : 
    7721            0 :             pgstat_report_wait_start(WAIT_EVENT_CHECKPOINT_DELAY_COMPLETE);
    7722            0 :             pg_usleep(10000L);  /* wait for 10 msec */
    7723            0 :             pgstat_report_wait_end();
    7724            0 :         } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
    7725              :                                               DELAY_CHKPT_COMPLETE));
    7726              :     }
    7727         1721 :     pfree(vxids);
    7728              : 
    7729              :     /*
    7730              :      * Take a snapshot of running transactions and write this to WAL. This
    7731              :      * allows us to reconstruct the state of running transactions during
    7732              :      * archive recovery, if required. Skip if this info is disabled.
    7733              :      *
    7734              :      * If we are shutting down, or the Startup process is completing crash
    7735              :      * recovery, we don't need to write running xact data.
    7736              :      */
    7737         1721 :     if (!shutdown && XLogStandbyInfoActive())
    7738          952 :         LogStandbySnapshot(InvalidOid);
    7739              : 
    7740         1721 :     START_CRIT_SECTION();
    7741              : 
    7742              :     /*
    7743              :      * Now insert the checkpoint record into XLOG.
    7744              :      */
    7745         1721 :     XLogBeginInsert();
    7746         1721 :     XLogRegisterData(&checkPoint, sizeof(checkPoint));
    7747         1721 :     recptr = XLogInsert(RM_XLOG_ID,
    7748              :                         shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
    7749              :                         XLOG_CHECKPOINT_ONLINE);
    7750              : 
    7751         1721 :     XLogFlush(recptr);
    7752              : 
    7753              :     /*
    7754              :      * We mustn't write any new WAL after a shutdown checkpoint, or it will be
    7755              :      * overwritten at next startup.  No-one should even try, this just allows
    7756              :      * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
    7757              :      * to just temporarily disable writing until the system has exited
    7758              :      * recovery.
    7759              :      */
    7760         1721 :     if (shutdown)
    7761              :     {
    7762          729 :         if (flags & CHECKPOINT_END_OF_RECOVERY)
    7763           30 :             LocalXLogInsertAllowed = oldXLogAllowed;
    7764              :         else
    7765          699 :             LocalXLogInsertAllowed = 0; /* never again write WAL */
    7766              :     }
    7767              : 
    7768              :     /*
    7769              :      * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
    7770              :      * = end of actual checkpoint record.
    7771              :      */
    7772         1721 :     if (shutdown && checkPoint.redo != ProcLastRecPtr)
    7773            0 :         ereport(PANIC,
    7774              :                 (errmsg("concurrent write-ahead log activity while database system is shutting down")));
    7775              : 
    7776              :     /*
    7777              :      * Remember the prior checkpoint's redo ptr for
    7778              :      * UpdateCheckPointDistanceEstimate()
    7779              :      */
    7780         1721 :     PriorRedoPtr = ControlFile->checkPointCopy.redo;
    7781              : 
    7782              :     /*
    7783              :      * Update the control file.
    7784              :      */
    7785         1721 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    7786         1721 :     if (shutdown)
    7787          729 :         ControlFile->state = DB_SHUTDOWNED;
    7788         1721 :     ControlFile->checkPoint = ProcLastRecPtr;
    7789         1721 :     ControlFile->checkPointCopy = checkPoint;
    7790              :     /* crash recovery should always recover to the end of WAL */
    7791         1721 :     ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
    7792         1721 :     ControlFile->minRecoveryPointTLI = 0;
    7793              : 
    7794              :     /* make sure we start with the checksum version as of the checkpoint */
    7795         1721 :     ControlFile->data_checksum_version = checkPoint.dataChecksumState;
    7796              : 
    7797              :     /*
    7798              :      * Persist unloggedLSN value. It's reset on crash recovery, so this goes
    7799              :      * unused on non-shutdown checkpoints, but seems useful to store it always
    7800              :      * for debugging purposes.
    7801              :      */
    7802         1721 :     ControlFile->unloggedLSN = pg_atomic_read_membarrier_u64(&XLogCtl->unloggedLSN);
    7803              : 
    7804         1721 :     UpdateControlFile();
    7805         1721 :     LWLockRelease(ControlFileLock);
    7806              : 
    7807              :     /*
    7808              :      * We are now done with critical updates; no need for system panic if we
    7809              :      * have trouble while fooling with old log segments.
    7810              :      */
    7811         1721 :     END_CRIT_SECTION();
    7812              : 
    7813              :     /*
    7814              :      * WAL summaries end when the next XLOG_CHECKPOINT_REDO or
    7815              :      * XLOG_CHECKPOINT_SHUTDOWN record is reached. This is the first point
    7816              :      * where (a) we're not inside of a critical section and (b) we can be
    7817              :      * certain that the relevant record has been flushed to disk, which must
    7818              :      * happen before it can be summarized.
    7819              :      *
    7820              :      * If this is a shutdown checkpoint, then this happens reasonably
    7821              :      * promptly: we've only just inserted and flushed the
    7822              :      * XLOG_CHECKPOINT_SHUTDOWN record. If this is not a shutdown checkpoint,
    7823              :      * then this might not be very prompt at all: the XLOG_CHECKPOINT_REDO
    7824              :      * record was written before we began flushing data to disk, and that
    7825              :      * could be many minutes ago at this point. However, we don't XLogFlush()
    7826              :      * after inserting that record, so we're not guaranteed that it's on disk
    7827              :      * until after the above call that flushes the XLOG_CHECKPOINT_ONLINE
    7828              :      * record.
    7829              :      */
    7830         1721 :     WakeupWalSummarizer();
    7831              : 
    7832              :     /*
    7833              :      * Let smgr do post-checkpoint cleanup (eg, deleting old files).
    7834              :      */
    7835         1721 :     SyncPostCheckpoint();
    7836              : 
    7837              :     /*
    7838              :      * Update the average distance between checkpoints if the prior checkpoint
    7839              :      * exists.
    7840              :      */
    7841         1721 :     if (XLogRecPtrIsValid(PriorRedoPtr))
    7842         1721 :         UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
    7843              : 
    7844         1721 :     INJECTION_POINT("checkpoint-before-old-wal-removal", NULL);
    7845              : 
    7846              :     /*
    7847              :      * Delete old log files (those no longer needed for the last checkpoint) to
    7848              :      * prevent the disk holding the xlog from growing full.
    7849              :      */
    7850         1721 :     XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
    7851         1721 :     KeepLogSeg(recptr, &_logSegNo);
    7852         1721 :     if (InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_REMOVED | RS_INVAL_IDLE_TIMEOUT,
    7853              :                                            _logSegNo, InvalidOid,
    7854              :                                            InvalidTransactionId))
    7855              :     {
    7856              :         /*
    7857              :          * Some slots have been invalidated; recalculate the old-segment
    7858              :          * horizon, starting again from RedoRecPtr.
    7859              :          */
    7860            4 :         XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
    7861            4 :         KeepLogSeg(recptr, &_logSegNo);
    7862              :     }
    7863         1721 :     _logSegNo--;
    7864         1721 :     RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr,
    7865              :                        checkPoint.ThisTimeLineID);
    7866              : 
    7867              :     /*
    7868              :      * Make more log segments if needed.  (Do this after recycling old log
    7869              :      * segments, since that may supply some of the needed files.)
    7870              :      */
    7871         1721 :     if (!shutdown)
    7872          992 :         PreallocXlogFiles(recptr, checkPoint.ThisTimeLineID);
    7873              : 
    7874              :     /*
    7875              :      * Truncate pg_subtrans if possible.  We can throw away all data before
    7876              :      * the oldest XMIN of any running transaction.  No future transaction will
    7877              :      * attempt to reference any pg_subtrans entry older than that (see Asserts
    7878              :      * in subtrans.c).  During recovery, though, we mustn't do this because
    7879              :      * StartupSUBTRANS hasn't been called yet.
    7880              :      */
    7881         1721 :     if (!RecoveryInProgress())
    7882         1691 :         TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
    7883              : 
    7884              :     /* Real work is done; log and update stats. */
    7885         1721 :     LogCheckpointEnd(false, flags);
    7886              : 
    7887              :     /* Reset the process title */
    7888         1721 :     update_checkpoint_display(flags, false, true);
    7889              : 
    7890              :     TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
    7891              :                                      NBuffers,
    7892              :                                      CheckpointStats.ckpt_segs_added,
    7893              :                                      CheckpointStats.ckpt_segs_removed,
    7894              :                                      CheckpointStats.ckpt_segs_recycled);
    7895              : 
    7896         1721 :     return true;
    7897              : }
    7898              : 
    7899              : /*
    7900              :  * Mark the end of recovery in WAL, though without running a full checkpoint.
    7901              :  * We can expect that a restartpoint is likely to be in progress as we
    7902              :  * do this, though we are unwilling to wait for it to complete.
    7903              :  *
    7904              :  * CreateRestartPoint() allows for the case where recovery may end before
    7905              :  * the restartpoint completes so there is no concern of concurrent behaviour.
    7906              :  */
    7907              : static void
    7908           48 : CreateEndOfRecoveryRecord(void)
    7909              : {
    7910              :     xl_end_of_recovery xlrec;
    7911              :     XLogRecPtr  recptr;
    7912              : 
    7913              :     /* sanity check: caller must still be in recovery */
    7914           48 :     if (!RecoveryInProgress())
    7915            0 :         elog(ERROR, "can only be used to end recovery");
    7916              : 
    7917           48 :     xlrec.end_time = GetCurrentTimestamp();
    7918           48 :     xlrec.wal_level = wal_level;
    7919              : 
    7920           48 :     WALInsertLockAcquireExclusive();    /* read both timeline IDs under the exclusive WAL insert lock */
    7921           48 :     xlrec.ThisTimeLineID = XLogCtl->InsertTimeLineID;
    7922           48 :     xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
    7923           48 :     WALInsertLockRelease();
    7924              : 
    7925           48 :     START_CRIT_SECTION();
    7926              : 
    7927           48 :     XLogBeginInsert();
    7928           48 :     XLogRegisterData(&xlrec, sizeof(xl_end_of_recovery));
    7929           48 :     recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);
    7930              : 
    7931           48 :     XLogFlush(recptr);
    7932              : 
    7933              :     /*
    7934              :      * Update the control file so that crash recovery can follow the timeline
    7935              :      * changes to this point.
    7936              :      */
    7937           48 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    7938           48 :     ControlFile->minRecoveryPoint = recptr;
    7939           48 :     ControlFile->minRecoveryPointTLI = xlrec.ThisTimeLineID;
    7940              : 
    7941              :     /* start with the latest checksum version (as of the end of recovery) */
    7942           48 :     SpinLockAcquire(&XLogCtl->info_lck);
    7943           48 :     ControlFile->data_checksum_version = XLogCtl->data_checksum_version;
    7944           48 :     SpinLockRelease(&XLogCtl->info_lck);
    7945              : 
    7946           48 :     UpdateControlFile();
    7947           48 :     LWLockRelease(ControlFileLock);
    7948              : 
    7949           48 :     END_CRIT_SECTION();
    7950           48 : }
    7951              : 
    7952              : /*
    7953              :  * Write an OVERWRITE_CONTRECORD message.
    7954              :  *
    7955              :  * When WAL replay expects a continuation record at the start of a page but
    7956              :  * finds none, recovery ends and WAL writing resumes at that point.
    7957              :  * But it's wrong to resume writing new WAL back at the start of the record
    7958              :  * that was broken, because downstream consumers of that WAL (physical
    7959              :  * replicas) are not prepared to "rewind".  So the first action after
    7960              :  * finishing replay of all valid WAL must be to write a record of this type
    7961              :  * at the point where the contrecord was missing; to support xlogreader
    7962              :  * detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added
    7963              :  * to the page header where the record occurs.  xlogreader has an ad-hoc
    7964              :  * mechanism to report metadata about the broken record, which is what we
    7965              :  * use here.
    7966              :  *
    7967              :  * At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to
    7968              :  * skip the record it was reading, and pass back the LSN of the skipped
    7969              :  * record, so that its caller can verify (on "replay" of that record) that the
    7970              :  * XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten.
    7971              :  *
    7972              :  * 'aborted_lsn' is the beginning position of the record that was incomplete;
    7973              :  * it is included in the WAL record.  'pagePtr' and 'newTLI' identify the
    7974              :  * XLOG page where the record is to be inserted; they must match the current
    7975              :  * WAL insert position and are passed here so that we can verify that.
    7976              :  * Returns the LSN that XLogInsert() reports for the new record.
    7977              :  */
    7978              : static XLogRecPtr
    7979           10 : CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr,
    7980              :                                 TimeLineID newTLI)
    7981              : {
    7982              :     xl_overwrite_contrecord xlrec;
    7983              :     XLogRecPtr  recptr;
    7984              :     XLogPageHeader pagehdr;
    7985              :     XLogRecPtr  startPos;
    7986              : 
    7987              :     /* sanity checks: still in recovery, and pagePtr is page-aligned */
    7988           10 :     if (!RecoveryInProgress())
    7989            0 :         elog(ERROR, "can only be used at end of recovery");
    7990           10 :     if (pagePtr % XLOG_BLCKSZ != 0)
    7991            0 :         elog(ERROR, "invalid position for missing continuation record %X/%08X",
    7992              :              LSN_FORMAT_ARGS(pagePtr));
    7993              : 
    7994              :     /* The current WAL insert position should be right after the page header */
    7995           10 :     startPos = pagePtr;
    7996           10 :     if (XLogSegmentOffset(startPos, wal_segment_size) == 0)
    7997            1 :         startPos += SizeOfXLogLongPHD;
    7998              :     else
    7999            9 :         startPos += SizeOfXLogShortPHD;
    8000           10 :     recptr = GetXLogInsertRecPtr();
    8001           10 :     if (recptr != startPos)
    8002            0 :         elog(ERROR, "invalid WAL insert position %X/%08X for OVERWRITE_CONTRECORD",
    8003              :              LSN_FORMAT_ARGS(recptr));
    8004              : 
    8005           10 :     START_CRIT_SECTION();
    8006              : 
    8007              :     /*
    8008              :      * Initialize the XLOG page header (by GetXLogBuffer), and set the
    8009              :      * XLP_FIRST_IS_OVERWRITE_CONTRECORD flag.
    8010              :      *
    8011              :      * No other backend is allowed to write WAL yet, so acquiring the WAL
    8012              :      * insertion lock is just pro forma.
    8013              :      */
    8014           10 :     WALInsertLockAcquire();
    8015           10 :     pagehdr = (XLogPageHeader) GetXLogBuffer(pagePtr, newTLI);
    8016           10 :     pagehdr->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD;
    8017           10 :     WALInsertLockRelease();
    8018              : 
    8019              :     /*
    8020              :      * Insert the XLOG_OVERWRITE_CONTRECORD record as the first record on the
    8021              :      * page.  We know it becomes the first record, because no other backend is
    8022              :      * allowed to write WAL yet.
    8023              :      */
    8024           10 :     XLogBeginInsert();
    8025           10 :     xlrec.overwritten_lsn = aborted_lsn;
    8026           10 :     xlrec.overwrite_time = GetCurrentTimestamp();
    8027           10 :     XLogRegisterData(&xlrec, sizeof(xl_overwrite_contrecord));
    8028           10 :     recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD);
    8029              : 
    8030              :     /* check that the record was inserted at the right place */
    8031           10 :     if (ProcLastRecPtr != startPos)
    8032            0 :         elog(ERROR, "OVERWRITE_CONTRECORD was inserted to unexpected position %X/%08X",
    8033              :              LSN_FORMAT_ARGS(ProcLastRecPtr));
    8034              : 
    8035           10 :     XLogFlush(recptr);
    8036              : 
    8037           10 :     END_CRIT_SECTION();
    8038              : 
    8039           10 :     return recptr;
    8040              : }
    8041              : 
    8042              : /*
    8043              :  * Flush all data in shared memory to disk, and fsync it.
    8044              :  *
    8045              :  * This is the common code shared between regular checkpoints and recovery
    8046              :  * restartpoints.  'checkPointRedo' is handed on to CheckPointTwoPhase().
    8047              :  */
    8048              : static void
    8049         1931 : CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
    8050              : {
    8051         1931 :     CheckPointRelationMap();
    8052         1931 :     CheckPointReplicationSlots(flags & CHECKPOINT_IS_SHUTDOWN);
    8053         1931 :     CheckPointSnapBuild();
    8054         1931 :     CheckPointLogicalRewriteHeap();
    8055         1931 :     CheckPointReplicationOrigin();
    8056              : 
    8057              :     /* Write out all dirty data in SLRUs and the main buffer pool */
    8058              :     TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
    8059         1931 :     CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
    8060         1931 :     CheckPointCLOG();
    8061         1931 :     CheckPointCommitTs();
    8062         1931 :     CheckPointSUBTRANS();
    8063         1931 :     CheckPointMultiXact();
    8064         1931 :     CheckPointPredicate();
    8065         1931 :     CheckPointBuffers(flags);
    8066              : 
    8067              :     /* Perform all queued-up fsyncs, timestamping the start and end */
    8068              :     TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
    8069         1931 :     CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
    8070         1931 :     ProcessSyncRequests();
    8071         1931 :     CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
    8072              :     TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
    8073              : 
    8074              :     /* We deliberately delay 2PC checkpointing as long as possible */
    8075         1931 :     CheckPointTwoPhase(checkPointRedo);
    8076         1931 : }
    8077              : 
    8078              : /*
    8079              :  * Save a checkpoint for recovery restart if appropriate
    8080              :  *
    8081              :  * This function is called each time a checkpoint record is read from XLOG.
    8082              :  * It must determine whether the checkpoint represents a safe restartpoint or
    8083              :  * not.  If so, the checkpoint record is stashed in shared memory so that
    8084              :  * CreateRestartPoint can consult it.  (Note that the latter function is
    8085              :  * executed by the checkpointer, while this one will be executed by the
    8086              :  * startup process.)
    8087              :  */
    8088              : static void
    8089          743 : RecoveryRestartPoint(const CheckPoint *checkPoint, XLogReaderState *record)
    8090              : {
    8091              :     /*
    8092              :      * Refrain from recording a restartpoint candidate if we have seen any
    8093              :      * references to non-existent pages. Restarting recovery from the
    8094              :      * restartpoint would not see the references, so we would lose the
    8095              :      * cross-check that the pages belonged to a relation that was dropped
    8096              :      * later.
    8097              :      */
    8098          743 :     if (XLogHaveInvalidPages())
    8099              :     {
    8100            0 :         elog(DEBUG2,
    8101              :              "could not record restart point at %X/%08X because there are unresolved references to invalid pages",
    8102              :              LSN_FORMAT_ARGS(checkPoint->redo));
    8103            0 :         return;
    8104              :     }
    8105              : 
    8106              :     /*
    8107              :      * Copy the checkpoint record to shared memory, so that checkpointer can
    8108              :      * work out the next time it wants to perform a restartpoint.
    8109              :      */
    8110          743 :     SpinLockAcquire(&XLogCtl->info_lck);
    8111          743 :     XLogCtl->lastCheckPointRecPtr = record->ReadRecPtr;
    8112          743 :     XLogCtl->lastCheckPointEndPtr = record->EndRecPtr;
    8113          743 :     XLogCtl->lastCheckPoint = *checkPoint;
    8114          743 :     SpinLockRelease(&XLogCtl->info_lck);
    8115              : }
    8116              : 
    8117              : /*
    8118              :  * Establish a restartpoint if possible.
    8119              :  *
    8120              :  * This is similar to CreateCheckPoint, but is used during WAL recovery
    8121              :  * to establish a point from which recovery can roll forward without
    8122              :  * replaying the entire recovery log.
    8123              :  *
    8124              :  * Returns true if a new restartpoint was established. We can only establish
    8125              :  * a restartpoint if we have replayed a safe checkpoint record since the last
    8126              :  * restartpoint.
    8127              :  */
    8128              : bool
    8129          614 : CreateRestartPoint(int flags)
    8130              : {
    8131              :     XLogRecPtr  lastCheckPointRecPtr;
    8132              :     XLogRecPtr  lastCheckPointEndPtr;
    8133              :     CheckPoint  lastCheckPoint;
    8134              :     XLogRecPtr  PriorRedoPtr;
    8135              :     XLogRecPtr  receivePtr;
    8136              :     XLogRecPtr  replayPtr;
    8137              :     TimeLineID  replayTLI;
    8138              :     XLogRecPtr  endptr;
    8139              :     XLogSegNo   _logSegNo;
    8140              :     TimestampTz xtime;
    8141              : 
    8142              :     /* Concurrent checkpoint/restartpoint cannot happen */
    8143              :     Assert(!IsUnderPostmaster || MyBackendType == B_CHECKPOINTER);
    8144              : 
    8145              :     /* Get a local copy of the last safe checkpoint record. */
    8146          614 :     SpinLockAcquire(&XLogCtl->info_lck);
    8147          614 :     lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
    8148          614 :     lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
    8149          614 :     lastCheckPoint = XLogCtl->lastCheckPoint;
    8150          614 :     SpinLockRelease(&XLogCtl->info_lck);
    8151              : 
    8152              :     /*
    8153              :      * Check that we're still in recovery mode. It's ok if we exit recovery
    8154              :      * mode after this check; the restart point is valid anyway.
    8155              :      */
    8156          614 :     if (!RecoveryInProgress())
    8157              :     {
    8158            0 :         ereport(DEBUG2,
    8159              :                 (errmsg_internal("skipping restartpoint, recovery has already ended")));
    8160            0 :         return false;
    8161              :     }
    8162              : 
    8163              :     /*
    8164              :      * If the last checkpoint record we've replayed is already our last
    8165              :      * restartpoint, we can't perform a new restart point. We still update
    8166              :      * minRecoveryPoint in that case, so that if this is a shutdown restart
    8167              :      * point, we won't start up earlier than before. That's not strictly
    8168              :      * necessary, but when hot standby is enabled, it would be rather weird if
    8169              :      * the database opened up for read-only connections at a point-in-time
    8170              :      * before the last shutdown. Such time travel is still possible in case of
    8171              :      * immediate shutdown, though.
    8172              :      *
    8173              :      * We don't explicitly advance minRecoveryPoint when we do create a
    8174              :      * restartpoint. It's assumed that flushing the buffers will do that as a
    8175              :      * side-effect.
    8176              :      */
    8177          614 :     if (!XLogRecPtrIsValid(lastCheckPointRecPtr) ||
    8178          283 :         lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
    8179              :     {
    8180          404 :         ereport(DEBUG2,
    8181              :                 errmsg_internal("skipping restartpoint, already performed at %X/%08X",
    8182              :                                 LSN_FORMAT_ARGS(lastCheckPoint.redo)));
    8183              : 
    8184          404 :         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
    8185          404 :         if (flags & CHECKPOINT_IS_SHUTDOWN)
    8186              :         {
    8187           36 :             LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    8188           36 :             ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
    8189           36 :             UpdateControlFile();
    8190           36 :             LWLockRelease(ControlFileLock);
    8191              :         }
    8192          404 :         return false;
    8193              :     }
    8194              : 
    8195              :     /*
    8196              :      * Update the shared RedoRecPtr so that the startup process can calculate
    8197              :      * the number of segments replayed since last restartpoint, and request a
    8198              :      * restartpoint if it exceeds CheckPointSegments.
    8199              :      *
    8200              :      * Like in CreateCheckPoint(), hold off insertions to update it, although
    8201              :      * during recovery this is just pro forma, because no WAL insertions are
    8202              :      * happening.
    8203              :      */
    8204          210 :     WALInsertLockAcquireExclusive();
    8205          210 :     RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
    8206          210 :     WALInsertLockRelease();
    8207              : 
    8208              :     /* Also update the info_lck-protected copy */
    8209          210 :     SpinLockAcquire(&XLogCtl->info_lck);
    8210          210 :     XLogCtl->RedoRecPtr = lastCheckPoint.redo;
    8211          210 :     SpinLockRelease(&XLogCtl->info_lck);
    8212              : 
    8213              :     /*
    8214              :      * Prepare to accumulate statistics.
    8215              :      *
    8216              :      * Note: because it is possible for log_checkpoints to change while a
    8217              :      * checkpoint proceeds, we always accumulate stats, even if
    8218              :      * log_checkpoints is currently off.
    8219              :      */
    8220         2310 :     MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
    8221          210 :     CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
    8222              : 
    8223          210 :     if (log_checkpoints)
    8224          210 :         LogCheckpointStart(flags, true);
    8225              : 
    8226              :     /* Update the process title */
    8227          210 :     update_checkpoint_display(flags, true, false);
    8228              : 
    8229          210 :     CheckPointGuts(lastCheckPoint.redo, flags);
    8230              : 
    8231              :     /*
    8232              :      * This location needs to be after CheckPointGuts() to ensure that some
    8233              :      * work has already happened during this checkpoint.
    8234              :      */
    8235          210 :     INJECTION_POINT("create-restart-point", NULL);
    8236              : 
    8237              :     /*
    8238              :      * Remember the prior checkpoint's redo ptr for
    8239              :      * UpdateCheckPointDistanceEstimate()
    8240              :      */
    8241          210 :     PriorRedoPtr = ControlFile->checkPointCopy.redo;
    8242              : 
    8243              :     /*
    8244              :      * Update pg_control, using current time.  Check that it still shows an
    8245              :      * older checkpoint, else do nothing; this is a quick hack to make sure
    8246              :      * nothing really bad happens if somehow we get here after the
    8247              :      * end-of-recovery checkpoint.
    8248              :      */
    8249          210 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    8250          210 :     if (ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
    8251              :     {
    8252              :         /*
    8253              :          * Update the checkpoint information.  We do this even if the cluster
    8254              :          * does not show DB_IN_ARCHIVE_RECOVERY to match with the set of WAL
    8255              :          * segments recycled below.
    8256              :          */
    8257          210 :         ControlFile->checkPoint = lastCheckPointRecPtr;
    8258          210 :         ControlFile->checkPointCopy = lastCheckPoint;
    8259              : 
    8260              :         /*
    8261              :          * Ensure minRecoveryPoint is past the checkpoint record and update it
    8262              :          * if the control file still shows DB_IN_ARCHIVE_RECOVERY.  Normally,
    8263              :          * this will have happened already while writing out dirty buffers,
    8264              :          * but not necessarily - e.g. because no buffers were dirtied.  We do
    8265              :          * this because a backup performed in recovery uses minRecoveryPoint
    8266              :          * to determine which WAL files must be included in the backup, and
    8267              :          * the file (or files) containing the checkpoint record must be
    8268              :          * included, at a minimum.  Note that for an ordinary restart of
    8269              :          * recovery there's no value in having the minimum recovery point any
    8270              :          * earlier than this anyway, because redo will begin just after the
    8271              :          * checkpoint record.
    8272              :          */
    8273          210 :         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
    8274              :         {
    8275          210 :             if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
    8276              :             {
    8277           20 :                 ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
    8278           20 :                 ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
    8279              : 
    8280              :                 /* update local copy */
    8281           20 :                 LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    8282           20 :                 LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    8283              :             }
    8284          210 :             if (flags & CHECKPOINT_IS_SHUTDOWN)
    8285           24 :                 ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
    8286              :         }
    8287              : 
    8288              :         /* we shall start with the latest checksum version */
    8289          210 :         ControlFile->data_checksum_version = lastCheckPoint.dataChecksumState;
    8290              : 
    8291          210 :         UpdateControlFile();
    8292              :     }
    8293          210 :     LWLockRelease(ControlFileLock);
    8294              : 
    8295              :     /*
    8296              :      * Update the average distance between checkpoints/restartpoints if the
    8297              :      * prior checkpoint exists.
    8298              :      */
    8299          210 :     if (XLogRecPtrIsValid(PriorRedoPtr))
    8300          210 :         UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
    8301              : 
    8302              :     /*
    8303              :      * Delete old log files that are no longer needed for the last
    8304              :      * restartpoint, to prevent the disk holding the xlog from growing full.
    8305              :      */
    8306          210 :     XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
    8307              : 
    8308              :     /*
    8309              :      * Retreat _logSegNo using the current end of xlog replayed or received,
    8310              :      * whichever is later.
    8311              :      */
    8312          210 :     receivePtr = GetWalRcvFlushRecPtr(NULL, NULL);
    8313          210 :     replayPtr = GetXLogReplayRecPtr(&replayTLI);
    8314          210 :     endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
    8315          210 :     KeepLogSeg(endptr, &_logSegNo);
    8316              : 
    8317          210 :     INJECTION_POINT("restartpoint-before-slot-invalidation", NULL);
    8318              : 
    8319          210 :     if (InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_REMOVED | RS_INVAL_IDLE_TIMEOUT,
    8320              :                                            _logSegNo, InvalidOid,
    8321              :                                            InvalidTransactionId))
    8322              :     {
    8323              :         /*
    8324              :          * Some slots have been invalidated; recalculate the old-segment
    8325              :          * horizon, starting again from RedoRecPtr.
    8326              :          */
    8327            1 :         XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
    8328            1 :         KeepLogSeg(endptr, &_logSegNo);
    8329              :     }
    8330          210 :     _logSegNo--;
    8331              : 
    8332              :     /*
    8333              :      * Try to recycle segments on a useful timeline. If we've been promoted
    8334              :      * since the beginning of this restartpoint, use the new timeline chosen
    8335              :      * at end of recovery.  If we're still in recovery, use the timeline we're
    8336              :      * currently replaying.
    8337              :      *
    8338              :      * There is no guarantee that the WAL segments will be useful on the
    8339              :      * current timeline; if recovery proceeds to a new timeline right after
    8340              :      * this, the pre-allocated WAL segments on this timeline will not be used,
    8341              :      * and will go to waste until recycled on the next restartpoint. We'll
    8342              :      * live with that.
    8343              :      */
    8344          210 :     if (!RecoveryInProgress())
    8345            0 :         replayTLI = XLogCtl->InsertTimeLineID;
    8346              : 
    8347          210 :     RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr, replayTLI);
    8348              : 
    8349              :     /*
    8350              :      * Make more log segments if needed.  (Do this after recycling old log
    8351              :      * segments, since that may supply some of the needed files.)
    8352              :      */
    8353          210 :     PreallocXlogFiles(endptr, replayTLI);
    8354              : 
    8355              :     /*
    8356              :      * Truncate pg_subtrans if possible.  We can throw away all data before
    8357              :      * the oldest XMIN of any running transaction.  No future transaction will
    8358              :      * attempt to reference any pg_subtrans entry older than that (see Asserts
    8359              :      * in subtrans.c).  When hot standby is disabled, though, we mustn't do
    8360              :      * this because StartupSUBTRANS hasn't been called yet.
    8361              :      */
    8362          210 :     if (EnableHotStandby)
    8363          210 :         TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
    8364              : 
    8365              :     /* Real work is done; log and update stats. */
    8366          210 :     LogCheckpointEnd(true, flags);
    8367              : 
    8368              :     /* Reset the process title */
    8369          210 :     update_checkpoint_display(flags, true, true);
    8370              : 
    8371          210 :     xtime = GetLatestXTime();
    8372          210 :     ereport((log_checkpoints ? LOG : DEBUG2),
    8373              :             errmsg("recovery restart point at %X/%08X",
    8374              :                    LSN_FORMAT_ARGS(lastCheckPoint.redo)),
    8375              :             xtime ? errdetail("Last completed transaction was at log time %s.",
    8376              :                               timestamptz_to_str(xtime)) : 0);
    8377              : 
    8378              :     /*
    8379              :      * Finally, execute archive_cleanup_command, if one is set.
    8380              :      */
    8381          210 :     if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != 0)
    8382            0 :         ExecuteRecoveryCommand(archiveCleanupCommand,
    8383              :                                "archive_cleanup_command",
    8384              :                                false,
    8385              :                                WAIT_EVENT_ARCHIVE_CLEANUP_COMMAND);
    8386              : 
    8387          210 :     return true;
    8388              : }
    8389              : 
    8390              : /*
    8391              :  * Report availability of WAL for the given target LSN
    8392              :  *      (typically a slot's restart_lsn)
    8393              :  *
    8394              :  * Returns one of the following enum values:
    8395              :  *
    8396              :  * * WALAVAIL_RESERVED means targetLSN is available and it is in the range of
    8397              :  *   max_wal_size.
    8398              :  *
    8399              :  * * WALAVAIL_EXTENDED means it is still available by preserving extra
    8400              :  *   segments beyond max_wal_size. If max_slot_wal_keep_size is smaller
    8401              :  *   than max_wal_size, this state is not returned.
    8402              :  *
    8403              :  * * WALAVAIL_UNRESERVED means it is being lost and the next checkpoint will
    8404              :  *   remove reserved segments. The walsender using this slot may return to the
    8405              :  *   above.
    8406              :  *
    8407              :  * * WALAVAIL_REMOVED means it has been removed. A replication stream on
    8408              :  *   a slot with this LSN cannot continue.  (Any associated walsender
    8409              :  *   processes should have been terminated already.)
    8410              :  *
    8411              :  * * WALAVAIL_INVALID_LSN means the slot hasn't been set to reserve WAL.
    8412              :  */
    8413              : WALAvailability
    8414          628 : GetWALAvailability(XLogRecPtr targetLSN)
    8415              : {
    8416              :     XLogRecPtr  currpos;        /* current write LSN */
    8417              :     XLogSegNo   currSeg;        /* segid of currpos */
    8418              :     XLogSegNo   targetSeg;      /* segid of targetLSN */
    8419              :     XLogSegNo   oldestSeg;      /* actual oldest segid */
    8420              :     XLogSegNo   oldestSegMaxWalSize;    /* oldest segid kept by max_wal_size */
    8421              :     XLogSegNo   oldestSlotSeg;  /* oldest segid kept by slot */
    8422              :     uint64      keepSegs;       /* max_wal_size in segments, plus one */
    8423              : 
    8424              :     /*
    8425              :      * Slot reserves no WAL: it is either deactivated or was never active.
    8426              :      */
    8427          628 :     if (!XLogRecPtrIsValid(targetLSN))
    8428           27 :         return WALAVAIL_INVALID_LSN;
    8429              : 
    8430              :     /*
    8431              :      * Calculate the oldest segment currently reserved by all slots,
    8432              :      * considering wal_keep_size and max_slot_wal_keep_size.  oldestSlotSeg
    8433              :      * starts at the current segment; KeepLogSeg() only ever lowers it.
    8434              :      */
    8435          601 :     currpos = GetXLogWriteRecPtr();
    8436          601 :     XLByteToSeg(currpos, oldestSlotSeg, wal_segment_size);
    8437          601 :     KeepLogSeg(currpos, &oldestSlotSeg);
    8438              : 
    8439              :     /*
    8440              :      * Find the oldest extant segment file.  This yields 1 until a checkpoint
    8441              :      * removes the first WAL segment file since startup, so the status can be
    8442              :      * wrong under certain abnormal conditions, but that is harmless.
    8443              :      */
    8444          601 :     oldestSeg = XLogGetLastRemovedSegno() + 1;
    8445              : 
    8446              :     /* oldest segment within max_wal_size of currpos; never below segment 1 */
    8447          601 :     XLByteToSeg(currpos, currSeg, wal_segment_size);
    8448          601 :     keepSegs = ConvertToXSegs(max_wal_size_mb, wal_segment_size) + 1;
    8449              : 
    8450          601 :     if (currSeg > keepSegs)
    8451            8 :         oldestSegMaxWalSize = currSeg - keepSegs;
    8452              :     else
    8453          593 :         oldestSegMaxWalSize = 1;
    8454              : 
    8455              :     /* the segment we care about */
    8456          601 :     XLByteToSeg(targetLSN, targetSeg, wal_segment_size);
    8457              : 
    8458              :     /*
    8459              :      * No point in returning reserved or extended status values if the
    8460              :      * targetSeg is known to be lost.
    8461              :      */
    8462          601 :     if (targetSeg >= oldestSlotSeg)
    8463              :     {
    8464              :         /* show "reserved" when targetSeg is within max_wal_size */
    8465          600 :         if (targetSeg >= oldestSegMaxWalSize)
    8466          598 :             return WALAVAIL_RESERVED;
    8467              : 
    8468              :         /* being retained by slots exceeding max_wal_size */
    8469            2 :         return WALAVAIL_EXTENDED;
    8470              :     }
    8471              : 
    8472              :     /* WAL segments are no longer retained but haven't been removed yet */
    8473            1 :     if (targetSeg >= oldestSeg)
    8474            1 :         return WALAVAIL_UNRESERVED;
    8475              : 
    8476              :     /* Definitely lost */
    8477            0 :     return WALAVAIL_REMOVED;
    8478              : }
    8479              : 
    8480              : 
    8481              : /*
    8482              :  * Retreat *logSegNo to the last segment that we need to retain because of
    8483              :  * wal_keep_size, replication slots, or not-yet-summarized WAL.
    8484              :  *
    8485              :  * This is calculated by subtracting wal_keep_size from the given xlog
    8486              :  * location, recptr and by making sure that that result is below the
    8487              :  * requirement of replication slots.  For the latter criterion we do consider
    8488              :  * the effects of max_slot_wal_keep_size: reserve at most that much space back
    8489              :  * from recptr.
    8490              :  *
    8491              :  * Note about replication slots: if this function calculates a value
    8492              :  * that's further ahead than what slots need reserved, then affected
    8493              :  * slots need to be invalidated and this function invoked again.
    8494              :  * XXX it might be a good idea to rewrite this function so that
    8495              :  * invalidation is optionally done here, instead.
                      :  *
                      :  * recptr is the WAL location that the retention limits are measured back
                      :  * from.  *logSegNo is an in/out argument: it is lowered to the computed
                      :  * segment when that one is older, and is otherwise left untouched.
    8496              :  */
    8497              : static void
    8498         2537 : KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
    8499              : {
    8500              :     XLogSegNo   currSegNo;
    8501              :     XLogSegNo   segno;
    8502              :     XLogRecPtr  keep;
    8503              : 
    8504         2537 :     XLByteToSeg(recptr, currSegNo, wal_segment_size);
    8505         2537 :     segno = currSegNo;
    8506              : 
    8507              :     /* Back up to the segment of the slots' minimum LSN, if behind recptr. */
    8508         2537 :     keep = XLogGetReplicationSlotMinimumLSN();
    8509         2537 :     if (XLogRecPtrIsValid(keep) && keep < recptr)
    8510              :     {
    8511          770 :         XLByteToSeg(keep, segno, wal_segment_size);
    8512              : 
    8513              :         /*
    8514              :          * Account for max_slot_wal_keep_size to avoid keeping more than
    8515              :          * configured.  However, don't do that during a binary upgrade: if
    8516              :          * slots were to be invalidated because of this, it would not be
    8517              :          * possible to preserve logical ones during the upgrade.
    8518              :          */
    8519          770 :         if (max_slot_wal_keep_size_mb >= 0 && !IsBinaryUpgrade)
    8520              :         {
    8521              :             uint64      slot_keep_segs;
    8522              : 
    8523           23 :             slot_keep_segs =
    8524           23 :                 ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size);
    8525              : 
    8526           23 :             if (currSegNo - segno > slot_keep_segs)
    8527            6 :                 segno = currSegNo - slot_keep_segs;
    8528              :         }
    8529              :     }
    8530              : 
    8531              :     /*
    8532              :      * If WAL summarization is in use, don't remove WAL that has yet to be
    8533              :      * summarized.
    8534              :      */
    8535         2537 :     keep = GetOldestUnsummarizedLSN(NULL, NULL);
    8536         2537 :     if (XLogRecPtrIsValid(keep))
    8537              :     {
    8538              :         XLogSegNo   unsummarized_segno;
    8539              : 
    8540            7 :         XLByteToSeg(keep, unsummarized_segno, wal_segment_size);
    8541            7 :         if (unsummarized_segno < segno)
    8542            7 :             segno = unsummarized_segno;
    8543              :     }
    8544              : 
    8545              :     /* but, keep at least wal_keep_size if that's set */
    8546         2537 :     if (wal_keep_size_mb > 0)
    8547              :     {
    8548              :         uint64      keep_segs;
    8549              : 
    8550           74 :         keep_segs = ConvertToXSegs(wal_keep_size_mb, wal_segment_size);
    8551           74 :         if (currSegNo - segno < keep_segs)
    8552              :         {
    8553              :             /* avoid underflow, don't go below 1 */
    8554           74 :             if (currSegNo <= keep_segs)
    8555           70 :                 segno = 1;
    8556              :             else
    8557            4 :                 segno = currSegNo - keep_segs;
    8558              :         }
    8559              :     }
    8560              : 
    8561              :     /* only ever move *logSegNo backwards, never forwards */
    8562         2537 :     if (segno < *logSegNo)
    8563          362 :         *logSegNo = segno;
    8564         2537 : }
    8565              : 
    8566              : /*
    8567              :  * Write a NEXTOID log record carrying nextOid as its data.
                      :  *
                      :  * The record is inserted but deliberately not flushed; see below.
    8568              :  */
    8569              : void
    8570          690 : XLogPutNextOid(Oid nextOid)
    8571              : {
    8572          690 :     XLogBeginInsert();
    8573          690 :     XLogRegisterData(&nextOid, sizeof(Oid));
    8574          690 :     (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
    8575              : 
    8576              :     /*
    8577              :      * We need not flush the NEXTOID record immediately, because any of the
    8578              :      * just-allocated OIDs could only reach disk as part of a tuple insert or
    8579              :      * update that would have its own XLOG record that must follow the NEXTOID
    8580              :      * record.  Therefore, the standard buffer LSN interlock applied to those
    8581              :      * records will ensure no such OID reaches disk before the NEXTOID record
    8582              :      * does.
    8583              :      *
    8584              :      * Note, however, that the above statement only covers state "within" the
    8585              :      * database.  When we use a generated OID as a file or directory name, we
    8586              :      * are in a sense violating the basic WAL rule, because that filesystem
    8587              :      * change may reach disk before the NEXTOID WAL record does.  The impact
    8588              :      * of this is that if a database crash occurs immediately afterward, we
    8589              :      * might after restart re-generate the same OID and find that it conflicts
    8590              :      * with the leftover file or directory.  But since for safety's sake we
    8591              :      * always loop until finding a nonconflicting filename, this poses no real
    8592              :      * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
    8593              :      */
    8594          690 : }
    8595              : 
    8596              : /*
    8597              :  * Write an XLOG SWITCH record.
    8598              :  *
    8599              :  * Here we just blindly issue an XLogInsert request for the record.
    8600              :  * All the magic happens inside XLogInsert.
    8601              :  *
    8602              :  * The return value is either the end+1 address of the switch record,
    8603              :  * or the end+1 address of the prior segment if we did not need to
    8604              :  * write a switch record because we are already at segment start.
                      :  *
                      :  * If mark_unimportant is true, the record is flagged with
                      :  * XLOG_MARK_UNIMPORTANT before it is inserted.
    8605              :  */
    8606              : XLogRecPtr
    8607          814 : RequestXLogSwitch(bool mark_unimportant)
    8608              : {
    8609              :     XLogRecPtr  RecPtr;
    8610              : 
    8611              :     /* XLOG SWITCH has no data */
    8612          814 :     XLogBeginInsert();
    8613              : 
    8614          814 :     if (mark_unimportant)
    8615            0 :         XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
    8616          814 :     RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
    8617              : 
    8618          814 :     return RecPtr;
    8619              : }
    8620              : 
    8621              : /*
    8622              :  * Write a RESTORE POINT record
                      :  *
                      :  * The record holds the current timestamp and rpName; the name is copied
                      :  * with strlcpy() limited to MAXFNAMELEN bytes.  The creation is reported
                      :  * at LOG level, and the location returned by XLogInsert() is passed back
                      :  * to the caller.
    8623              :  */
    8624              : XLogRecPtr
    8625            3 : XLogRestorePoint(const char *rpName)
    8626              : {
    8627              :     XLogRecPtr  RecPtr;
    8628              :     xl_restore_point xlrec;
    8629              : 
    8630            3 :     xlrec.rp_time = GetCurrentTimestamp();
    8631            3 :     strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
    8632              : 
    8633            3 :     XLogBeginInsert();
    8634            3 :     XLogRegisterData(&xlrec, sizeof(xl_restore_point));
    8635              : 
    8636            3 :     RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
    8637              : 
    8638            3 :     ereport(LOG,
    8639              :             errmsg("restore point \"%s\" created at %X/%08X",
    8640              :                    rpName, LSN_FORMAT_ARGS(RecPtr)));
    8641              : 
    8642            3 :     return RecPtr;
    8643              : }
    8644              : 
    8645              : /*
    8646              :  * Write an empty XLOG record to assign a distinct LSN.
    8647              :  *
    8648              :  * This is used by some index AMs when building indexes on permanent relations
    8649              :  * with wal_level=minimal.  In that scenario, WAL-logging will start after
    8650              :  * commit, but the index AM needs distinct LSNs to detect concurrent page
    8651              :  * modifications.  When the current WAL insert position hasn't advanced since
    8652              :  * the last call, we emit a dummy record to ensure we get a new, distinct LSN.
                      :  *
                      :  * The record is flagged XLOG_MARK_UNIMPORTANT; the value returned by
                      :  * XLogInsert() is passed back to the caller.
    8653              :  */
    8654              : XLogRecPtr
    8655       112021 : XLogAssignLSN(void)
    8656              : {
    8657       112021 :     int         dummy = 0;
    8658              : 
    8659              :     /*
    8660              :      * Records other than XLOG_SWITCH must have content.  We use an integer 0
    8661              :      * to satisfy this restriction.
    8662              :      */
    8663       112021 :     XLogBeginInsert();
    8664       112021 :     XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
    8665       112021 :     XLogRegisterData(&dummy, sizeof(dummy));
    8666       112021 :     return XLogInsert(RM_XLOG_ID, XLOG_ASSIGN_LSN);
    8667              : }
    8668              : 
    8669              : /*
    8670              :  * Check if any of the GUC parameters that are critical for hot standby
    8671              :  * have changed, and update the value in pg_control file if necessary.
                      :  *
                      :  * If a change is found, an XLOG_PARAMETER_CHANGE record is written and
                      :  * flushed first (skipped only when wal_level is unchanged and
                      :  * XLogIsNeeded() is false), and then the new values are stored in
                      :  * pg_control while holding ControlFileLock.
    8672              :  */
    8673              : static void
    8674         1007 : XLogReportParameters(void)
    8675              : {
    8676         1007 :     if (wal_level != ControlFile->wal_level ||
    8677          733 :         wal_log_hints != ControlFile->wal_log_hints ||
    8678          644 :         MaxConnections != ControlFile->MaxConnections ||
    8679          643 :         max_worker_processes != ControlFile->max_worker_processes ||
    8680          640 :         max_wal_senders != ControlFile->max_wal_senders ||
    8681          613 :         max_prepared_xacts != ControlFile->max_prepared_xacts ||
    8682          509 :         max_locks_per_xact != ControlFile->max_locks_per_xact ||
    8683          509 :         track_commit_timestamp != ControlFile->track_commit_timestamp)
    8684              :     {
    8685              :         /*
    8686              :          * The change in number of backend slots doesn't need to be WAL-logged
    8687              :          * if archiving is not enabled, as you can't start archive recovery
    8688              :          * with wal_level=minimal anyway. We don't really care about the
    8689              :          * values in pg_control either if wal_level=minimal, but seems better
    8690              :          * to keep them up-to-date to avoid confusion.
    8691              :          */
    8692          510 :         if (wal_level != ControlFile->wal_level || XLogIsNeeded())
    8693              :         {
    8694              :             xl_parameter_change xlrec;
    8695              :             XLogRecPtr  recptr;
    8696              : 
    8697          483 :             xlrec.MaxConnections = MaxConnections;
    8698          483 :             xlrec.max_worker_processes = max_worker_processes;
    8699          483 :             xlrec.max_wal_senders = max_wal_senders;
    8700          483 :             xlrec.max_prepared_xacts = max_prepared_xacts;
    8701          483 :             xlrec.max_locks_per_xact = max_locks_per_xact;
    8702          483 :             xlrec.wal_level = wal_level;
    8703          483 :             xlrec.wal_log_hints = wal_log_hints;
    8704          483 :             xlrec.track_commit_timestamp = track_commit_timestamp;
    8705              : 
    8706          483 :             XLogBeginInsert();
    8707          483 :             XLogRegisterData(&xlrec, sizeof(xlrec));
    8708              : 
    8709          483 :             recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
    8710          483 :             XLogFlush(recptr);
    8711              :         }
    8712              : 
    8713          510 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    8714              : 
    8715          510 :         ControlFile->MaxConnections = MaxConnections;
    8716          510 :         ControlFile->max_worker_processes = max_worker_processes;
    8717          510 :         ControlFile->max_wal_senders = max_wal_senders;
    8718          510 :         ControlFile->max_prepared_xacts = max_prepared_xacts;
    8719          510 :         ControlFile->max_locks_per_xact = max_locks_per_xact;
    8720          510 :         ControlFile->wal_level = wal_level;
    8721          510 :         ControlFile->wal_log_hints = wal_log_hints;
    8722          510 :         ControlFile->track_commit_timestamp = track_commit_timestamp;
    8723          510 :         UpdateControlFile();
    8724              : 
    8725          510 :         LWLockRelease(ControlFileLock);
    8726              :     }
    8727         1007 : }
    8728              : 
    8729              : /*
    8730              :  * Log the new state of checksums
                      :  *
                      :  * Inserts an XLOG2_CHECKSUMS record containing new_type and flushes WAL
                      :  * up to the location returned by XLogInsert() before returning.
    8731              :  */
    8732              : static void
    8733           24 : XLogChecksums(uint32 new_type)
    8734              : {
    8735              :     xl_checksum_state xlrec;
    8736              :     XLogRecPtr  recptr;
    8737              : 
    8738           24 :     xlrec.new_checksum_state = new_type;
    8739              : 
    8740           24 :     XLogBeginInsert();
    8741           24 :     XLogRegisterData((char *) &xlrec, sizeof(xl_checksum_state));
    8742              : 
    8743           24 :     recptr = XLogInsert(RM_XLOG2_ID, XLOG2_CHECKSUMS);
    8744           24 :     XLogFlush(recptr);
    8745           24 : }
    8746              : 
    8747              : /*
    8748              :  * Update full_page_writes in shared memory, and write an
    8749              :  * XLOG_FPW_CHANGE record if necessary.
    8750              :  *
    8751              :  * Note: this function assumes there is no other process running
    8752              :  * concurrently that could update it.
    8753              :  */
    8754              : void
    8755         1720 : UpdateFullPageWrites(void)
    8756              : {
    8757         1720 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    8758              :     bool        recoveryInProgress;
    8759              : 
    8760              :     /*
    8761              :      * Do nothing if full_page_writes has not been changed.
    8762              :      *
    8763              :      * It's safe to check the shared full_page_writes without the lock,
    8764              :      * because we assume that there is no concurrently running process which
    8765              :      * can update it.
    8766              :      */
    8767         1720 :     if (fullPageWrites == Insert->fullPageWrites)
    8768         1286 :         return;
    8769              : 
    8770              :     /*
    8771              :      * Perform this outside critical section so that the WAL insert
    8772              :      * initialization done by RecoveryInProgress() doesn't trigger an
    8773              :      * assertion failure.
    8774              :      */
    8775          434 :     recoveryInProgress = RecoveryInProgress();
    8776              : 
    8777          434 :     START_CRIT_SECTION();
    8778              : 
    8779              :     /*
    8780              :      * It's always safe to take full page images, even when not strictly
    8781              :      * required, but not the other way round. So if we're setting
    8782              :      * full_page_writes to true, first set it true and then write the WAL
    8783              :      * record. If we're setting it to false, first write the WAL record and
    8784              :      * then set the global flag.
    8785              :      */
    8786          434 :     if (fullPageWrites)
    8787              :     {
    8788          421 :         WALInsertLockAcquireExclusive();
    8789          421 :         Insert->fullPageWrites = true;
    8790          421 :         WALInsertLockRelease();
    8791              :     }
    8792              : 
    8793              :     /*
    8794              :      * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
    8795              :      * full_page_writes during archive recovery, if required.
                      :      *
                      :      * The record is only written when XLogStandbyInfoActive() holds and we
                      :      * are not in recovery.
    8796              :      */
    8797          434 :     if (XLogStandbyInfoActive() && !recoveryInProgress)
    8798              :     {
    8799            0 :         XLogBeginInsert();
    8800            0 :         XLogRegisterData(&fullPageWrites, sizeof(bool));
    8801              : 
    8802            0 :         XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
    8803              :     }
    8804              : 
    8805          434 :     if (!fullPageWrites)
    8806              :     {
    8807           13 :         WALInsertLockAcquireExclusive();
    8808           13 :         Insert->fullPageWrites = false;
    8809           13 :         WALInsertLockRelease();
    8810              :     }
    8811          434 :     END_CRIT_SECTION();
    8812              : }
    8813              : 
    8814              : /*
    8815              :  * XLOG resource manager's routines
    8816              :  *
    8817              :  * Definitions of info values are in include/catalog/pg_control.h, though
    8818              :  * not all record types are related to control file updates.
    8819              :  *
    8820              :  * NOTE: Some XLOG record types that are directly related to WAL recovery
    8821              :  * are handled in xlogrecovery_redo().
    8822              :  */
    8823              : void
    8824       114398 : xlog_redo(XLogReaderState *record)
    8825              : {
    8826       114398 :     uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    8827       114398 :     XLogRecPtr  lsn = record->EndRecPtr;
    8828              : 
    8829              :     /*
    8830              :      * In XLOG rmgr, backup blocks are only used by XLOG_FPI and
    8831              :      * XLOG_FPI_FOR_HINT records.
    8832              :      */
    8833              :     Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
    8834              :            !XLogRecHasAnyBlockRefs(record));
    8835              : 
    8836       114398 :     if (info == XLOG_NEXTOID)
    8837              :     {
    8838              :         Oid         nextOid;
    8839              : 
    8840              :         /*
    8841              :          * We used to try to take the maximum of TransamVariables->nextOid and
    8842              :          * the recorded nextOid, but that fails if the OID counter wraps
    8843              :          * around.  Since no OID allocation should be happening during replay
    8844              :          * anyway, better to just believe the record exactly.  We still take
    8845              :          * OidGenLock while setting the variable, just in case.
    8846              :          */
    8847           97 :         memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
    8848           97 :         LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
    8849           97 :         TransamVariables->nextOid = nextOid;
    8850           97 :         TransamVariables->oidCount = 0;
    8851           97 :         LWLockRelease(OidGenLock);
    8852              :     }
    8853       114301 :     else if (info == XLOG_CHECKPOINT_SHUTDOWN)
    8854              :     {
    8855              :         CheckPoint  checkPoint;
    8856              :         TimeLineID  replayTLI;
    8857              : 
    8858           42 :         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
    8859              :         /* In a SHUTDOWN checkpoint, believe the counters exactly */
    8860           42 :         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
    8861           42 :         TransamVariables->nextXid = checkPoint.nextXid;
    8862           42 :         LWLockRelease(XidGenLock);
    8863           42 :         LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
    8864           42 :         TransamVariables->nextOid = checkPoint.nextOid;
    8865           42 :         TransamVariables->oidCount = 0;
    8866           42 :         LWLockRelease(OidGenLock);
    8867           42 :         MultiXactSetNextMXact(checkPoint.nextMulti,
    8868              :                               checkPoint.nextMultiOffset);
    8869              : 
    8870           42 :         MultiXactAdvanceOldest(checkPoint.oldestMulti,
    8871              :                                checkPoint.oldestMultiDB);
    8872              : 
    8873           42 :         SpinLockAcquire(&XLogCtl->info_lck);
    8874           42 :         XLogCtl->data_checksum_version = checkPoint.dataChecksumState;
    8875           42 :         SetLocalDataChecksumState(checkPoint.dataChecksumState);
    8876           42 :         SpinLockRelease(&XLogCtl->info_lck);
    8877              : 
    8878              :         /*
    8879              :          * No need to set oldestClogXid here as well; it'll be set when we
    8880              :          * redo an xl_clog_truncate if it changed since initialization.
    8881              :          */
    8882           42 :         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
    8883              : 
    8884              :         /*
    8885              :          * If we see a shutdown checkpoint while waiting for an end-of-backup
    8886              :          * record, the backup was canceled and the end-of-backup record will
    8887              :          * never arrive.
    8888              :          */
    8889           42 :         if (ArchiveRecoveryRequested &&
    8890           41 :             XLogRecPtrIsValid(ControlFile->backupStartPoint) &&
    8891            0 :             !XLogRecPtrIsValid(ControlFile->backupEndPoint))
    8892            0 :             ereport(PANIC,
    8893              :                     (errmsg("online backup was canceled, recovery cannot continue")));
    8894              : 
    8895              :         /*
    8896              :          * If we see a shutdown checkpoint, we know that nothing was running
    8897              :          * on the primary at this point. So fake-up an empty running-xacts
    8898              :          * record and use that here and now. Recover additional standby state
    8899              :          * for prepared transactions.
    8900              :          */
    8901           42 :         if (standbyState >= STANDBY_INITIALIZED)
    8902              :         {
    8903              :             TransactionId *xids;
    8904              :             int         nxids;
    8905              :             TransactionId oldestActiveXID;
    8906              :             TransactionId latestCompletedXid;
    8907              :             RunningTransactionsData running;
    8908              : 
    8909           39 :             oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
    8910              : 
    8911              :             /* Update pg_subtrans entries for any prepared transactions */
    8912           39 :             StandbyRecoverPreparedTransactions();
    8913              : 
    8914              :             /*
    8915              :              * Construct a RunningTransactions snapshot representing a shut
    8916              :              * down server, with only prepared transactions still alive. We're
    8917              :              * never overflowed at this point because all subxids are listed
    8918              :              * with their parent prepared transactions.
    8919              :              */
    8920           39 :             running.xcnt = nxids;
    8921           39 :             running.subxcnt = 0;
    8922           39 :             running.subxid_status = SUBXIDS_IN_SUBTRANS;
    8923           39 :             running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
    8924           39 :             running.oldestRunningXid = oldestActiveXID;
    8925           39 :             latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
    8926           39 :             TransactionIdRetreat(latestCompletedXid);
    8927              :             Assert(TransactionIdIsNormal(latestCompletedXid));
    8928           39 :             running.latestCompletedXid = latestCompletedXid;
    8929           39 :             running.xids = xids;
    8930              : 
    8931           39 :             ProcArrayApplyRecoveryInfo(&running);
    8932              :         }
    8933              : 
    8934              :         /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
    8935           42 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    8936           42 :         ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
    8937           42 :         ControlFile->data_checksum_version = checkPoint.dataChecksumState;
    8938           42 :         LWLockRelease(ControlFileLock);
    8939              : 
    8940              :         /*
    8941              :          * We should've already switched to the new TLI before replaying this
    8942              :          * record.
    8943              :          */
    8944           42 :         (void) GetCurrentReplayRecPtr(&replayTLI);
    8945           42 :         if (checkPoint.ThisTimeLineID != replayTLI)
    8946            0 :             ereport(PANIC,
    8947              :                     (errmsg("unexpected timeline ID %u (should be %u) in shutdown checkpoint record",
    8948              :                             checkPoint.ThisTimeLineID, replayTLI)));
    8949              : 
    8950           42 :         RecoveryRestartPoint(&checkPoint, record);
    8951              : 
    8952              :         /*
    8953              :          * After replaying a checkpoint record, free all smgr objects.
    8954              :          * Otherwise we would never do so for dropped relations, as the
    8955              :          * startup does not process shared invalidation messages or call
    8956              :          * AtEOXact_SMgr().
    8957              :          */
    8958           42 :         smgrdestroyall();
    8959              :     }
    8960       114259 :     else if (info == XLOG_CHECKPOINT_ONLINE)
    8961              :     {
    8962              :         CheckPoint  checkPoint;
    8963              :         TimeLineID  replayTLI;
    8964          701 :         bool        new_state = false;
    8965              :         int         old_state;
    8966              : 
    8967          701 :         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
    8968              :         /* In an ONLINE checkpoint, treat the XID counter as a minimum */
    8969          701 :         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
    8970          701 :         if (FullTransactionIdPrecedes(TransamVariables->nextXid,
    8971              :                                       checkPoint.nextXid))
    8972            0 :             TransamVariables->nextXid = checkPoint.nextXid;
    8973          701 :         LWLockRelease(XidGenLock);
    8974              : 
    8975              :         /*
    8976              :          * We ignore the nextOid counter in an ONLINE checkpoint, preferring
    8977              :          * to track OID assignment through XLOG_NEXTOID records.  The nextOid
    8978              :          * counter is from the start of the checkpoint and might well be stale
    8979              :          * compared to later XLOG_NEXTOID records.  We could try to take the
    8980              :          * maximum of the nextOid counter and our latest value, but since
    8981              :          * there's no particular guarantee about the speed with which the OID
    8982              :          * counter wraps around, that's a risky thing to do.  In any case,
    8983              :          * users of the nextOid counter are required to avoid assignment of
    8984              :          * duplicates, so that a somewhat out-of-date value should be safe.
    8985              :          */
    8986              : 
    8987              :         /* Handle multixact */
    8988          701 :         MultiXactAdvanceNextMXact(checkPoint.nextMulti,
    8989              :                                   checkPoint.nextMultiOffset);
    8990              : 
    8991              :         /*
    8992              :          * NB: This may perform multixact truncation when replaying WAL
    8993              :          * generated by an older primary.
    8994              :          */
    8995          701 :         MultiXactAdvanceOldest(checkPoint.oldestMulti,
    8996              :                                checkPoint.oldestMultiDB);
    8997          701 :         if (TransactionIdPrecedes(TransamVariables->oldestXid,
    8998              :                                   checkPoint.oldestXid))
    8999            0 :             SetTransactionIdLimit(checkPoint.oldestXid,
    9000              :                                   checkPoint.oldestXidDB);
    9001              :         /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
    9002          701 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    9003          701 :         ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
    9004          701 :         old_state = ControlFile->data_checksum_version;
    9005          701 :         ControlFile->data_checksum_version = checkPoint.dataChecksumState;
    9006          701 :         LWLockRelease(ControlFileLock);
    9007              : 
    9008              :         /* TLI should not change in an on-line checkpoint */
    9009          701 :         (void) GetCurrentReplayRecPtr(&replayTLI);
    9010          701 :         if (checkPoint.ThisTimeLineID != replayTLI)
    9011            0 :             ereport(PANIC,
    9012              :                     (errmsg("unexpected timeline ID %u (should be %u) in online checkpoint record",
    9013              :                             checkPoint.ThisTimeLineID, replayTLI)));
    9014              : 
    9015          701 :         RecoveryRestartPoint(&checkPoint, record);
    9016              : 
    9017              :         /*
    9018              :          * If the data checksum state changed, we need to emit a barrier.
    9019              :          */
    9020          701 :         SpinLockAcquire(&XLogCtl->info_lck);
    9021          701 :         XLogCtl->data_checksum_version = checkPoint.dataChecksumState;
    9022          701 :         if (checkPoint.dataChecksumState != old_state)
    9023            3 :             new_state = true;
    9024          701 :         SpinLockRelease(&XLogCtl->info_lck);
    9025              : 
    9026          701 :         if (new_state)
    9027            3 :             EmitAndWaitDataChecksumsBarrier(checkPoint.dataChecksumState);
    9028              : 
    9029              :         /*
    9030              :          * After replaying a checkpoint record, free all smgr objects.
    9031              :          * Otherwise we would never do so for dropped relations, as the
    9032              :          * startup process does not handle shared invalidation messages or
    9033              :          * call AtEOXact_SMgr().
    9034              :          */
    9035          701 :         smgrdestroyall();
    9036              :     }
    9037       113558 :     else if (info == XLOG_OVERWRITE_CONTRECORD)
    9038              :     {
    9039              :         /* nothing to do here, handled in xlogrecovery_redo() */
    9040              :     }
    9041       113557 :     else if (info == XLOG_END_OF_RECOVERY)
    9042              :     {
    9043              :         xl_end_of_recovery xlrec;
    9044              :         TimeLineID  replayTLI;
    9045              : 
    9046           11 :         memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
    9047              : 
    9048              :         /*
    9049              :          * For Hot Standby, we could treat this like a Shutdown Checkpoint,
    9050              :          * but this case is rarer and harder to test, so the benefit doesn't
    9051              :          * outweigh the potential extra cost of maintenance.
    9052              :          */
    9053              : 
    9054              :         /*
    9055              :          * We should've already switched to the new TLI before replaying this
    9056              :          * record.
    9057              :          */
    9058           11 :         (void) GetCurrentReplayRecPtr(&replayTLI);
    9059           11 :         if (xlrec.ThisTimeLineID != replayTLI)
    9060            0 :             ereport(PANIC,
    9061              :                     (errmsg("unexpected timeline ID %u (should be %u) in end-of-recovery record",
    9062              :                             xlrec.ThisTimeLineID, replayTLI)));
    9063              :     }
    9064       113546 :     else if (info == XLOG_NOOP)
    9065              :     {
    9066              :         /* nothing to do here */
    9067              :     }
    9068       113546 :     else if (info == XLOG_SWITCH)
    9069              :     {
    9070              :         /* nothing to do here */
    9071              :     }
    9072       113089 :     else if (info == XLOG_RESTORE_POINT)
    9073              :     {
    9074              :         /* nothing to do here, handled in xlogrecovery.c */
    9075              :     }
    9076       113084 :     else if (info == XLOG_ASSIGN_LSN)
    9077              :     {
    9078              :         /* nothing to do here, see XLogGetFakeLSN() */
    9079              :     }
    9080        50757 :     else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
    9081              :     {
    9082              :         /*
    9083              :          * XLOG_FPI records contain nothing else but one or more block
    9084              :          * references. Every block reference must include a full-page image
    9085              :          * even if full_page_writes was disabled when the record was generated
    9086              :          * - otherwise there would be no point in this record.
    9087              :          *
    9088              :          * XLOG_FPI_FOR_HINT records are generated when a page needs to be
    9089              :          * WAL-logged because of a hint bit update. They are only generated
    9090              :          * when checksums and/or wal_log_hints are enabled. They may include
    9091              :          * no full-page images if full_page_writes was disabled when they were
    9092              :          * generated. In this case there is nothing to do here.
    9093              :          *
    9094              :          * No recovery conflicts are generated by these generic records - if a
    9095              :          * resource manager needs to generate conflicts, it has to define a
    9096              :          * separate WAL record type and redo routine.
    9097              :          */
    9098       104675 :         for (uint8 block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
    9099              :         {
    9100              :             Buffer      buffer;
    9101              : 
    9102        54774 :             if (!XLogRecHasBlockImage(record, block_id))
    9103              :             {
    9104           66 :                 if (info == XLOG_FPI)
    9105            0 :                     elog(ERROR, "XLOG_FPI record did not contain a full-page image");
    9106           66 :                 continue;
    9107              :             }
    9108              : 
    9109        54708 :             if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
    9110            0 :                 elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
    9111        54708 :             UnlockReleaseBuffer(buffer);
    9112              :         }
    9113              :     }
    9114          856 :     else if (info == XLOG_BACKUP_END)
    9115              :     {
    9116              :         /* nothing to do here, handled in xlogrecovery_redo() */
    9117              :     }
    9118          759 :     else if (info == XLOG_PARAMETER_CHANGE)
    9119              :     {
    9120              :         xl_parameter_change xlrec;
    9121              : 
    9122              :         /* Update our copy of the parameters in pg_control */
    9123           39 :         memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
    9124              : 
    9125           39 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    9126           39 :         ControlFile->MaxConnections = xlrec.MaxConnections;
    9127           39 :         ControlFile->max_worker_processes = xlrec.max_worker_processes;
    9128           39 :         ControlFile->max_wal_senders = xlrec.max_wal_senders;
    9129           39 :         ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
    9130           39 :         ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
    9131           39 :         ControlFile->wal_level = xlrec.wal_level;
    9132           39 :         ControlFile->wal_log_hints = xlrec.wal_log_hints;
    9133              : 
    9134              :         /*
    9135              :          * Update minRecoveryPoint to ensure that if recovery is aborted, we
    9136              :          * recover back up to this point before allowing hot standby again.
    9137              :          * This is important if the max_* settings are decreased, to ensure
    9138              :          * you don't run queries against the WAL preceding the change. The
    9139              :          * local copies cannot be updated as long as crash recovery is
    9140              :          * happening and we expect all the WAL to be replayed.
    9141              :          */
    9142           39 :         if (InArchiveRecovery)
    9143              :         {
    9144           24 :             LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    9145           24 :             LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    9146              :         }
    9147           39 :         if (XLogRecPtrIsValid(LocalMinRecoveryPoint) && LocalMinRecoveryPoint < lsn)
    9148              :         {
    9149              :             TimeLineID  replayTLI;
    9150              : 
    9151           12 :             (void) GetCurrentReplayRecPtr(&replayTLI);
    9152           12 :             ControlFile->minRecoveryPoint = lsn;
    9153           12 :             ControlFile->minRecoveryPointTLI = replayTLI;
    9154              :         }
    9155              : 
    9156           39 :         CommitTsParameterChange(xlrec.track_commit_timestamp,
    9157           39 :                                 ControlFile->track_commit_timestamp);
    9158           39 :         ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
    9159              : 
    9160           39 :         UpdateControlFile();
    9161           39 :         LWLockRelease(ControlFileLock);
    9162              : 
    9163              :         /* Check to see if any parameter change gives a problem on recovery */
    9164           39 :         CheckRequiredParameterValues();
    9165              :     }
    9166          720 :     else if (info == XLOG_FPW_CHANGE)
    9167              :     {
    9168              :         bool        fpw;
    9169              : 
    9170            0 :         memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
    9171              : 
    9172              :         /*
    9173              :          * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
    9174              :          * do_pg_backup_start() and do_pg_backup_stop() can check whether
    9175              :          * full_page_writes has been disabled during online backup.
    9176              :          */
    9177            0 :         if (!fpw)
    9178              :         {
    9179            0 :             SpinLockAcquire(&XLogCtl->info_lck);
    9180            0 :             if (XLogCtl->lastFpwDisableRecPtr < record->ReadRecPtr)
    9181            0 :                 XLogCtl->lastFpwDisableRecPtr = record->ReadRecPtr;
    9182            0 :             SpinLockRelease(&XLogCtl->info_lck);
    9183              :         }
    9184              : 
    9185              :         /* Keep track of full_page_writes */
    9186            0 :         lastFullPageWrites = fpw;
    9187              :     }
    9188          720 :     else if (info == XLOG_CHECKPOINT_REDO)
    9189              :     {
    9190              :         xl_checkpoint_redo redo_rec;
    9191          702 :         bool        new_state = false;
    9192              : 
    9193          702 :         memcpy(&redo_rec, XLogRecGetData(record), sizeof(xl_checkpoint_redo));
    9194              : 
    9195          702 :         SpinLockAcquire(&XLogCtl->info_lck);
    9196          702 :         XLogCtl->data_checksum_version = redo_rec.data_checksum_version;
    9197          702 :         if (redo_rec.data_checksum_version != ControlFile->data_checksum_version)
    9198            3 :             new_state = true;
    9199          702 :         SpinLockRelease(&XLogCtl->info_lck);
    9200              : 
    9201          702 :         if (new_state)
    9202            3 :             EmitAndWaitDataChecksumsBarrier(redo_rec.data_checksum_version);
    9203              :     }
    9204           18 :     else if (info == XLOG_LOGICAL_DECODING_STATUS_CHANGE)
    9205              :     {
    9206              :         bool        status;
    9207              : 
    9208           18 :         memcpy(&status, XLogRecGetData(record), sizeof(bool));
    9209              : 
    9210              :         /*
    9211              :          * We need to toggle the logical decoding status and update the
    9212              :          * XLogLogicalInfo cache of processes synchronously because
    9213              :          * XLogLogicalInfoActive() is used even during read-only queries
    9214              :          * (e.g., via RelationIsAccessibleInLogicalDecoding()). In the
    9215              :          * 'disable' case, it is safe to invalidate existing slots after
    9216              :          * disabling logical decoding because logical decoding cannot process
    9217              :          * subsequent WAL records, which may not contain logical information.
    9218              :          */
    9219           18 :         if (status)
    9220            9 :             EnableLogicalDecoding();
    9221              :         else
    9222            9 :             DisableLogicalDecoding();
    9223              : 
    9224           18 :         elog(DEBUG1, "update logical decoding status to %d during recovery",
    9225              :              status);
    9226              : 
    9227           18 :         if (InRecovery && InHotStandby)
    9228              :         {
    9229           16 :             if (!status)
    9230              :             {
    9231              :                 /*
    9232              :                  * Invalidate logical slots if we are in hot standby and the
    9233              :                  * primary disabled logical decoding.
    9234              :                  */
    9235            9 :                 InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_LEVEL,
    9236              :                                                    0, InvalidOid,
    9237              :                                                    InvalidTransactionId);
    9238              :             }
    9239            7 :             else if (sync_replication_slots)
    9240              :             {
    9241              :                 /*
    9242              :                  * Signal the postmaster to launch the slotsync worker.
    9243              :                  *
    9244              :                  * XXX: For simplicity, we keep the slotsync worker running
    9245              :                  * even after logical decoding is disabled. A future
    9246              :                  * improvement can consider starting and stopping the worker
    9247              :                  * based on logical decoding status change.
    9248              :                  */
    9249            0 :                 kill(PostmasterPid, SIGUSR1);
    9250              :             }
    9251              :         }
    9252              :     }
    9253       114396 : }
    9254              : 
    9255              : void
    9256            4 : xlog2_redo(XLogReaderState *record)
    9257              : {
    9258            4 :     uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    9259              : 
    9260            4 :     if (info == XLOG2_CHECKSUMS)
    9261              :     {
    9262              :         xl_checksum_state state;
    9263              : 
    9264            4 :         memcpy(&state, XLogRecGetData(record), sizeof(xl_checksum_state));
    9265              : 
    9266            4 :         SpinLockAcquire(&XLogCtl->info_lck);
    9267            4 :         XLogCtl->data_checksum_version = state.new_checksum_state;
    9268            4 :         SpinLockRelease(&XLogCtl->info_lck);
    9269              : 
    9270              :         /*
    9271              :          * Block on a procsignalbarrier to await all processes having seen the
    9272              :          * change to checksum status. Once the barrier has been passed we can
    9273              :          * initiate the corresponding processing.
    9274              :          */
    9275            4 :         EmitAndWaitDataChecksumsBarrier(state.new_checksum_state);
    9276              :     }
    9277            4 : }
    9278              : 
    9279              : /*
    9280              :  * Return the extra open flags used for opening a file, depending on the
    9281              :  * value of the GUCs wal_sync_method, fsync and debug_io_direct.
    9282              :  */
    9283              : static int
    9284        17816 : get_sync_bit(int method)
    9285              : {
    9286        17816 :     int         o_direct_flag = 0;
    9287              : 
    9288              :     /*
    9289              :      * Use O_DIRECT if requested, except in walreceiver process.  The WAL
    9290              :      * written by walreceiver is normally read by the startup process soon
    9291              :      * after it's written.  Also, walreceiver performs unaligned writes, which
    9292              :      * don't work with O_DIRECT, so it is required for correctness too.
    9293              :      */
    9294        17816 :     if ((io_direct_flags & IO_DIRECT_WAL) && !AmWalReceiverProcess())
    9295            9 :         o_direct_flag = PG_O_DIRECT;
    9296              : 
    9297              :     /* If fsync is disabled, never open in sync mode */
    9298        17816 :     if (!enableFsync)
    9299        17816 :         return o_direct_flag;
    9300              : 
    9301            0 :     switch (method)
    9302              :     {
    9303              :             /*
    9304              :              * enum values for all sync options are defined even if they are
    9305              :              * not supported on the current platform.  But if not, they are
    9306              :              * not included in the enum option array, and therefore will never
    9307              :              * be seen here.
    9308              :              */
    9309            0 :         case WAL_SYNC_METHOD_FSYNC:
    9310              :         case WAL_SYNC_METHOD_FSYNC_WRITETHROUGH:
    9311              :         case WAL_SYNC_METHOD_FDATASYNC:
    9312            0 :             return o_direct_flag;
    9313              : #ifdef O_SYNC
    9314            0 :         case WAL_SYNC_METHOD_OPEN:
    9315            0 :             return O_SYNC | o_direct_flag;
    9316              : #endif
    9317              : #ifdef O_DSYNC
    9318            0 :         case WAL_SYNC_METHOD_OPEN_DSYNC:
    9319            0 :             return O_DSYNC | o_direct_flag;
    9320              : #endif
    9321            0 :         default:
    9322              :             /* can't happen (unless we are out of sync with option array) */
    9323            0 :             elog(ERROR, "unrecognized \"wal_sync_method\": %d", method);
    9324              :             return 0;           /* silence warning */
    9325              :     }
    9326              : }
    9327              : 
    9328              : /*
    9329              :  * GUC support
    9330              :  */
    9331              : void
    9332         1275 : assign_wal_sync_method(int new_wal_sync_method, void *extra)
    9333              : {
    9334         1275 :     if (wal_sync_method != new_wal_sync_method)
    9335              :     {
    9336              :         /*
    9337              :          * To ensure that no blocks escape unsynced, force an fsync on the
    9338              :          * currently open log segment (if any).  Also, if the open flag is
    9339              :          * changing, close the log file so it will be reopened (with new flag
    9340              :          * bit) at next use.
    9341              :          */
    9342            0 :         if (openLogFile >= 0)
    9343              :         {
    9344            0 :             pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
    9345            0 :             if (pg_fsync(openLogFile) != 0)
    9346              :             {
    9347              :                 char        xlogfname[MAXFNAMELEN];
    9348              :                 int         save_errno;
    9349              : 
    9350            0 :                 save_errno = errno;
    9351            0 :                 XLogFileName(xlogfname, openLogTLI, openLogSegNo,
    9352              :                              wal_segment_size);
    9353            0 :                 errno = save_errno;
    9354            0 :                 ereport(PANIC,
    9355              :                         (errcode_for_file_access(),
    9356              :                          errmsg("could not fsync file \"%s\": %m", xlogfname)));
    9357              :             }
    9358              : 
    9359            0 :             pgstat_report_wait_end();
    9360            0 :             if (get_sync_bit(wal_sync_method) != get_sync_bit(new_wal_sync_method))
    9361            0 :                 XLogFileClose();
    9362              :         }
    9363              :     }
    9364         1275 : }
    9365              : 
    9366              : 
    9367              : /*
    9368              :  * Issue appropriate kind of fsync (if any) for an XLOG output file.
    9369              :  *
    9370              :  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
    9371              :  * 'segno' and 'tli' are for error reporting purposes.
    9372              :  */
    9373              : void
    9374       208090 : issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli)
    9375              : {
    9376       208090 :     char       *msg = NULL;
    9377              :     instr_time  start;
    9378              : 
    9379              :     Assert(tli != 0);
    9380              : 
    9381              :     /*
    9382              :      * Quick exit if fsync is disabled or write() has already synced the WAL
    9383              :      * file.
    9384              :      */
    9385       208090 :     if (!enableFsync ||
    9386            0 :         wal_sync_method == WAL_SYNC_METHOD_OPEN ||
    9387            0 :         wal_sync_method == WAL_SYNC_METHOD_OPEN_DSYNC)
    9388       208090 :         return;
    9389              : 
    9390              :     /*
    9391              :      * Measure I/O timing to sync the WAL file for pg_stat_io.
    9392              :      */
    9393            0 :     start = pgstat_prepare_io_time(track_wal_io_timing);
    9394              : 
    9395            0 :     pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
    9396            0 :     switch (wal_sync_method)
    9397              :     {
    9398            0 :         case WAL_SYNC_METHOD_FSYNC:
    9399            0 :             if (pg_fsync_no_writethrough(fd) != 0)
    9400            0 :                 msg = _("could not fsync file \"%s\": %m");
    9401            0 :             break;
    9402              : #ifdef HAVE_FSYNC_WRITETHROUGH
    9403              :         case WAL_SYNC_METHOD_FSYNC_WRITETHROUGH:
    9404              :             if (pg_fsync_writethrough(fd) != 0)
    9405              :                 msg = _("could not fsync write-through file \"%s\": %m");
    9406              :             break;
    9407              : #endif
    9408            0 :         case WAL_SYNC_METHOD_FDATASYNC:
    9409            0 :             if (pg_fdatasync(fd) != 0)
    9410            0 :                 msg = _("could not fdatasync file \"%s\": %m");
    9411            0 :             break;
    9412            0 :         case WAL_SYNC_METHOD_OPEN:
    9413              :         case WAL_SYNC_METHOD_OPEN_DSYNC:
    9414              :             /* not reachable */
    9415              :             Assert(false);
    9416            0 :             break;
    9417            0 :         default:
    9418            0 :             ereport(PANIC,
    9419              :                     errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    9420              :                     errmsg_internal("unrecognized \"wal_sync_method\": %d", wal_sync_method));
    9421              :             break;
    9422              :     }
    9423              : 
    9424              :     /* PANIC if failed to fsync */
    9425            0 :     if (msg)
    9426              :     {
    9427              :         char        xlogfname[MAXFNAMELEN];
    9428            0 :         int         save_errno = errno;
    9429              : 
    9430            0 :         XLogFileName(xlogfname, tli, segno, wal_segment_size);
    9431            0 :         errno = save_errno;
    9432            0 :         ereport(PANIC,
    9433              :                 (errcode_for_file_access(),
    9434              :                  errmsg(msg, xlogfname)));
    9435              :     }
    9436              : 
    9437            0 :     pgstat_report_wait_end();
    9438              : 
    9439            0 :     pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_FSYNC,
    9440              :                             start, 1, 0);
    9441              : }
    9442              : 
    9443              : /*
    9444              :  * do_pg_backup_start is the workhorse of the user-visible pg_backup_start()
    9445              :  * function. It creates the necessary starting checkpoint and constructs the
    9446              :  * backup state and tablespace map.
    9447              :  *
    9448              :  * Input parameters are "state" (the backup state), "fast" (if true, we do
    9449              :  * the checkpoint in fast mode), and "tablespaces" (if non-NULL, indicates a
    9450              :  * list of tablespaceinfo structs describing the cluster's tablespaces).
    9451              :  *
    9452              :  * The tablespace map contents are appended to the passed-in parameter
    9453              :  * tblspcmapfile and the caller is responsible for including it in the backup
    9454              :  * archive as 'tablespace_map'. The tablespace_map file is required mainly for
    9455              :  * tar format on Windows as native Windows utilities are not able to create
    9456              :  * symlinks while extracting files from tar. However for consistency and
    9457              :  * platform-independence, we do it the same way everywhere.
    9458              :  *
    9459              :  * It fills in "state" with the information required for the backup, such
    9460              :  * as the minimum WAL location that must be present to restore from this
    9461              :  * backup (startpoint) and the corresponding timeline ID (starttli).
    9462              :  *
    9463              :  * Every successfully started backup must be stopped by calling
    9464              :  * do_pg_backup_stop() or do_pg_abort_backup(). There can be many
    9465              :  * backups active at the same time.
    9466              :  *
    9467              :  * It is the responsibility of the caller of this function to verify the
    9468              :  * permissions of the calling user!
    9469              :  */
    9470              : void
    9471          177 : do_pg_backup_start(const char *backupidstr, bool fast, List **tablespaces,
    9472              :                    BackupState *state, StringInfo tblspcmapfile)
    9473              : {
    9474              :     bool        backup_started_in_recovery;
    9475              : 
    9476              :     Assert(state != NULL);
    9477          177 :     backup_started_in_recovery = RecoveryInProgress();
    9478              : 
    9479              :     /*
    9480              :      * During recovery, we don't need to check the WAL level because, if the
    9481              :      * WAL level is not sufficient, it's impossible to get here during recovery.
    9482              :      */
    9483          177 :     if (!backup_started_in_recovery && !XLogIsNeeded())
    9484            0 :         ereport(ERROR,
    9485              :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    9486              :                  errmsg("WAL level not sufficient for making an online backup"),
    9487              :                  errhint("\"wal_level\" must be set to \"replica\" or \"logical\" at server start.")));
    9488              : 
    9489          177 :     if (strlen(backupidstr) > MAXPGPATH)
    9490            1 :         ereport(ERROR,
    9491              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    9492              :                  errmsg("backup label too long (max %d bytes)",
    9493              :                         MAXPGPATH)));
    9494              : 
    9495          176 :     strlcpy(state->name, backupidstr, sizeof(state->name));
    9496              : 
    9497              :     /*
    9498              :      * Mark backup active in shared memory.  We must do full-page WAL writes
    9499              :      * during an on-line backup even if not doing so at other times, because
    9500              :      * it's quite possible for the backup dump to obtain a "torn" (partially
    9501              :      * written) copy of a database page if it reads the page concurrently with
    9502              :      * our write to the same page.  This can be fixed as long as the first
    9503              :      * write to the page in the WAL sequence is a full-page write. Hence, we
    9504              :      * increment runningBackups then force a CHECKPOINT, to ensure there are
    9505              :      * no dirty pages in shared memory that might get dumped while the backup
    9506              :      * is in progress without having a corresponding WAL record.  (Once the
    9507              :      * backup is complete, we need not force full-page writes anymore, since
    9508              :      * we expect that any pages not modified during the backup interval must
    9509              :      * have been correctly captured by the backup.)
    9510              :      *
    9511              :      * Note that forcing full-page writes has no effect during an online
    9512              :      * backup from the standby.
    9513              :      *
    9514              :      * We must hold all the insertion locks to change the value of
    9515              :      * runningBackups, to ensure adequate interlocking against
    9516              :      * XLogInsertRecord().
    9517              :      */
    9518          176 :     WALInsertLockAcquireExclusive();
    9519          176 :     XLogCtl->Insert.runningBackups++;
    9520          176 :     WALInsertLockRelease();
    9521              : 
    9522              :     /*
    9523              :      * Ensure we decrement runningBackups if we fail below. NB -- for this to
    9524              :      * work correctly, it is critical that sessionBackupState is only updated
    9525              :      * after this block is over.
    9526              :      */
    9527          176 :     PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(true));
    9528              :     {
    9529          176 :         bool        gotUniqueStartpoint = false;
    9530              :         DIR        *tblspcdir;
    9531              :         struct dirent *de;
    9532              :         tablespaceinfo *ti;
    9533              :         int         datadirpathlen;
    9534              : 
    9535              :         /*
    9536              :          * Force an XLOG file switch before the checkpoint, to ensure that the
    9537              :          * WAL segment the checkpoint is written to doesn't contain pages with
    9538              :          * old timeline IDs.  That would otherwise happen if you called
    9539              :          * pg_backup_start() right after restoring from a PITR archive: the
    9540              :          * first WAL segment containing the startup checkpoint has pages in
    9541              :          * the beginning with the old timeline ID.  That can cause trouble at
    9542              :          * recovery: we won't have a history file covering the old timeline if
    9543              :          * pg_wal directory was not included in the base backup and the WAL
    9544              :          * archive was cleared too before starting the backup.
    9545              :          *
    9546              :          * During recovery, we skip forcing XLOG file switch, which means that
    9547              :          * the backup taken during recovery is not available for the special
    9548              :          * recovery case described above.
    9549              :          */
    9550          176 :         if (!backup_started_in_recovery)
    9551          168 :             RequestXLogSwitch(false);
    9552              : 
    9553              :         do
    9554              :         {
    9555              :             bool        checkpointfpw;
    9556              : 
    9557              :             /*
    9558              :              * Force a CHECKPOINT.  Aside from being necessary to prevent torn
    9559              :              * page problems, this guarantees that two successive backup runs
    9560              :              * will have different checkpoint positions and hence different
    9561              :              * history file names, even if nothing happened in between.
    9562              :              *
    9563              :              * During recovery, establish a restartpoint if possible. We use
    9564              :              * the last restartpoint as the backup starting checkpoint. This
    9565              :              * means that two successive backup runs can have same checkpoint
    9566              :              * positions.
    9567              :              *
    9568              :              * Since the fact that we are executing do_pg_backup_start()
    9569              :              * during recovery means that checkpointer is running, we can use
    9570              :              * RequestCheckpoint() to establish a restartpoint.
    9571              :              *
    9572              :              * We use CHECKPOINT_FAST only if requested by user (via passing
    9573              :              * fast = true).  Otherwise this can take a while.
    9574              :              */
    9575          176 :             RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
    9576              :                               (fast ? CHECKPOINT_FAST : 0));
    9577              : 
    9578              :             /*
    9579              :              * Now we need to fetch the checkpoint record location, and also
    9580              :              * its REDO pointer.  The oldest point in WAL that would be needed
    9581              :              * to restore starting from the checkpoint is precisely the REDO
    9582              :              * pointer.
    9583              :              */
    9584          176 :             LWLockAcquire(ControlFileLock, LW_SHARED);
    9585          176 :             state->checkpointloc = ControlFile->checkPoint;
    9586          176 :             state->startpoint = ControlFile->checkPointCopy.redo;
    9587          176 :             state->starttli = ControlFile->checkPointCopy.ThisTimeLineID;
    9588          176 :             checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
    9589          176 :             LWLockRelease(ControlFileLock);
    9590              : 
    9591          176 :             if (backup_started_in_recovery)
    9592              :             {
    9593              :                 XLogRecPtr  recptr;
    9594              : 
    9595              :                 /*
    9596              :                  * Check to see if all WAL replayed during online backup
    9597              :                  * (i.e., since last restartpoint used as backup starting
    9598              :                  * checkpoint) contain full-page writes.
    9599              :                  */
    9600            8 :                 SpinLockAcquire(&XLogCtl->info_lck);
    9601            8 :                 recptr = XLogCtl->lastFpwDisableRecPtr;
    9602            8 :                 SpinLockRelease(&XLogCtl->info_lck);
    9603              : 
    9604            8 :                 if (!checkpointfpw || state->startpoint <= recptr)
    9605            0 :                     ereport(ERROR,
    9606              :                             (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    9607              :                              errmsg("WAL generated with \"full_page_writes=off\" was replayed "
    9608              :                                     "since last restartpoint"),
    9609              :                              errhint("This means that the backup being taken on the standby "
    9610              :                                      "is corrupt and should not be used. "
    9611              :                                      "Enable \"full_page_writes\" and run CHECKPOINT on the primary, "
    9612              :                                      "and then try an online backup again.")));
    9613              : 
    9614              :                 /*
    9615              :                  * During recovery, since we don't use the end-of-backup WAL
    9616              :                  * record and don't write the backup history file, the
    9617              :                  * starting WAL location doesn't need to be unique. This means
    9618              :                  * that two base backups started at the same time might use
    9619              :                  * the same checkpoint as starting locations.
    9620              :                  */
    9621            8 :                 gotUniqueStartpoint = true;
    9622              :             }
    9623              : 
    9624              :             /*
    9625              :              * If two base backups are started at the same time (in WAL sender
    9626              :              * processes), we need to make sure that they use different
    9627              :              * checkpoints as starting locations, because we use the starting
    9628              :              * WAL location as a unique identifier for the base backup in the
    9629              :              * end-of-backup WAL record and when we write the backup history
    9630              :              * file. Perhaps it would be better to generate a separate unique ID
    9631              :              * for each backup instead of forcing another checkpoint, but
    9632              :              * taking a checkpoint right after another is not that expensive
    9633              :              * either because only a few buffers have been dirtied yet.
    9634              :              */
    9635          176 :             WALInsertLockAcquireExclusive();
    9636          176 :             if (XLogCtl->Insert.lastBackupStart < state->startpoint)
    9637              :             {
    9638          176 :                 XLogCtl->Insert.lastBackupStart = state->startpoint;
    9639          176 :                 gotUniqueStartpoint = true;
    9640              :             }
    9641          176 :             WALInsertLockRelease();
    9642          176 :         } while (!gotUniqueStartpoint);
    9643              : 
    9644              :         /*
    9645              :          * Construct tablespace_map file.
    9646              :          */
    9647          176 :         datadirpathlen = strlen(DataDir);
    9648              : 
    9649              :         /* Collect information about all tablespaces */
    9650          176 :         tblspcdir = AllocateDir(PG_TBLSPC_DIR);
    9651          565 :         while ((de = ReadDir(tblspcdir, PG_TBLSPC_DIR)) != NULL)
    9652              :         {
    9653              :             char        fullpath[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
    9654              :             char        linkpath[MAXPGPATH];
    9655          389 :             char       *relpath = NULL;
    9656              :             char       *s;
    9657              :             PGFileType  de_type;
    9658              :             char       *badp;
    9659              :             Oid         tsoid;
    9660              : 
    9661              :             /*
    9662              :              * Try to parse the directory name as an unsigned integer.
    9663              :              *
    9664              :              * Tablespace directories should be positive integers that can be
    9665              :              * represented in 32 bits, with no leading zeroes or trailing
    9666              :              * garbage. If we come across a name that doesn't meet those
    9667              :              * criteria, skip it.
    9668              :              */
    9669          389 :             if (de->d_name[0] < '1' || de->d_name[1] > '9')
    9670          352 :                 continue;
    9671           37 :             errno = 0;
    9672           37 :             tsoid = strtoul(de->d_name, &badp, 10);
    9673           37 :             if (*badp != '\0' || errno == EINVAL || errno == ERANGE)
    9674            0 :                 continue;
    9675              : 
    9676           37 :             snprintf(fullpath, sizeof(fullpath), "%s/%s", PG_TBLSPC_DIR, de->d_name);
    9677              : 
    9678           37 :             de_type = get_dirent_type(fullpath, de, false, ERROR);
    9679              : 
    9680           37 :             if (de_type == PGFILETYPE_LNK)
    9681              :             {
    9682              :                 StringInfoData escapedpath;
    9683              :                 int         rllen;
    9684              : 
    9685           23 :                 rllen = readlink(fullpath, linkpath, sizeof(linkpath));
    9686           23 :                 if (rllen < 0)
    9687              :                 {
    9688            0 :                     ereport(WARNING,
    9689              :                             (errmsg("could not read symbolic link \"%s\": %m",
    9690              :                                     fullpath)));
    9691            0 :                     continue;
    9692              :                 }
    9693           23 :                 else if (rllen >= sizeof(linkpath))
    9694              :                 {
    9695            0 :                     ereport(WARNING,
    9696              :                             (errmsg("symbolic link \"%s\" target is too long",
    9697              :                                     fullpath)));
    9698            0 :                     continue;
    9699              :                 }
    9700           23 :                 linkpath[rllen] = '\0';
    9701              : 
    9702              :                 /*
    9703              :                  * Relpath holds the relative path of the tablespace directory
    9704              :                  * when it's located within PGDATA, or NULL if it's located
    9705              :                  * elsewhere.
    9706              :                  */
    9707           23 :                 if (rllen > datadirpathlen &&
    9708            1 :                     strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
    9709            0 :                     IS_DIR_SEP(linkpath[datadirpathlen]))
    9710            0 :                     relpath = pstrdup(linkpath + datadirpathlen + 1);
    9711              : 
    9712              :                 /*
    9713              :                  * Add a backslash-escaped version of the link path to the
    9714              :                  * tablespace map file.
    9715              :                  */
    9716           23 :                 initStringInfo(&escapedpath);
    9717          562 :                 for (s = linkpath; *s; s++)
    9718              :                 {
    9719          539 :                     if (*s == '\n' || *s == '\r' || *s == '\\')
    9720            0 :                         appendStringInfoChar(&escapedpath, '\\');
    9721          539 :                     appendStringInfoChar(&escapedpath, *s);
    9722              :                 }
    9723           23 :                 appendStringInfo(tblspcmapfile, "%s %s\n",
    9724           23 :                                  de->d_name, escapedpath.data);
    9725           23 :                 pfree(escapedpath.data);
    9726              :             }
    9727           14 :             else if (de_type == PGFILETYPE_DIR)
    9728              :             {
    9729              :                 /*
    9730              :                  * It's possible to use allow_in_place_tablespaces to create
    9731              :                  * directories directly under pg_tblspc, for testing purposes
    9732              :                  * only.
    9733              :                  *
    9734              :                  * In this case, we store a relative path rather than an
    9735              :                  * absolute path into the tablespaceinfo.
    9736              :                  */
    9737           14 :                 snprintf(linkpath, sizeof(linkpath), "%s/%s",
    9738           14 :                          PG_TBLSPC_DIR, de->d_name);
    9739           14 :                 relpath = pstrdup(linkpath);
    9740              :             }
    9741              :             else
    9742              :             {
    9743              :                 /* Skip any other file type that appears here. */
    9744            0 :                 continue;
    9745              :             }
    9746              : 
    9747           37 :             ti = palloc_object(tablespaceinfo);
    9748           37 :             ti->oid = tsoid;
    9749           37 :             ti->path = pstrdup(linkpath);
    9750           37 :             ti->rpath = relpath;
    9751           37 :             ti->size = -1;
    9752              : 
    9753           37 :             if (tablespaces)
    9754           37 :                 *tablespaces = lappend(*tablespaces, ti);
    9755              :         }
    9756          176 :         FreeDir(tblspcdir);
    9757              : 
    9758          176 :         state->starttime = (pg_time_t) time(NULL);
    9759              :     }
    9760          176 :     PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(true));
    9761              : 
    9762          176 :     state->started_in_recovery = backup_started_in_recovery;
    9763              : 
    9764              :     /*
    9765              :      * Mark that the start phase has correctly finished for the backup.
    9766              :      */
    9767          176 :     sessionBackupState = SESSION_BACKUP_RUNNING;
    9768          176 : }
    9769              : 
    9770              : /*
    9771              :  * Return the session-level backup state, i.e. the current sessionBackupState.
    9772              :  */
    9773              : SessionBackupState
    9774          198 : get_backup_status(void)
    9775              : {
    9776          198 :     return sessionBackupState;
    9777              : }
    9778              : 
    9779              : /*
    9780              :  * do_pg_backup_stop
    9781              :  *
    9782              :  * Utility function called at the end of an online backup.  It creates the
    9783              :  * backup history file (if required), resets sessionBackupState and so on.
    9784              :  * It can optionally wait for WAL segments to be archived.
    9785              :  *
    9786              :  * "state" is filled with the information necessary to restore from this
    9787              :  * backup with its stop LSN (stoppoint), its timeline ID (stoptli), etc.
    9788              :  *
    9789              :  * It is the responsibility of the caller of this function to verify the
    9790              :  * permissions of the calling user!
    9791              :  */
    9792              : void
    9793          170 : do_pg_backup_stop(BackupState *state, bool waitforarchive)
    9794              : {
    9795          170 :     bool        backup_stopped_in_recovery = false;
    9796              :     char        histfilepath[MAXPGPATH];
    9797              :     char        lastxlogfilename[MAXFNAMELEN];
    9798              :     char        histfilename[MAXFNAMELEN];
    9799              :     XLogSegNo   _logSegNo;
    9800              :     FILE       *fp;
    9801              :     int         seconds_before_warning;
    9802          170 :     int         waits = 0;
    9803          170 :     bool        reported_waiting = false;
    9804              : 
    9805              :     Assert(state != NULL);
    9806              : 
    9807          170 :     backup_stopped_in_recovery = RecoveryInProgress();
    9808              : 
    9809              :     /*
    9810              :      * During recovery, we don't need to check the WAL level, because if it
    9811              :      * is not sufficient, it's impossible to get here during recovery.
    9812              :      */
    9813          170 :     if (!backup_stopped_in_recovery && !XLogIsNeeded())
    9814            0 :         ereport(ERROR,
    9815              :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    9816              :                  errmsg("WAL level not sufficient for making an online backup"),
    9817              :                  errhint("\"wal_level\" must be set to \"replica\" or \"logical\" at server start.")));
    9818              : 
    9819              :     /*
    9820              :      * OK to update backup counter and session-level lock.
    9821              :      *
    9822              :      * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them,
    9823              :      * otherwise they can be updated inconsistently, which might cause
    9824              :      * do_pg_abort_backup() to fail.
    9825              :      */
    9826          170 :     WALInsertLockAcquireExclusive();
    9827              : 
    9828              :     /*
    9829              :      * It is expected that each do_pg_backup_start() call is matched by
    9830              :      * exactly one do_pg_backup_stop() call.
    9831              :      */
    9832              :     Assert(XLogCtl->Insert.runningBackups > 0);
    9833          170 :     XLogCtl->Insert.runningBackups--;
    9834              : 
    9835              :     /*
    9836              :      * Clean up session-level lock.
    9837              :      *
    9838              :      * You might think that WALInsertLockRelease() can be called before
    9839              :      * cleaning up session-level lock because session-level lock doesn't need
    9840              :      * to be protected with WAL insertion lock. But since
    9841              :      * CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
    9842              :      * cleaned up before it.
    9843              :      */
    9844          170 :     sessionBackupState = SESSION_BACKUP_NONE;
    9845              : 
    9846          170 :     WALInsertLockRelease();
    9847              : 
    9848              :     /*
    9849              :      * If we are taking an online backup from the standby, we confirm that the
    9850              :      * standby has not been promoted during the backup.
    9851              :      */
    9852          170 :     if (state->started_in_recovery && !backup_stopped_in_recovery)
    9853            0 :         ereport(ERROR,
    9854              :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    9855              :                  errmsg("the standby was promoted during online backup"),
    9856              :                  errhint("This means that the backup being taken is corrupt "
    9857              :                          "and should not be used. "
    9858              :                          "Try taking another online backup.")));
    9859              : 
    9860              :     /*
    9861              :      * During recovery, we don't write an end-of-backup record. We assume that
    9862              :      * pg_control was backed up last and its minimum recovery point can be
    9863              :      * available as the backup end location. Since we don't have an
    9864              :      * end-of-backup record, we use the pg_control value to check whether
    9865              :      * we've reached the end of backup when starting recovery from this
    9866              :      * backup. We have no way of checking if pg_control wasn't backed up last
    9867              :      * however.
    9868              :      *
    9869              :      * We don't force a switch to new WAL file but it is still possible to
    9870              :      * wait for all the required files to be archived if waitforarchive is
    9871              :      * true. This is okay if we use the backup to start a standby and fetch
    9872              :      * the missing WAL using streaming replication. But in the case of an
    9873              :      * archive recovery, a user should set waitforarchive to true and wait for
    9874              :      * them to be archived to ensure that all the required files are
    9875              :      * available.
    9876              :      *
    9877              :      * We return the current minimum recovery point as the backup end
    9878              :      * location. Note that it can be greater than the exact backup end
    9879              :      * location if the minimum recovery point is updated after the backup of
    9880              :      * pg_control. This is harmless for current uses.
    9881              :      *
    9882              :      * XXX currently a backup history file is for informational and debug
    9883              :      * purposes only. It's not essential for an online backup. Furthermore,
    9884              :      * even if it's created, it will not be archived during recovery because
    9885              :      * an archiver is not invoked. So it doesn't seem worthwhile to write a
    9886              :      * backup history file during recovery.
    9887              :      */
    9888          170 :     if (backup_stopped_in_recovery)
    9889              :     {
    9890              :         XLogRecPtr  recptr;
    9891              : 
    9892              :         /*
    9893              :          * Check to see if all WAL replayed during online backup contain
    9894              :          * full-page writes.
    9895              :          */
    9896            8 :         SpinLockAcquire(&XLogCtl->info_lck);
    9897            8 :         recptr = XLogCtl->lastFpwDisableRecPtr;
    9898            8 :         SpinLockRelease(&XLogCtl->info_lck);
    9899              : 
    9900            8 :         if (state->startpoint <= recptr)
    9901            0 :             ereport(ERROR,
    9902              :                     (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    9903              :                      errmsg("WAL generated with \"full_page_writes=off\" was replayed "
    9904              :                             "during online backup"),
    9905              :                      errhint("This means that the backup being taken on the standby "
    9906              :                              "is corrupt and should not be used. "
    9907              :                              "Enable \"full_page_writes\" and run CHECKPOINT on the primary, "
    9908              :                              "and then try an online backup again.")));
    9909              : 
    9910              : 
    9911            8 :         LWLockAcquire(ControlFileLock, LW_SHARED);
    9912            8 :         state->stoppoint = ControlFile->minRecoveryPoint;
    9913            8 :         state->stoptli = ControlFile->minRecoveryPointTLI;
    9914            8 :         LWLockRelease(ControlFileLock);
    9915              :     }
    9916              :     else
    9917              :     {
    9918              :         char       *history_file;
    9919              : 
    9920              :         /*
    9921              :          * Write the backup-end xlog record
    9922              :          */
    9923          162 :         XLogBeginInsert();
    9924          162 :         XLogRegisterData(&state->startpoint,
    9925              :                          sizeof(state->startpoint));
    9926          162 :         state->stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
    9927              : 
    9928              :         /*
    9929              :          * Given that we're not in recovery, InsertTimeLineID is set and can't
    9930              :          * change, so we can read it without a lock.
    9931              :          */
    9932          162 :         state->stoptli = XLogCtl->InsertTimeLineID;
    9933              : 
    9934              :         /*
    9935              :          * Force a switch to a new xlog segment file, so that the backup is
    9936              :          * valid as soon as archiver moves out the current segment file.
    9937              :          */
    9938          162 :         RequestXLogSwitch(false);
    9939              : 
    9940          162 :         state->stoptime = (pg_time_t) time(NULL);
    9941              : 
    9942              :         /*
    9943              :          * Write the backup history file
    9944              :          */
    9945          162 :         XLByteToSeg(state->startpoint, _logSegNo, wal_segment_size);
    9946          162 :         BackupHistoryFilePath(histfilepath, state->stoptli, _logSegNo,
    9947              :                               state->startpoint, wal_segment_size);
    9948          162 :         fp = AllocateFile(histfilepath, "w");
    9949          162 :         if (!fp)
    9950            0 :             ereport(ERROR,
    9951              :                     (errcode_for_file_access(),
    9952              :                      errmsg("could not create file \"%s\": %m",
    9953              :                             histfilepath)));
    9954              : 
    9955              :         /* Build and save the contents of the backup history file */
    9956          162 :         history_file = build_backup_content(state, true);
    9957          162 :         fprintf(fp, "%s", history_file);
    9958          162 :         pfree(history_file);
    9959              : 
    9960          162 :         if (fflush(fp) || ferror(fp) || FreeFile(fp))
    9961            0 :             ereport(ERROR,
    9962              :                     (errcode_for_file_access(),
    9963              :                      errmsg("could not write file \"%s\": %m",
    9964              :                             histfilepath)));
    9965              : 
    9966              :         /*
    9967              :          * Clean out any no-longer-needed history files.  As a side effect,
    9968              :          * this will post a .ready file for the newly created history file,
    9969              :          * notifying the archiver that history file may be archived
    9970              :          * immediately.
    9971              :          */
    9972          162 :         CleanupBackupHistory();
    9973              :     }
    9974              : 
    9975              :     /*
    9976              :      * If archiving is enabled, wait for all the required WAL files to be
    9977              :      * archived before returning. If archiving isn't enabled, the required WAL
    9978              :      * needs to be transported via streaming replication (hopefully with
    9979              :      * wal_keep_size set high enough), or some more exotic mechanism like
    9980              :      * polling and copying files from pg_wal with a script. We have no knowledge
    9981              :      * of those mechanisms, so it's up to the user to ensure that he gets all
    9982              :      * the required WAL.
    9983              :      *
    9984              :      * We wait until both the last WAL file filled during backup and the
    9985              :      * history file have been archived, and assume that the alphabetic sorting
    9986              :      * property of the WAL files ensures any earlier WAL files are safely
    9987              :      * archived as well.
    9988              :      *
    9989              :      * We wait forever, since archive_command is supposed to work and we
    9990              :      * assume the admin wanted his backup to work completely. If you don't
    9991              :      * wish to wait, then either waitforarchive should be passed in as false,
    9992              :      * or you can set statement_timeout.  Also, some notices are issued to
    9993              :      * clue in anyone who might be doing this interactively.
    9994              :      */
    9995              : 
    9996          170 :     if (waitforarchive &&
    9997           11 :         ((!backup_stopped_in_recovery && XLogArchivingActive()) ||
    9998            1 :          (backup_stopped_in_recovery && XLogArchivingAlways())))
    9999              :     {
   10000            5 :         XLByteToPrevSeg(state->stoppoint, _logSegNo, wal_segment_size);
   10001            5 :         XLogFileName(lastxlogfilename, state->stoptli, _logSegNo,
   10002              :                      wal_segment_size);
   10003              : 
   10004            5 :         XLByteToSeg(state->startpoint, _logSegNo, wal_segment_size);
   10005            5 :         BackupHistoryFileName(histfilename, state->stoptli, _logSegNo,
   10006              :                               state->startpoint, wal_segment_size);
   10007              : 
   10008            5 :         seconds_before_warning = 60;
   10009            5 :         waits = 0;
   10010              : 
   10011           15 :         while (XLogArchiveIsBusy(lastxlogfilename) ||
   10012            5 :                XLogArchiveIsBusy(histfilename))
   10013              :         {
   10014            5 :             CHECK_FOR_INTERRUPTS();
   10015              : 
   10016            5 :             if (!reported_waiting && waits > 5)
   10017              :             {
   10018            0 :                 ereport(NOTICE,
   10019              :                         (errmsg("base backup done, waiting for required WAL segments to be archived")));
   10020            0 :                 reported_waiting = true;
   10021              :             }
   10022              : 
   10023            5 :             (void) WaitLatch(MyLatch,
   10024              :                              WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
   10025              :                              1000L,
   10026              :                              WAIT_EVENT_BACKUP_WAIT_WAL_ARCHIVE);
   10027            5 :             ResetLatch(MyLatch);
   10028              : 
   10029            5 :             if (++waits >= seconds_before_warning)
   10030              :             {
   10031            0 :                 seconds_before_warning *= 2;    /* This wraps in >10 years... */
   10032            0 :                 ereport(WARNING,
   10033              :                         (errmsg("still waiting for all required WAL segments to be archived (%d seconds elapsed)",
   10034              :                                 waits),
   10035              :                          errhint("Check that your \"archive_command\" is executing properly.  "
   10036              :                                  "You can safely cancel this backup, "
   10037              :                                  "but the database backup will not be usable without all the WAL segments.")));
   10038              :             }
   10039              :         }
   10040              : 
   10041            5 :         ereport(NOTICE,
   10042              :                 (errmsg("all required WAL segments have been archived")));
   10043              :     }
   10044          165 :     else if (waitforarchive)
   10045            6 :         ereport(NOTICE,
   10046              :                 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
   10047          170 : }
   10048              : 
   10049              : 
   10050              : /*
   10051              :  * do_pg_abort_backup: abort a running backup
   10052              :  *
   10053              :  * This does just the most basic steps of do_pg_backup_stop(): it decrements
   10054              :  * runningBackups and resets sessionBackupState, taking the system out of
   10055              :  * backup mode.  That makes it much safer to call from an error handler.
   10056              :  *
   10057              :  * 'arg' is a boolean Datum.  True means we are being called during backup
   10058              :  * setup: sessionBackupState has not been modified yet, but runningBackups
   10059              :  * has already been incremented.  False means we are invoked as a
   10060              :  * before_shmem_exit handler, and therefore we must not change state
   10061              :  * unless sessionBackupState indicates that a backup is actually running.
   10062              :  *
   10063              :  * NB: This gets used as a PG_ENSURE_ERROR_CLEANUP callback and
   10064              :  * before_shmem_exit handler, hence the odd-looking signature ('code' is unused).
   10065              :  */
   10066              : void
   10067            9 : do_pg_abort_backup(int code, Datum arg)
   10068              : {
   10069            9 :     bool        during_backup_start = DatumGetBool(arg);
   10070              : 
   10071              :     /* If called during backup start, there shouldn't be one already running */
   10072              :     Assert(!during_backup_start || sessionBackupState == SESSION_BACKUP_NONE);
   10073              : 
   10074            9 :     if (during_backup_start || sessionBackupState != SESSION_BACKUP_NONE)
   10075              :     {
   10076            6 :         WALInsertLockAcquireExclusive();
   10077              :         Assert(XLogCtl->Insert.runningBackups > 0);
   10078            6 :         XLogCtl->Insert.runningBackups--;
   10079              : 
   10080            6 :         sessionBackupState = SESSION_BACKUP_NONE;
   10081            6 :         WALInsertLockRelease();
   10082              : 
   10083            6 :         if (!during_backup_start)
   10084            6 :             ereport(WARNING,
   10085              :                     errmsg("aborting backup due to backend exiting before pg_backup_stop was called"));
   10086              :     }
   10087            9 : }
   10088              : 
   10089              : /*
   10090              :  * Register do_pg_abort_backup() as a before_shmem_exit handler, so that a
   10091              :  * backup left unterminated is reported; a static flag makes repeat calls no-ops.
   10092              :  */
   10093              : void
   10094            5 : register_persistent_abort_backup_handler(void)
   10095              : {
   10096              :     static bool already_done = false;
   10097              : 
   10098            5 :     if (already_done)
   10099            1 :         return;
   10100            4 :     before_shmem_exit(do_pg_abort_backup, BoolGetDatum(false));
   10101            4 :     already_done = true;
   10102              : }
   10103              : 
   10104              : /*
   10105              :  * Get latest WAL insert pointer: XLogBytePosToRecPtr() of CurrBytePos, read under insertpos_lck
   10106              :  */
   10107              : XLogRecPtr
   10108         2135 : GetXLogInsertRecPtr(void)
   10109              : {
   10110         2135 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
   10111              :     uint64      current_bytepos;
   10112              : 
   10113         2135 :     SpinLockAcquire(&Insert->insertpos_lck);
   10114         2135 :     current_bytepos = Insert->CurrBytePos;
   10115         2135 :     SpinLockRelease(&Insert->insertpos_lck);
   10116              : 
   10117         2135 :     return XLogBytePosToRecPtr(current_bytepos);
   10118              : }
   10119              : 
   10120              : /*
   10121              :  * Get latest WAL record end pointer: XLogBytePosToEndRecPtr() of CurrBytePos, read under insertpos_lck
   10122              :  */
   10123              : XLogRecPtr
   10124       117174 : GetXLogInsertEndRecPtr(void)
   10125              : {
   10126       117174 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
   10127              :     uint64      current_bytepos;
   10128              : 
   10129       117174 :     SpinLockAcquire(&Insert->insertpos_lck);
   10130       117174 :     current_bytepos = Insert->CurrBytePos;
   10131       117174 :     SpinLockRelease(&Insert->insertpos_lck);
   10132              : 
   10133       117174 :     return XLogBytePosToEndRecPtr(current_bytepos);
   10134              : }
   10135              : 
   10136              : /*
   10137              :  * Get latest WAL write pointer, after refreshing LogwrtResult via RefreshXLogWriteResult()
   10138              :  */
   10139              : XLogRecPtr
   10140        10554 : GetXLogWriteRecPtr(void)
   10141              : {
   10142        10554 :     RefreshXLogWriteResult(LogwrtResult);
   10143              : 
   10144        10554 :     return LogwrtResult.Write;
   10145              : }
   10146              : 
   10147              : /*
   10148              :  * Returns, in *oldrecptr and *oldtli, the redo pointer and timeline ID of the
   10149              :  * last checkpoint or restartpoint: the oldest WAL needed to restart recovery.
   10150              :  */
   10151              : void
   10152          391 : GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
   10153              : {
   10154          391 :     LWLockAcquire(ControlFileLock, LW_SHARED);
   10155          391 :     *oldrecptr = ControlFile->checkPointCopy.redo;
   10156          391 :     *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
   10157          391 :     LWLockRelease(ControlFileLock);
   10158          391 : }
   10159              : 
   10160              : /* Thin wrapper around ShutdownWalRcv() that then calls ResetInstallXLogFileSegmentActive(). */
   10161              : void
   10162         1068 : XLogShutdownWalRcv(void)
   10163              : {
   10164              :     Assert(AmStartupProcess() || !IsUnderPostmaster);
   10165              : 
   10166         1068 :     ShutdownWalRcv();
   10167         1068 :     ResetInstallXLogFileSegmentActive();
   10168         1068 : }
   10169              : 
   10170              : /* Enable WAL file recycling and preallocation (sets the flag under exclusive ControlFileLock). */
   10171              : void
   10172         1262 : SetInstallXLogFileSegmentActive(void)
   10173              : {
   10174         1262 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
   10175         1262 :     XLogCtl->InstallXLogFileSegmentActive = true;
   10176         1262 :     LWLockRelease(ControlFileLock);
   10177         1262 : }
   10178              : 
   10179              : /* Disable WAL file recycling and preallocation (clears the flag under exclusive ControlFileLock). */
   10180              : void
   10181         1222 : ResetInstallXLogFileSegmentActive(void)
   10182              : {
   10183         1222 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
   10184         1222 :     XLogCtl->InstallXLogFileSegmentActive = false;
   10185         1222 :     LWLockRelease(ControlFileLock);
   10186         1222 : }
   10187              : /* Return the current InstallXLogFileSegmentActive flag, read under shared ControlFileLock. */
   10188              : bool
   10189            0 : IsInstallXLogFileSegmentActive(void)
   10190              : {
   10191              :     bool        result;
   10192              : 
   10193            0 :     LWLockAcquire(ControlFileLock, LW_SHARED);
   10194            0 :     result = XLogCtl->InstallXLogFileSegmentActive;
   10195            0 :     LWLockRelease(ControlFileLock);
   10196              : 
   10197            0 :     return result;
   10198              : }
   10199              : 
   10200              : /*
   10201              :  * Set XLogCtl->WalWriterSleeping to 'sleeping', holding the info_lck spinlock.
   10202              :  */
   10203              : void
   10204          612 : SetWalWriterSleeping(bool sleeping)
   10205              : {
   10206          612 :     SpinLockAcquire(&XLogCtl->info_lck);
   10207          612 :     XLogCtl->WalWriterSleeping = sleeping;
   10208          612 :     SpinLockRelease(&XLogCtl->info_lck);
   10209          612 : }
        

Generated by: LCOV version 2.0-1