LCOV - code coverage report
Current view: top level - src/backend/access/transam - xlog.c (source / functions) Coverage Total Hit
Test: PostgreSQL 20devel Lines: 89.0 % 2676 2382
Test Date: 2026-07-03 19:57:34 Functions: 97.1 % 139 135
Legend: Lines:     hit not hit
Branches: + taken - not taken # not executed
Branches: 66.2 % 1557 1031

             Branch data     Line data    Source code
       1                 :             : /*-------------------------------------------------------------------------
       2                 :             :  *
       3                 :             :  * xlog.c
       4                 :             :  *      PostgreSQL write-ahead log manager
       5                 :             :  *
       6                 :             :  * The Write-Ahead Log (WAL) functionality is split into several source
       7                 :             :  * files, in addition to this one:
       8                 :             :  *
       9                 :             :  * xloginsert.c - Functions for constructing WAL records
      10                 :             :  * xlogrecovery.c - WAL recovery and standby code
      11                 :             :  * xlogreader.c - Facility for reading WAL files and parsing WAL records
      12                 :             :  * xlogutils.c - Helper functions for WAL redo routines
      13                 :             :  *
      14                 :             :  * This file contains functions for coordinating database startup and
      15                 :             :  * checkpointing, and managing the write-ahead log buffers when the
      16                 :             :  * system is running.
      17                 :             :  *
      18                 :             :  * StartupXLOG() is the main entry point of the startup process.  It
      19                 :             :  * coordinates database startup, performing WAL recovery, and the
      20                 :             :  * transition from WAL recovery into normal operations.
      21                 :             :  *
      22                 :             :  * XLogInsertRecord() inserts a WAL record into the WAL buffers.  Most
      23                 :             :  * callers should not call this directly, but use the functions in
      24                 :             :  * xloginsert.c to construct the WAL record.  XLogFlush() can be used
      25                 :             :  * to force the WAL to disk.
      26                 :             :  *
      27                 :             :  * In addition to those, there are many other functions for interrogating
      28                 :             :  * the current system state, and for starting/stopping backups.
      29                 :             :  *
      30                 :             :  *
      31                 :             :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      32                 :             :  * Portions Copyright (c) 1994, Regents of the University of California
      33                 :             :  *
      34                 :             :  * src/backend/access/transam/xlog.c
      35                 :             :  *
      36                 :             :  *-------------------------------------------------------------------------
      37                 :             :  */
      38                 :             : 
      39                 :             : #include "postgres.h"
      40                 :             : 
      41                 :             : #include <ctype.h>
      42                 :             : #include <math.h>
      43                 :             : #include <time.h>
      44                 :             : #include <fcntl.h>
      45                 :             : #include <sys/stat.h>
      46                 :             : #include <sys/time.h>
      47                 :             : #include <unistd.h>
      48                 :             : 
      49                 :             : #include "access/clog.h"
      50                 :             : #include "access/commit_ts.h"
      51                 :             : #include "access/heaptoast.h"
      52                 :             : #include "access/multixact.h"
      53                 :             : #include "access/rewriteheap.h"
      54                 :             : #include "access/subtrans.h"
      55                 :             : #include "access/timeline.h"
      56                 :             : #include "access/transam.h"
      57                 :             : #include "access/twophase.h"
      58                 :             : #include "access/xact.h"
      59                 :             : #include "access/xlog_internal.h"
      60                 :             : #include "access/xlogarchive.h"
      61                 :             : #include "access/xloginsert.h"
      62                 :             : #include "access/xlogreader.h"
      63                 :             : #include "access/xlogrecovery.h"
      64                 :             : #include "access/xlogutils.h"
      65                 :             : #include "access/xlogwait.h"
      66                 :             : #include "backup/basebackup.h"
      67                 :             : #include "catalog/catversion.h"
      68                 :             : #include "catalog/pg_control.h"
      69                 :             : #include "catalog/pg_database.h"
      70                 :             : #include "common/controldata_utils.h"
      71                 :             : #include "common/file_utils.h"
      72                 :             : #include "executor/instrument.h"
      73                 :             : #include "miscadmin.h"
      74                 :             : #include "pg_trace.h"
      75                 :             : #include "pgstat.h"
      76                 :             : #include "port/atomics.h"
      77                 :             : #include "postmaster/bgwriter.h"
      78                 :             : #include "postmaster/datachecksum_state.h"
      79                 :             : #include "postmaster/startup.h"
      80                 :             : #include "postmaster/walsummarizer.h"
      81                 :             : #include "postmaster/walwriter.h"
      82                 :             : #include "replication/origin.h"
      83                 :             : #include "replication/slot.h"
      84                 :             : #include "replication/slotsync.h"
      85                 :             : #include "replication/snapbuild.h"
      86                 :             : #include "replication/walreceiver.h"
      87                 :             : #include "replication/walsender.h"
      88                 :             : #include "storage/bufmgr.h"
      89                 :             : #include "storage/fd.h"
      90                 :             : #include "storage/ipc.h"
      91                 :             : #include "storage/large_object.h"
      92                 :             : #include "storage/latch.h"
      93                 :             : #include "storage/predicate.h"
      94                 :             : #include "storage/proc.h"
      95                 :             : #include "storage/procarray.h"
      96                 :             : #include "storage/procsignal.h"
      97                 :             : #include "storage/reinit.h"
      98                 :             : #include "storage/spin.h"
      99                 :             : #include "storage/subsystems.h"
     100                 :             : #include "storage/sync.h"
     101                 :             : #include "utils/guc_hooks.h"
     102                 :             : #include "utils/guc_tables.h"
     103                 :             : #include "utils/injection_point.h"
     104                 :             : #include "utils/pgstat_internal.h"
     105                 :             : #include "utils/ps_status.h"
     106                 :             : #include "utils/relmapper.h"
     107                 :             : #include "utils/snapmgr.h"
     108                 :             : #include "utils/timeout.h"
     109                 :             : #include "utils/timestamp.h"
     110                 :             : #include "utils/varlena.h"
     111                 :             : #include "utils/wait_event.h"
     112                 :             : 
     113                 :             : #ifdef WAL_DEBUG
     114                 :             : #include "utils/memutils.h"
     115                 :             : #endif
     116                 :             : 
     117                 :             : /* timeline ID to be used when bootstrapping */
     118                 :             : #define BootstrapTimeLineID     1
     119                 :             : 
     120                 :             : /* User-settable parameters */
     121                 :             : int         max_wal_size_mb = 1024; /* 1 GB */
     122                 :             : int         min_wal_size_mb = 80;   /* 80 MB */
     123                 :             : int         wal_keep_size_mb = 0;
     124                 :             : int         XLOGbuffers = -1;
     125                 :             : int         XLogArchiveTimeout = 0;
     126                 :             : int         XLogArchiveMode = ARCHIVE_MODE_OFF;
     127                 :             : char       *XLogArchiveCommand = NULL;
     128                 :             : bool        EnableHotStandby = false;
     129                 :             : bool        fullPageWrites = true;
     130                 :             : bool        wal_log_hints = false;
     131                 :             : int         wal_compression = WAL_COMPRESSION_NONE;
     132                 :             : char       *wal_consistency_checking_string = NULL;
     133                 :             : bool       *wal_consistency_checking = NULL;
     134                 :             : bool        wal_init_zero = true;
     135                 :             : bool        wal_recycle = true;
     136                 :             : bool        log_checkpoints = true;
     137                 :             : int         wal_sync_method = DEFAULT_WAL_SYNC_METHOD;
     138                 :             : int         wal_level = WAL_LEVEL_REPLICA;
     139                 :             : int         CommitDelay = 0;    /* precommit delay in microseconds */
     140                 :             : int         CommitSiblings = 5; /* # concurrent xacts needed to sleep */
     141                 :             : int         wal_retrieve_retry_interval = 5000;
     142                 :             : int         max_slot_wal_keep_size_mb = -1;
     143                 :             : int         wal_decode_buffer_size = 512 * 1024;
     144                 :             : bool        track_wal_io_timing = false;
     145                 :             : 
     146                 :             : #ifdef WAL_DEBUG
     147                 :             : bool        XLOG_DEBUG = false;
     148                 :             : #endif
     149                 :             : 
     150                 :             : int         wal_segment_size = DEFAULT_XLOG_SEG_SIZE;
     151                 :             : 
     152                 :             : /*
     153                 :             :  * Number of WAL insertion locks to use. A higher value allows more insertions
     154                 :             :  * to happen concurrently, but adds some CPU overhead to flushing the WAL,
     155                 :             :  * which needs to iterate all the locks.
     156                 :             :  */
     157                 :             : #define NUM_XLOGINSERT_LOCKS  8
     158                 :             : 
     159                 :             : /*
     160                 :             :  * Max distance from last checkpoint, before triggering a new xlog-based
     161                 :             :  * checkpoint.
     162                 :             :  */
     163                 :             : int         CheckPointSegments;
     164                 :             : 
     165                 :             : /* Estimated distance between checkpoints, in bytes */
     166                 :             : static double CheckPointDistanceEstimate = 0;
     167                 :             : static double PrevCheckPointDistance = 0;
     168                 :             : 
     169                 :             : /*
     170                 :             :  * Track whether there were any deferred checks for custom resource managers
     171                 :             :  * specified in wal_consistency_checking.
     172                 :             :  */
     173                 :             : static bool check_wal_consistency_checking_deferred = false;
     174                 :             : 
     175                 :             : /*
     176                 :             :  * GUC support
     177                 :             :  */
     178                 :             : const struct config_enum_entry wal_sync_method_options[] = {
     179                 :             :     {"fsync", WAL_SYNC_METHOD_FSYNC, false},
     180                 :             : #ifdef HAVE_FSYNC_WRITETHROUGH
     181                 :             :     {"fsync_writethrough", WAL_SYNC_METHOD_FSYNC_WRITETHROUGH, false},
     182                 :             : #endif
     183                 :             :     {"fdatasync", WAL_SYNC_METHOD_FDATASYNC, false},
     184                 :             : #ifdef O_SYNC
     185                 :             :     {"open_sync", WAL_SYNC_METHOD_OPEN, false},
     186                 :             : #endif
     187                 :             : #ifdef O_DSYNC
     188                 :             :     {"open_datasync", WAL_SYNC_METHOD_OPEN_DSYNC, false},
     189                 :             : #endif
     190                 :             :     {NULL, 0, false}
     191                 :             : };
     192                 :             : 
     193                 :             : 
     194                 :             : /*
     195                 :             :  * Although only "on", "off", and "always" are documented,
     196                 :             :  * we accept all the likely variants of "on" and "off".
     197                 :             :  */
     198                 :             : const struct config_enum_entry archive_mode_options[] = {
     199                 :             :     {"always", ARCHIVE_MODE_ALWAYS, false},
     200                 :             :     {"on", ARCHIVE_MODE_ON, false},
     201                 :             :     {"off", ARCHIVE_MODE_OFF, false},
     202                 :             :     {"true", ARCHIVE_MODE_ON, true},
     203                 :             :     {"false", ARCHIVE_MODE_OFF, true},
     204                 :             :     {"yes", ARCHIVE_MODE_ON, true},
     205                 :             :     {"no", ARCHIVE_MODE_OFF, true},
     206                 :             :     {"1", ARCHIVE_MODE_ON, true},
     207                 :             :     {"0", ARCHIVE_MODE_OFF, true},
     208                 :             :     {NULL, 0, false}
     209                 :             : };
     210                 :             : 
     211                 :             : /*
     212                 :             :  * Statistics for current checkpoint are collected in this global struct.
     213                 :             :  * Because only the checkpointer or a stand-alone backend can perform
     214                 :             :  * checkpoints, this will be unused in normal backends.
     215                 :             :  */
     216                 :             : CheckpointStatsData CheckpointStats;
     217                 :             : 
     218                 :             : /*
     219                 :             :  * During recovery, lastFullPageWrites keeps track of full_page_writes that
     220                 :             :  * the replayed WAL records indicate. It's initialized with full_page_writes
     221                 :             :  * that the recovery starting checkpoint record indicates, and then updated
     222                 :             :  * each time an XLOG_FPW_CHANGE record is replayed.
     223                 :             :  */
     224                 :             : static bool lastFullPageWrites;
     225                 :             : 
     226                 :             : /*
     227                 :             :  * Local copy of the state tracked by SharedRecoveryState in shared memory.
     228                 :             :  * It is false if SharedRecoveryState is RECOVERY_STATE_DONE.  True actually
     229                 :             :  * means "not known, need to check the shared state".
     230                 :             :  */
     231                 :             : static bool LocalRecoveryInProgress = true;
     232                 :             : 
     233                 :             : /*
     234                 :             :  * Local state for XLogInsertAllowed():
     235                 :             :  *      1: unconditionally allowed to insert XLOG
     236                 :             :  *      0: unconditionally not allowed to insert XLOG
     237                 :             :  *      -1: must check RecoveryInProgress(); disallow until it is false
     238                 :             :  * Most processes start with -1 and transition to 1 after seeing that recovery
     239                 :             :  * is not in progress.  But we can also force the value for special cases.
     240                 :             :  * The coding in XLogInsertAllowed() depends on the first two of these states
     241                 :             :  * being numerically the same as bool true and false.
     242                 :             :  */
     243                 :             : static int  LocalXLogInsertAllowed = -1;
     244                 :             : 
     245                 :             : /*
     246                 :             :  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
     247                 :             :  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
     248                 :             :  * end+1 of the last record, and is reset when we end a top-level transaction,
     249                 :             :  * or start a new one; so it can be used to tell if the current transaction has
     250                 :             :  * created any XLOG records.
     251                 :             :  *
     252                 :             :  * While in parallel mode, this may not be fully up to date.  When committing,
     253                 :             :  * a transaction can assume this covers all xlog records written either by the
     254                 :             :  * user backend or by any parallel worker which was present at any point during
     255                 :             :  * the transaction.  But when aborting, or when still in parallel mode, other
     256                 :             :  * parallel backends may have written WAL records at later LSNs than the value
     257                 :             :  * stored here.  The parallel leader advances its own copy, when necessary,
     258                 :             :  * in WaitForParallelWorkersToFinish.
     259                 :             :  */
     260                 :             : XLogRecPtr  ProcLastRecPtr = InvalidXLogRecPtr;
     261                 :             : XLogRecPtr  XactLastRecEnd = InvalidXLogRecPtr;
     262                 :             : XLogRecPtr  XactLastCommitEnd = InvalidXLogRecPtr;
     263                 :             : 
     264                 :             : /*
     265                 :             :  * RedoRecPtr is this backend's local copy of the REDO record pointer
     266                 :             :  * (which is almost but not quite the same as a pointer to the most recent
     267                 :             :  * CHECKPOINT record).  We update this from the shared-memory copy,
     268                 :             :  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
     269                 :             :  * hold an insertion lock).  See XLogInsertRecord for details.  We are also
     270                 :             :  * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
     271                 :             :  * see GetRedoRecPtr.
     272                 :             :  *
     273                 :             :  * NB: Code that uses this variable must be prepared not only for the
     274                 :             :  * possibility that it may be arbitrarily out of date, but also for the
     275                 :             :  * possibility that it might be set to InvalidXLogRecPtr. We used to
     276                 :             :  * initialize it as a side effect of the first call to RecoveryInProgress(),
     277                 :             :  * which meant that most code that might use it could assume that it had a
     278                 :             :  * real if perhaps stale value. That's no longer the case.
     279                 :             :  */
     280                 :             : static XLogRecPtr RedoRecPtr;
     281                 :             : 
     282                 :             : /*
     283                 :             :  * doPageWrites is this backend's local copy of (fullPageWrites ||
     284                 :             :  * runningBackups > 0).  It is used together with RedoRecPtr to decide whether
     285                 :             :  * a full-page image of a page needs to be taken.
     286                 :             :  *
     287                 :             :  * NB: Initially this is false, and there's no guarantee that it will be
     288                 :             :  * initialized to any other value before it is first used. Any code that
     289                 :             :  * makes use of it must recheck the value after obtaining a WALInsertLock,
     290                 :             :  * and respond appropriately if it turns out that the previous value wasn't
     291                 :             :  * accurate.
     292                 :             :  */
     293                 :             : static bool doPageWrites;
     294                 :             : 
     295                 :             : /*----------
     296                 :             :  * Shared-memory data structures for XLOG control
     297                 :             :  *
     298                 :             :  * LogwrtRqst indicates a byte position that we need to write and/or fsync
     299                 :             :  * the log up to (all records before that point must be written or fsynced).
     300                 :             :  * The positions already written/fsynced are maintained in logWriteResult
     301                 :             :  * and logFlushResult using atomic access.
     302                 :             :  * In addition to the shared variable, each backend has a private copy of
     303                 :             :  * both in LogwrtResult, which is updated when convenient.
     304                 :             :  *
     305                 :             :  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
     306                 :             :  * (protected by info_lck), but we don't need to cache any copies of it.
     307                 :             :  *
     308                 :             :  * info_lck is only held long enough to read/update the protected variables,
     309                 :             :  * so it's a plain spinlock.  The other locks are held longer (potentially
     310                 :             :  * over I/O operations), so we use LWLocks for them.  These locks are:
     311                 :             :  *
     312                 :             :  * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
     313                 :             :  * It is only held while initializing and changing the mapping.  If the
     314                 :             :  * contents of the buffer being replaced haven't been written yet, the mapping
     315                 :             :  * lock is released while the write is done, and reacquired afterwards.
     316                 :             :  *
     317                 :             :  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
     318                 :             :  * XLogFlush).
     319                 :             :  *
     320                 :             :  * ControlFileLock: must be held to read/update control file or create
     321                 :             :  * new log file.
     322                 :             :  *
     323                 :             :  *----------
     324                 :             :  */
     325                 :             : 
     326                 :             : typedef struct XLogwrtRqst
     327                 :             : {
     328                 :             :     XLogRecPtr  Write;          /* last byte + 1 to write out */
     329                 :             :     XLogRecPtr  Flush;          /* last byte + 1 to flush */
     330                 :             : } XLogwrtRqst;
     331                 :             : 
     332                 :             : typedef struct XLogwrtResult
     333                 :             : {
     334                 :             :     XLogRecPtr  Write;          /* last byte + 1 written out */
     335                 :             :     XLogRecPtr  Flush;          /* last byte + 1 flushed */
     336                 :             : } XLogwrtResult;
     337                 :             : 
     338                 :             : /*
     339                 :             :  * Inserting to WAL is protected by a small fixed number of WAL insertion
     340                 :             :  * locks. To insert to the WAL, you must hold one of the locks - it doesn't
     341                 :             :  * matter which one. To lock out other concurrent insertions, you must hold
     342                 :             :  * all of them. Each WAL insertion lock consists of a lightweight lock, plus an
     343                 :             :  * indicator of how far the insertion has progressed (insertingAt).
     344                 :             :  *
     345                 :             :  * The insertingAt values are read when a process wants to flush WAL from
     346                 :             :  * the in-memory buffers to disk, to check that all the insertions to the
     347                 :             :  * region the process is about to write out have finished. You could simply
     348                 :             :  * wait for all currently in-progress insertions to finish, but the
     349                 :             :  * insertingAt indicator allows you to ignore insertions to later in the WAL,
     350                 :             :  * so that you only wait for the insertions that are modifying the buffers
     351                 :             :  * you're about to write out.
     352                 :             :  *
     353                 :             :  * This isn't just an optimization. If all the WAL buffers are dirty, an
     354                 :             :  * inserter that's holding a WAL insert lock might need to evict an old WAL
     355                 :             :  * buffer, which requires flushing the WAL. If it's possible for an inserter
     356                 :             :  * to block on another inserter unnecessarily, deadlock can arise when two
     357                 :             :  * inserters holding a WAL insert lock wait for each other to finish their
     358                 :             :  * insertion.
     359                 :             :  *
     360                 :             :  * Small WAL records that don't cross a page boundary never update the value;
     361                 :             :  * the WAL record is just copied to the page and the lock is released. But
     362                 :             :  * to avoid the deadlock-scenario explained above, the indicator is always
     363                 :             :  * updated before sleeping while holding an insertion lock.
     364                 :             :  *
     365                 :             :  * lastImportantAt contains the LSN of the last important WAL record inserted
     366                 :             :  * using a given lock. This value is used to detect if there has been
     367                 :             :  * important WAL activity since the last time some action, like a checkpoint,
     368                 :             :  * was performed, so the action can be skipped if there was none. The LSN is
     369                 :             :  * updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
     370                 :             :  * set. lastImportantAt is never cleared, only overwritten by the LSN of newer
     371                 :             :  * records.  Tracking the WAL activity directly in WALInsertLock has the
     372                 :             :  * advantage of not needing any additional locks to update the value.
     373                 :             :  */
     374                 :             : typedef struct
     375                 :             : {
     376                 :             :     LWLock      lock;
     377                 :             :     pg_atomic_uint64 insertingAt;
     378                 :             :     XLogRecPtr  lastImportantAt;
     379                 :             : } WALInsertLock;
     380                 :             : 
     381                 :             : /*
     382                 :             :  * All the WAL insertion locks are allocated as an array in shared memory. We
     383                 :             :  * force the array stride to be a power of 2, which saves a few cycles in
     384                 :             :  * indexing, but more importantly also ensures that individual slots don't
     385                 :             :  * cross cache line boundaries. (Of course, we have to also ensure that the
     386                 :             :  * array start address is suitably aligned.)
     387                 :             :  */
     388                 :             : typedef union WALInsertLockPadded
     389                 :             : {
     390                 :             :     WALInsertLock l;
     391                 :             :     char        pad[PG_CACHE_LINE_SIZE];
     392                 :             : } WALInsertLockPadded;
     393                 :             : 
     394                 :             : /*
     395                 :             :  * Session status of running backup, used for sanity checks in SQL-callable
     396                 :             :  * functions to start and stop backups.
     397                 :             :  */
     398                 :             : static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
     399                 :             : 
     400                 :             : /*
     401                 :             :  * Shared state data for WAL insertion.
     402                 :             :  */
     403                 :             : typedef struct XLogCtlInsert
     404                 :             : {
     405                 :             :     slock_t     insertpos_lck;  /* protects CurrBytePos and PrevBytePos */
     406                 :             : 
     407                 :             :     /*
     408                 :             :      * CurrBytePos is the end of reserved WAL. The next record will be
     409                 :             :      * inserted at that position. PrevBytePos is the start position of the
     410                 :             :      * previously inserted (or rather, reserved) record - it is copied to the
     411                 :             :      * prev-link of the next record. These are stored as "usable byte
     412                 :             :      * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
     413                 :             :      */
     414                 :             :     uint64      CurrBytePos;
     415                 :             :     uint64      PrevBytePos;
     416                 :             : 
     417                 :             :     /*
     418                 :             :      * Make sure the above heavily-contended spinlock and byte positions are
     419                 :             :      * on their own cache line. In particular, the RedoRecPtr and full page
     420                 :             :      * write variables below should be on a different cache line. They are
     421                 :             :      * read on every WAL insertion, but updated rarely, and we don't want
     422                 :             :      * those reads to steal the cache line containing Curr/PrevBytePos.
     423                 :             :      */
     424                 :             :     char        pad[PG_CACHE_LINE_SIZE];
     425                 :             : 
     426                 :             :     /*
     427                 :             :      * fullPageWrites is the authoritative value used by all backends to
     428                 :             :      * determine whether to write full-page images to WAL. This shared value,
     429                 :             :      * instead of the process-local fullPageWrites, is required because, when
     430                 :             :      * full_page_writes is changed by SIGHUP, we must WAL-log it before it
     431                 :             :      * actually affects WAL-logging by backends.  The checkpointer sets it at
     432                 :             :      * startup or after SIGHUP.
     433                 :             :      *
     434                 :             :      * To read RedoRecPtr or fullPageWrites, you must hold an insertion lock.
     435                 :             :      * To modify them, you must hold ALL the locks.
     436                 :             :      */
     437                 :             :     XLogRecPtr  RedoRecPtr;     /* current redo point for insertions */
     438                 :             :     bool        fullPageWrites;
     439                 :             : 
     440                 :             :     /*
     441                 :             :      * runningBackups is a counter indicating the number of backups currently
     442                 :             :      * in progress. lastBackupStart is the latest checkpoint redo location
     443                 :             :      * used as a starting point for an online backup.
     444                 :             :      */
     445                 :             :     int         runningBackups;
     446                 :             :     XLogRecPtr  lastBackupStart;
     447                 :             : 
     448                 :             :     /*
     449                 :             :      * WAL insertion locks (array of padded slots, see WALInsertLockPadded).
     450                 :             :      */
     451                 :             :     WALInsertLockPadded *WALInsertLocks;
     452                 :             : } XLogCtlInsert;
     453                 :             : 
     454                 :             : /*
     455                 :             :  * Total shared-memory state for XLOG.
     456                 :             :  */
     457                 :             : typedef struct XLogCtlData
     458                 :             : {
     459                 :             :     XLogCtlInsert Insert;
     460                 :             : 
     461                 :             :     /* Protected by info_lck: */
     462                 :             :     XLogwrtRqst LogwrtRqst;
     463                 :             :     XLogRecPtr  RedoRecPtr;     /* a recent copy of Insert->RedoRecPtr */
     464                 :             :     XLogRecPtr  asyncXactLSN;   /* LSN of newest async commit/abort */
     465                 :             :     XLogRecPtr  replicationSlotMinLSN;  /* oldest LSN needed by any slot */
     466                 :             : 
     467                 :             :     XLogSegNo   lastRemovedSegNo;   /* latest removed/recycled XLOG segment */
     468                 :             : 
     469                 :             :     /* Fake LSN counter, for unlogged relations. */
     470                 :             :     pg_atomic_uint64 unloggedLSN;
     471                 :             : 
     472                 :             :     /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
     473                 :             :     pg_time_t   lastSegSwitchTime;
     474                 :             :     XLogRecPtr  lastSegSwitchLSN;
     475                 :             : 
     476                 :             :     /* These are accessed using atomics -- info_lck not needed */
     477                 :             :     pg_atomic_uint64 logInsertResult;   /* last byte + 1 inserted to buffers */
     478                 :             :     pg_atomic_uint64 logWriteResult;    /* last byte + 1 written out */
     479                 :             :     pg_atomic_uint64 logFlushResult;    /* last byte + 1 flushed */
     480                 :             : 
     481                 :             :     /*
     482                 :             :      * Latest initialized page in the cache (last byte position + 1).
     483                 :             :      *
     484                 :             :      * To change the identity of a buffer (and InitializedUpTo), you need to
     485                 :             :      * hold WALBufMappingLock.  To change the identity of a buffer that's
     486                 :             :      * still dirty, the old page needs to be written out first, and for that
     487                 :             :      * you need WALWriteLock, and you need to ensure that there are no
     488                 :             :      * in-progress insertions to the page by calling
     489                 :             :      * WaitXLogInsertionsToFinish().
     490                 :             :      */
     491                 :             :     XLogRecPtr  InitializedUpTo;
     492                 :             : 
     493                 :             :     /*
     494                 :             :      * These values do not change after startup, although the pointed-to pages
     495                 :             :      * and xlblocks values certainly do.  xlblocks values are protected by
     496                 :             :      * WALBufMappingLock.
     497                 :             :      */
     498                 :             :     char       *pages;          /* buffers for unwritten XLOG pages */
     499                 :             :     pg_atomic_uint64 *xlblocks; /* first-byte pointers + XLOG_BLCKSZ */
     500                 :             :     int         XLogCacheBlck;  /* highest allocated xlog buffer index */
     501                 :             : 
     502                 :             :     /*
     503                 :             :      * InsertTimeLineID is the timeline into which new WAL is being inserted
     504                 :             :      * and flushed. It is zero during recovery, and does not change once set.
     505                 :             :      *
     506                 :             :      * If we create a new timeline when the system was started up,
     507                 :             :      * PrevTimeLineID is the old timeline's ID that we forked off from.
     508                 :             :      * Otherwise it's equal to InsertTimeLineID.
     509                 :             :      *
     510                 :             :      * We set these fields while holding info_lck. Most code that reads these
     511                 :             :      * values knows that recovery is no longer in progress and so can safely
     512                 :             :      * read them without a lock, but code that could be run either during
     513                 :             :      * or after recovery can take info_lck while reading these values.
     514                 :             :      */
     515                 :             :     TimeLineID  InsertTimeLineID;
     516                 :             :     TimeLineID  PrevTimeLineID;
     517                 :             : 
     518                 :             :     /*
     519                 :             :      * SharedRecoveryState indicates if we're still in crash or archive
     520                 :             :      * recovery.  Protected by info_lck.
     521                 :             :      */
     522                 :             :     RecoveryState SharedRecoveryState;
     523                 :             : 
     524                 :             :     /*
     525                 :             :      * InstallXLogFileSegmentActive indicates whether the checkpointer should
     526                 :             :      * arrange for future segments by recycling and/or PreallocXlogFiles().
     527                 :             :      * Protected by ControlFileLock.  Only the startup process changes it.  If
     528                 :             :      * true, anyone can use InstallXLogFileSegment().  If false, the startup
     529                 :             :      * process owns the exclusive right to install segments, by reading from
     530                 :             :      * the archive and possibly replacing existing files.
     531                 :             :      */
     532                 :             :     bool        InstallXLogFileSegmentActive;
     533                 :             : 
     534                 :             :     /*
     535                 :             :      * WalWriterSleeping indicates whether the WAL writer is currently in
     536                 :             :      * low-power mode (and hence should be nudged if an async commit occurs).
     537                 :             :      * Protected by info_lck.
     538                 :             :      */
     539                 :             :     bool        WalWriterSleeping;
     540                 :             : 
     541                 :             :     /*
     542                 :             :      * During recovery, we keep a copy of the latest checkpoint record here.
     543                 :             :      * lastCheckPointRecPtr points to start of checkpoint record and
     544                 :             :      * lastCheckPointEndPtr points to end+1 of checkpoint record.  Used by the
     545                 :             :      * checkpointer when it wants to create a restartpoint.
     546                 :             :      *
     547                 :             :      * Protected by info_lck.
     548                 :             :      */
     549                 :             :     XLogRecPtr  lastCheckPointRecPtr;
     550                 :             :     XLogRecPtr  lastCheckPointEndPtr;
     551                 :             :     CheckPoint  lastCheckPoint;
     552                 :             : 
     553                 :             :     /*
     554                 :             :      * lastFpwDisableRecPtr points to the start of the last replayed
     555                 :             :      * XLOG_FPW_CHANGE record indicating that full_page_writes is disabled.
     556                 :             :      */
     557                 :             :     XLogRecPtr  lastFpwDisableRecPtr;
     558                 :             : 
     559                 :             :     /* last data_checksum_version we've seen */
     560                 :             :     uint32      data_checksum_version;
     561                 :             : 
     562                 :             :     slock_t     info_lck;       /* locks shared variables shown above */
     563                 :             : } XLogCtlData;
     564                 :             : 
     565                 :             : /*
     566                 :             :  * Classification of XLogInsertRecord operations.
     567                 :             :  */
     568                 :             : typedef enum
     569                 :             : {
     570                 :             :     WALINSERT_NORMAL,           /* any record not classified below */
     571                 :             :     WALINSERT_SPECIAL_SWITCH,   /* RM_XLOG_ID record with XLOG_SWITCH info */
     572                 :             :     WALINSERT_SPECIAL_CHECKPOINT    /* RM_XLOG_ID, XLOG_CHECKPOINT_REDO info */
     573                 :             : } WalInsertClass;
     574                 :             : 
     575                 :             : static XLogCtlData *XLogCtl = NULL; /* the XLogCtlData shared state */
     576                 :             : 
     577                 :             : /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
     578                 :             : static WALInsertLockPadded *WALInsertLocks = NULL;
     579                 :             : 
     580                 :             : /*
     581                 :             :  * We maintain an image of pg_control in shared memory.
     582                 :             :  */
     583                 :             : static ControlFileData *LocalControlFile = NULL;    /* NOTE(review): role vs. ControlFile not shown here -- confirm */
     584                 :             : static ControlFileData *ControlFile = NULL;
     585                 :             : 
     586                 :             : static void XLOGShmemRequest(void *arg);    /* used as .request_fn below */
     587                 :             : static void XLOGShmemInit(void *arg);   /* used as .init_fn below */
     588                 :             : static void XLOGShmemAttach(void *arg); /* used as .attach_fn below */
     589                 :             : 
     590                 :             : const ShmemCallbacks XLOGShmemCallbacks = { /* non-static callback table */
     591                 :             :     .request_fn = XLOGShmemRequest,
     592                 :             :     .init_fn = XLOGShmemInit,
     593                 :             :     .attach_fn = XLOGShmemAttach,
     594                 :             : };
     595                 :             : 
     596                 :             : /*
     597                 :             :  * Calculate the amount of space left on the page after 'endptr' (0 if it is
     598                 :             :  * exactly at a page boundary). Beware multiple evaluation of 'endptr'!
     599                 :             :  */
     600                 :             : #define INSERT_FREESPACE(endptr)    \
     601                 :             :     (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
     602                 :             : 
     603                 :             : /* Macro to advance to next buffer index, wrapping to 0 after the last. */
     604                 :             : #define NextBufIdx(idx)     \
     605                 :             :         (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
     606                 :             : 
     607                 :             : /*
     608                 :             :  * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
     609                 :             :  * would hold if it were in cache, the page containing 'recptr'.
     610                 :             :  */
     611                 :             : #define XLogRecPtrToBufIdx(recptr)  \
     612                 :             :     (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
     613                 :             : 
     614                 :             : /*
     615                 :             :  * The number of bytes in a WAL page usable for WAL data (short header).
     616                 :             :  */
     617                 :             : #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
     618                 :             : 
     619                 :             : /*
     620                 :             :  * Convert values of GUCs measured in megabytes to equivalent segment count
     621                 :             :  * (thin wrapper around XLogMBVarToSegs()).  Rounds down.
     622                 :             :  */
     623                 :             : #define ConvertToXSegs(x, segsize)  XLogMBVarToSegs((x), (segsize))
     624                 :             : 
     625                 :             : /* The number of bytes in a WAL segment usable for WAL data. */
     626                 :             : static int  UsableBytesInSegment;
     627                 :             : 
     628                 :             : /*
     629                 :             :  * Private, possibly out-of-date copy of shared LogwrtResult (cf.
     630                 :             :  * RefreshXLogWriteResult()).  See discussion above.
     631                 :             :  */
     632                 :             : static XLogwrtResult LogwrtResult = {0, 0};
     633                 :             : 
     634                 :             : /*
     635                 :             :  * Update local copy '_target' of shared XLogCtl->log{Write,Flush}Result
     636                 :             :  *
     637                 :             :  * It's critical that Flush always trails Write, so Flush is read first and
     638                 :             :  * Write second, separated by a read barrier.  See also XLogWrite.
     639                 :             :  */
     640                 :             : #define RefreshXLogWriteResult(_target) \
     641                 :             :     do { \
     642                 :             :         _target.Flush = pg_atomic_read_u64(&XLogCtl->logFlushResult); /* _target is not parenthesized: pass a plain lvalue */ \
     643                 :             :         pg_read_barrier(); \
     644                 :             :         _target.Write = pg_atomic_read_u64(&XLogCtl->logWriteResult); \
     645                 :             :     } while (0)
     646                 :             : 
     647                 :             : /*
     648                 :             :  * openLogFile is a kernel FD for an open log file segment, or -1 if none.
     649                 :             :  * openLogSegNo identifies the segment, and openLogTLI the corresponding TLI.
     650                 :             :  * These variables are only used to write the XLOG, and so will normally refer
     651                 :             :  * to the active segment.
     652                 :             :  *
     653                 :             :  * Note: call Reserve/ReleaseExternalFD to track consumption of this FD.
     654                 :             :  */
     655                 :             : static int  openLogFile = -1;
     656                 :             : static XLogSegNo openLogSegNo = 0;
     657                 :             : static TimeLineID openLogTLI = 0;
     658                 :             : 
     659                 :             : /*
     660                 :             :  * Process-local copies of the equivalent control file fields.  When running
     661                 :             :  * crash recovery, LocalMinRecoveryPoint is set to InvalidXLogRecPtr as we
     662                 :             :  * expect to replay all the WAL available, and updateMinRecoveryPoint is
     663                 :             :  * switched to false to prevent any updates while replaying records.
     664                 :             :  * Those values are kept consistent as long as crash recovery runs.
     665                 :             :  */
     666                 :             : static XLogRecPtr LocalMinRecoveryPoint;
     667                 :             : static TimeLineID LocalMinRecoveryPointTLI;
     668                 :             : static bool updateMinRecoveryPoint = true;
     669                 :             : 
     670                 :             : /*
     671                 :             :  * Local state for ControlFile data_checksum_version.  After initialization
     672                 :             :  * this is only updated when absorbing a procsignal barrier during interrupt
     673                 :             :  * processing.  The reason for keeping a copy in backend-private memory is to
     674                 :             :  * avoid locking for interrogating the data checksum state.  Possible values
     675                 :             :  * are the data checksum versions defined in storage/checksum.h.
     676                 :             :  */
     677                 :             : static ChecksumStateType LocalDataChecksumState = 0;
     678                 :             : 
     679                 :             : /*
     680                 :             :  * Variable backing the GUC, keep it in sync with LocalDataChecksumState.
     681                 :             :  * See SetLocalDataChecksumState().
     682                 :             :  */
     683                 :             : int         data_checksums = 0; /* non-static, unlike the state above */
     684                 :             : 
     685                 :             : /* Process-local state for the WALInsertLockAcquire/Release functions */
     686                 :             : static int  MyLockNo = 0;
     687                 :             : static bool holdingAllLocks = false;
     688                 :             : 
     689                 :             : #ifdef WAL_DEBUG
     690                 :             : static MemoryContext walDebugCxt = NULL;    /* only built with WAL_DEBUG */
     691                 :             : #endif
     692                 :             : /* Prototypes for file-local (static) functions. */
     693                 :             : static void CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI,
     694                 :             :                                         XLogRecPtr EndOfLog,
     695                 :             :                                         TimeLineID newTLI);
     696                 :             : static void CheckRequiredParameterValues(void);
     697                 :             : static void XLogReportParameters(void);
     698                 :             : static int  LocalSetXLogInsertAllowed(void);
     699                 :             : static void CreateEndOfRecoveryRecord(void);
     700                 :             : static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn,
     701                 :             :                                                   XLogRecPtr pagePtr,
     702                 :             :                                                   TimeLineID newTLI);
     703                 :             : static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
     704                 :             : static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
     705                 :             : 
     706                 :             : static void AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli,
     707                 :             :                                   bool opportunistic);
     708                 :             : static void XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible);
     709                 :             : static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
     710                 :             :                                    bool find_free, XLogSegNo max_segno,
     711                 :             :                                    TimeLineID tli);
     712                 :             : static void XLogFileClose(void);
     713                 :             : static void PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli);
     714                 :             : static void RemoveTempXlogFiles(void);
     715                 :             : static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr,
     716                 :             :                                XLogRecPtr endptr, TimeLineID insertTLI);
     717                 :             : static void RemoveXlogFile(const struct dirent *segment_de,
     718                 :             :                            XLogSegNo recycleSegNo, XLogSegNo *endlogSegNo,
     719                 :             :                            TimeLineID insertTLI);
     720                 :             : static void UpdateLastRemovedPtr(char *filename);
     721                 :             : static void ValidateXLOGDirectoryStructure(void);
     722                 :             : static void CleanupBackupHistory(void);
     723                 :             : static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
     724                 :             : static bool PerformRecoveryXLogAction(void);
     725                 :             : static void InitControlFile(uint64 sysidentifier, uint32 data_checksum_version);
     726                 :             : static void WriteControlFile(void);
     727                 :             : static void ReadControlFile(void);
     728                 :             : static void UpdateControlFile(void);
     729                 :             : static char *str_time(pg_time_t tnow, char *buf, size_t bufsize);
     730                 :             : 
     731                 :             : static int  get_sync_bit(int method);
     732                 :             : 
     733                 :             : static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
     734                 :             :                                 XLogRecData *rdata,
     735                 :             :                                 XLogRecPtr StartPos, XLogRecPtr EndPos,
     736                 :             :                                 TimeLineID tli);
     737                 :             : static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
     738                 :             :                                       XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
     739                 :             : static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
     740                 :             :                               XLogRecPtr *PrevPtr);
     741                 :             : static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
     742                 :             : static char *GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli);
     743                 :             : static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
     744                 :             : static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
     745                 :             : static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
     746                 :             : 
     747                 :             : static void WALInsertLockAcquire(void);
     748                 :             : static void WALInsertLockAcquireExclusive(void);
     749                 :             : static void WALInsertLockRelease(void);
     750                 :             : static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
     751                 :             : 
     752                 :             : static void XLogChecksums(uint32 new_type);
     753                 :             : 
     754                 :             : /*
     755                 :             :  * Insert an XLOG record represented by an already-constructed chain of data
     756                 :             :  * chunks.  This is a low-level routine; to construct the WAL record header
     757                 :             :  * and data, use the higher-level routines in xloginsert.c.
     758                 :             :  *
     759                 :             :  * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
     760                 :             :  * WAL record applies to, that were not included in the record as full page
     761                 :             :  * images.  If fpw_lsn <= RedoRecPtr, the function does not perform the
     762                 :             :  * insertion and returns InvalidXLogRecPtr.  The caller can then recalculate
     763                 :             :  * which pages need a full-page image, and retry.  If fpw_lsn is invalid, the
     764                 :             :  * record is always inserted.
     765                 :             :  *
     766                 :             :  * 'flags' gives more in-depth control on the record being inserted. See
     767                 :             :  * XLogSetRecordFlags() for details.
     768                 :             :  *
     769                 :             :  * 'topxid_included' tells whether the top-transaction id is logged along with
     770                 :             :  * current subtransaction. See XLogRecordAssemble().
     771                 :             :  *
     772                 :             :  * The first XLogRecData in the chain must be for the record header, and its
     773                 :             :  * data must be MAXALIGNed.  XLogInsertRecord fills in the xl_prev and
     774                 :             :  * xl_crc fields in the header, the rest of the header must already be filled
     775                 :             :  * by the caller.
     776                 :             :  *
     777                 :             :  * Returns XLOG pointer to end of record (beginning of next record).
     778                 :             :  * This can be used as LSN for data pages affected by the logged action.
     779                 :             :  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
     780                 :             :  * before the data page can be written out.  This implements the basic
     781                 :             :  * WAL rule "write the log before the data".)
     782                 :             :  */
     783                 :             : XLogRecPtr
     784                 :    24793029 : XLogInsertRecord(XLogRecData *rdata,
     785                 :             :                  XLogRecPtr fpw_lsn,
     786                 :             :                  uint8 flags,
     787                 :             :                  int num_fpi,
     788                 :             :                  uint64 fpi_bytes,
     789                 :             :                  bool topxid_included)
     790                 :             : {
     791                 :    24793029 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
     792                 :             :     pg_crc32c   rdata_crc;
     793                 :             :     bool        inserted;
     794                 :    24793029 :     XLogRecord *rechdr = (XLogRecord *) rdata->data;
     795                 :    24793029 :     uint8       info = rechdr->xl_info & ~XLR_INFO_MASK;
     796                 :    24793029 :     WalInsertClass class = WALINSERT_NORMAL;
     797                 :             :     XLogRecPtr  StartPos;
     798                 :             :     XLogRecPtr  EndPos;
     799                 :    24793029 :     bool        prevDoPageWrites = doPageWrites;
     800                 :             :     TimeLineID  insertTLI;
     801                 :             : 
     802                 :             :     /* Does this record type require special handling? */
     803         [ +  + ]:    24793029 :     if (unlikely(rechdr->xl_rmid == RM_XLOG_ID))
     804                 :             :     {
     805         [ +  + ]:      323369 :         if (info == XLOG_SWITCH)
     806                 :         818 :             class = WALINSERT_SPECIAL_SWITCH;
     807         [ +  + ]:      322551 :         else if (info == XLOG_CHECKPOINT_REDO)
     808                 :         999 :             class = WALINSERT_SPECIAL_CHECKPOINT;
     809                 :             :     }
     810                 :             : 
     811                 :             :     /* we assume that all of the record header is in the first chunk */
     812                 :             :     Assert(rdata->len >= SizeOfXLogRecord);
     813                 :             : 
     814                 :             :     /* cross-check on whether we should be here or not */
     815         [ -  + ]:    24793029 :     if (!XLogInsertAllowed())
     816         [ #  # ]:           0 :         elog(ERROR, "cannot make new WAL entries during recovery");
     817                 :             : 
     818                 :             :     /*
     819                 :             :      * Given that we're not in recovery, InsertTimeLineID is set and can't
     820                 :             :      * change, so we can read it without a lock.
     821                 :             :      */
     822                 :    24793029 :     insertTLI = XLogCtl->InsertTimeLineID;
     823                 :             : 
     824                 :             :     /*----------
     825                 :             :      *
     826                 :             :      * We have now done all the preparatory work we can without holding a
     827                 :             :      * lock or modifying shared state. From here on, inserting the new WAL
     828                 :             :      * record to the shared WAL buffer cache is a two-step process:
     829                 :             :      *
     830                 :             :      * 1. Reserve the right amount of space from the WAL. The current head of
     831                 :             :      *    reserved space is kept in Insert->CurrBytePos, and is protected by
     832                 :             :      *    insertpos_lck.
     833                 :             :      *
     834                 :             :      * 2. Copy the record to the reserved WAL space. This involves finding the
     835                 :             :      *    correct WAL buffer containing the reserved space, and copying the
     836                 :             :      *    record in place. This can be done concurrently in multiple processes.
     837                 :             :      *
     838                 :             :      * To keep track of which insertions are still in-progress, each concurrent
     839                 :             :      * inserter acquires an insertion lock. In addition to just indicating that
     840                 :             :      * an insertion is in progress, the lock tells others how far the inserter
     841                 :             :      * has progressed. There is a small fixed number of insertion locks,
     842                 :             :      * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
     843                 :             :      * boundary, it updates the value stored in the lock to how far it has
     844                 :             :      * inserted, to allow the previous buffer to be flushed.
     845                 :             :      *
     846                 :             :      * Holding onto an insertion lock also protects RedoRecPtr and
     847                 :             :      * fullPageWrites from changing until the insertion is finished.
     848                 :             :      *
     849                 :             :      * Step 2 can usually be done completely in parallel. If the required WAL
     850                 :             :      * page is not initialized yet, you have to grab WALBufMappingLock to
     851                 :             :      * initialize it, but the WAL writer tries to do that ahead of insertions
     852                 :             :      * to keep that work out of the critical path.
     853                 :             :      *
     854                 :             :      *----------
     855                 :             :      */
     856                 :    24793029 :     START_CRIT_SECTION();
     857                 :             : 
     858         [ +  + ]:    24793029 :     if (likely(class == WALINSERT_NORMAL))
     859                 :             :     {
     860                 :    24791212 :         WALInsertLockAcquire();
     861                 :             : 
     862                 :             :         /*
     863                 :             :          * Check to see if my copy of RedoRecPtr is out of date. If so, may
     864                 :             :          * have to go back and have the caller recompute everything. This can
     865                 :             :          * only happen just after a checkpoint, so it's better to be slow in
     866                 :             :          * this case and fast otherwise.
     867                 :             :          *
     868                 :             :          * Also check to see if fullPageWrites was just turned on or there's a
     869                 :             :          * running backup (which forces full-page writes); if we weren't
     870                 :             :          * already doing full-page writes then go back and recompute.
     871                 :             :          *
     872                 :             :          * If we aren't doing full-page writes then RedoRecPtr doesn't
     873                 :             :          * actually affect the contents of the XLOG record, so we'll update
     874                 :             :          * our local copy but not force a recomputation.  (If doPageWrites was
     875                 :             :          * just turned off, we could recompute the record without full pages,
     876                 :             :          * but we choose not to bother.)
     877                 :             :          */
     878         [ +  + ]:    24791212 :         if (RedoRecPtr != Insert->RedoRecPtr)
     879                 :             :         {
     880                 :             :             Assert(RedoRecPtr < Insert->RedoRecPtr);
     881                 :        8128 :             RedoRecPtr = Insert->RedoRecPtr;
     882                 :             :         }
     883   [ +  +  +  + ]:    24791212 :         doPageWrites = (Insert->fullPageWrites || Insert->runningBackups > 0);
     884                 :             : 
     885         [ +  + ]:    24791212 :         if (doPageWrites &&
     886   [ +  +  +  + ]:    22478695 :             (!prevDoPageWrites ||
     887         [ +  + ]:    21030052 :              (XLogRecPtrIsValid(fpw_lsn) && fpw_lsn <= RedoRecPtr)))
     888                 :             :         {
     889                 :             :             /*
     890                 :             :              * Oops, some buffer now needs to be backed up that the caller
     891                 :             :              * didn't back up.  Start over.
     892                 :             :              */
     893                 :        8800 :             WALInsertLockRelease();
     894                 :        8800 :             END_CRIT_SECTION();
     895                 :        8800 :             return InvalidXLogRecPtr;
     896                 :             :         }
     897                 :             : 
     898                 :             :         /*
     899                 :             :          * Reserve space for the record in the WAL. This also sets the xl_prev
     900                 :             :          * pointer.
     901                 :             :          */
     902                 :    24782412 :         ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
     903                 :             :                                   &rechdr->xl_prev);
     904                 :             : 
     905                 :             :         /* Normal records are always inserted. */
     906                 :    24782412 :         inserted = true;
     907                 :             :     }
     908         [ +  + ]:        1817 :     else if (class == WALINSERT_SPECIAL_SWITCH)
     909                 :             :     {
     910                 :             :         /*
     911                 :             :          * In order to insert an XLOG_SWITCH record, we need to hold all of
     912                 :             :          * the WAL insertion locks, not just one, so that no one else can
     913                 :             :          * begin inserting a record until we've figured out how much space
     914                 :             :          * remains in the current WAL segment and claimed all of it.
     915                 :             :          *
     916                 :             :          * Nonetheless, this case is simpler than the normal cases handled
     917                 :             :          * above, which must check for changes in doPageWrites and RedoRecPtr.
     918                 :             :          * Those checks are only needed for records that can contain buffer
     919                 :             :          * references, and an XLOG_SWITCH record never does.
     920                 :             :          */
     921                 :             :         Assert(!XLogRecPtrIsValid(fpw_lsn));
     922                 :         818 :         WALInsertLockAcquireExclusive();
     923                 :         818 :         inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
     924                 :             :     }
     925                 :             :     else
     926                 :             :     {
     927                 :             :         Assert(class == WALINSERT_SPECIAL_CHECKPOINT);
     928                 :             : 
     929                 :             :         /*
     930                 :             :          * We need to update both the local and shared copies of RedoRecPtr,
     931                 :             :          * which means that we need to hold all the WAL insertion locks.
     932                 :             :          * However, there can't be any buffer references, so as above, we need
     933                 :             :          * not check RedoRecPtr before inserting the record; we just need to
     934                 :             :          * update it afterwards.
     935                 :             :          */
     936                 :             :         Assert(!XLogRecPtrIsValid(fpw_lsn));
     937                 :         999 :         WALInsertLockAcquireExclusive();
     938                 :         999 :         ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
     939                 :             :                                   &rechdr->xl_prev);
     940                 :         999 :         RedoRecPtr = Insert->RedoRecPtr = StartPos;
     941                 :         999 :         inserted = true;
     942                 :             :     }
     943                 :             : 
     944         [ +  + ]:    24784229 :     if (inserted)
     945                 :             :     {
     946                 :             :         /*
     947                 :             :          * Now that xl_prev has been filled in, calculate CRC of the record
     948                 :             :          * header.
     949                 :             :          */
     950                 :    24784170 :         rdata_crc = rechdr->xl_crc;
     951                 :    24784170 :         COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
     952                 :    24784170 :         FIN_CRC32C(rdata_crc);
     953                 :    24784170 :         rechdr->xl_crc = rdata_crc;
     954                 :             : 
     955                 :             :         /*
     956                 :             :          * All the record data, including the header, is now ready to be
     957                 :             :          * inserted. Copy the record in the space reserved.
     958                 :             :          */
     959                 :    24784170 :         CopyXLogRecordToWAL(rechdr->xl_tot_len,
     960                 :             :                             class == WALINSERT_SPECIAL_SWITCH, rdata,
     961                 :             :                             StartPos, EndPos, insertTLI);
     962                 :             : 
     963                 :             :         /*
     964                 :             :          * Unless record is flagged as not important, update LSN of last
     965                 :             :          * important record in the current slot. When holding all locks, just
     966                 :             :          * update the first one.
     967                 :             :          */
     968         [ +  + ]:    24784170 :         if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
     969                 :             :         {
     970         [ +  + ]:    24623637 :             int         lockno = holdingAllLocks ? 0 : MyLockNo;
     971                 :             : 
     972                 :    24623637 :             WALInsertLocks[lockno].l.lastImportantAt = StartPos;
     973                 :             :         }
     974                 :             :     }
     975                 :             :     else
     976                 :             :     {
     977                 :             :         /*
     978                 :             :          * This was an xlog-switch record, but the current insert location was
     979                 :             :          * already exactly at the beginning of a segment, so there was no need
     980                 :             :          * to do anything.
     981                 :             :          */
     982                 :             :     }
     983                 :             : 
     984                 :             :     /*
     985                 :             :      * Done! Let others know that we're finished.
     986                 :             :      */
     987                 :    24784229 :     WALInsertLockRelease();
     988                 :             : 
     989                 :    24784229 :     END_CRIT_SECTION();
     990                 :             : 
     991                 :    24784229 :     MarkCurrentTransactionIdLoggedIfAny();
     992                 :             : 
     993                 :             :     /*
     994                 :             :      * Mark the top transaction id as logged (if needed) so that we do not try
     995                 :             :      * to log it again with the next WAL record in the current subtransaction.
     996                 :             :      */
     997         [ +  + ]:    24784229 :     if (topxid_included)
     998                 :         223 :         MarkSubxactTopXidLogged();
     999                 :             : 
    1000                 :             :     /*
    1001                 :             :      * Update shared LogwrtRqst.Write, if we crossed page boundary.
    1002                 :             :      */
    1003         [ +  + ]:    24784229 :     if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
    1004                 :             :     {
    1005                 :     1897757 :         SpinLockAcquire(&XLogCtl->info_lck);
    1006                 :             :         /* advance global request to include new block(s) */
    1007         [ +  + ]:     1897757 :         if (XLogCtl->LogwrtRqst.Write < EndPos)
    1008                 :     1827779 :             XLogCtl->LogwrtRqst.Write = EndPos;
    1009                 :     1897757 :         SpinLockRelease(&XLogCtl->info_lck);
    1010                 :     1897757 :         RefreshXLogWriteResult(LogwrtResult);
    1011                 :             :     }
    1012                 :             : 
    1013                 :             :     /*
    1014                 :             :      * If this was an XLOG_SWITCH record, flush the record and the empty
    1015                 :             :      * padding space that fills the rest of the segment, and perform
    1016                 :             :      * end-of-segment actions (eg, notifying archiver).
    1017                 :             :      */
    1018         [ +  + ]:    24784229 :     if (class == WALINSERT_SPECIAL_SWITCH)
    1019                 :             :     {
    1020                 :             :         TRACE_POSTGRESQL_WAL_SWITCH();
    1021                 :         818 :         XLogFlush(EndPos);
    1022                 :             : 
    1023                 :             :         /*
    1024                 :             :          * Even though we reserved the rest of the segment for us, which is
    1025                 :             :          * reflected in EndPos, we return a pointer to just the end of the
    1026                 :             :          * xlog-switch record.
    1027                 :             :          */
    1028         [ +  + ]:         818 :         if (inserted)
    1029                 :             :         {
    1030                 :         759 :             EndPos = StartPos + SizeOfXLogRecord;
    1031         [ +  + ]:         759 :             if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
    1032                 :             :             {
    1033                 :           2 :                 uint64      offset = XLogSegmentOffset(EndPos, wal_segment_size);
    1034                 :             : 
    1035         [ -  + ]:           2 :                 if (offset == EndPos % XLOG_BLCKSZ)
    1036                 :           0 :                     EndPos += SizeOfXLogLongPHD;
    1037                 :             :                 else
    1038                 :           2 :                     EndPos += SizeOfXLogShortPHD;
    1039                 :             :             }
    1040                 :             :         }
    1041                 :             :     }
    1042                 :             : 
    1043                 :             : #ifdef WAL_DEBUG
    1044                 :             :     if (XLOG_DEBUG)
    1045                 :             :     {
    1046                 :             :         static XLogReaderState *debug_reader = NULL;
    1047                 :             :         XLogRecord *record;
    1048                 :             :         DecodedXLogRecord *decoded;
    1049                 :             :         StringInfoData buf;
    1050                 :             :         StringInfoData recordBuf;
    1051                 :             :         char       *errormsg = NULL;
    1052                 :             :         MemoryContext oldCxt;
    1053                 :             : 
    1054                 :             :         oldCxt = MemoryContextSwitchTo(walDebugCxt);
    1055                 :             : 
    1056                 :             :         initStringInfo(&buf);
    1057                 :             :         appendStringInfo(&buf, "INSERT @ %X/%08X: ", LSN_FORMAT_ARGS(EndPos));
    1058                 :             : 
    1059                 :             :         /*
    1060                 :             :          * We have to piece together the WAL record data from the XLogRecData
    1061                 :             :          * entries, so that we can pass it to the rm_desc function as one
    1062                 :             :          * contiguous chunk.
    1063                 :             :          */
    1064                 :             :         initStringInfo(&recordBuf);
    1065                 :             :         for (; rdata != NULL; rdata = rdata->next)
    1066                 :             :             appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);
    1067                 :             : 
    1068                 :             :         /* We also need temporary space to decode the record. */
    1069                 :             :         record = (XLogRecord *) recordBuf.data;
    1070                 :             :         decoded = (DecodedXLogRecord *)
    1071                 :             :             palloc(DecodeXLogRecordRequiredSpace(record->xl_tot_len));
    1072                 :             : 
    1073                 :             :         if (!debug_reader)
    1074                 :             :             debug_reader = XLogReaderAllocate(wal_segment_size, NULL,
    1075                 :             :                                               XL_ROUTINE(.page_read = NULL,
    1076                 :             :                                                          .segment_open = NULL,
    1077                 :             :                                                          .segment_close = NULL),
    1078                 :             :                                               NULL);
    1079                 :             :         if (!debug_reader)
    1080                 :             :         {
    1081                 :             :             appendStringInfoString(&buf, "error decoding record: out of memory while allocating a WAL reading processor");
    1082                 :             :         }
    1083                 :             :         else if (!DecodeXLogRecord(debug_reader,
    1084                 :             :                                    decoded,
    1085                 :             :                                    record,
    1086                 :             :                                    EndPos,
    1087                 :             :                                    &errormsg))
    1088                 :             :         {
    1089                 :             :             appendStringInfo(&buf, "error decoding record: %s",
    1090                 :             :                              errormsg ? errormsg : "no error message");
    1091                 :             :         }
    1092                 :             :         else
    1093                 :             :         {
    1094                 :             :             appendStringInfoString(&buf, " - ");
    1095                 :             : 
    1096                 :             :             debug_reader->record = decoded;
    1097                 :             :             xlog_outdesc(&buf, debug_reader);
    1098                 :             :             debug_reader->record = NULL;
    1099                 :             :         }
    1100                 :             :         elog(LOG, "%s", buf.data);
    1101                 :             : 
    1102                 :             :         pfree(decoded);
    1103                 :             :         pfree(buf.data);
    1104                 :             :         pfree(recordBuf.data);
    1105                 :             :         MemoryContextSwitchTo(oldCxt);
    1106                 :             :     }
    1107                 :             : #endif
    1108                 :             : 
    1109                 :             :     /*
    1110                 :             :      * Update our global variables
    1111                 :             :      */
    1112                 :    24784229 :     ProcLastRecPtr = StartPos;
    1113                 :    24784229 :     XactLastRecEnd = EndPos;
    1114                 :             : 
    1115                 :             :     /* Report WAL traffic to the instrumentation. */
    1116         [ +  + ]:    24784229 :     if (inserted)
    1117                 :             :     {
    1118                 :    24784170 :         pgWalUsage.wal_bytes += rechdr->xl_tot_len;
    1119                 :    24784170 :         pgWalUsage.wal_records++;
    1120                 :    24784170 :         pgWalUsage.wal_fpi += num_fpi;
    1121                 :    24784170 :         pgWalUsage.wal_fpi_bytes += fpi_bytes;
    1122                 :             : 
    1123                 :             :         /* Required for the flush of pending stats WAL data */
    1124                 :    24784170 :         pgstat_report_fixed = true;
    1125                 :             :     }
    1126                 :             : 
    1127                 :    24784229 :     return EndPos;
    1128                 :             : }
    1129                 :             : 
    1130                 :             : /*
    1131                 :             :  * Reserves the right amount of space for a record of given size from the WAL.
    1132                 :             :  * *StartPos is set to the beginning of the reserved section, *EndPos to
    1133                 :             :  * its end+1. *PrevPtr is set to the beginning of the previous record; it is
    1134                 :             :  * used to set the xl_prev of this record.
    1135                 :             :  *
    1136                 :             :  * This is the performance critical part of XLogInsert that must be serialized
    1137                 :             :  * across backends. The rest can happen mostly in parallel. Try to keep this
    1138                 :             :  * section as short as possible, insertpos_lck can be heavily contended on a
    1139                 :             :  * busy system.
    1140                 :             :  *
    1141                 :             :  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
    1142                 :             :  * where we actually copy the record to the reserved space.
    1143                 :             :  *
    1144                 :             :  * NB: Testing shows that XLogInsertRecord runs faster if this code is inlined;
    1145                 :             :  * however, because there are two call sites, the compiler is reluctant to
    1146                 :             :  * inline. We use pg_attribute_always_inline here to try to convince it.
    1147                 :             :  */
    1148                 :             : static pg_attribute_always_inline void
    1149                 :    24783411 : ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
    1150                 :             :                           XLogRecPtr *PrevPtr)
    1151                 :             : {
    1152                 :    24783411 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    1153                 :             :     uint64      startbytepos;
    1154                 :             :     uint64      endbytepos;
    1155                 :             :     uint64      prevbytepos;
    1156                 :             : 
    1157                 :    24783411 :     size = MAXALIGN(size);
    1158                 :             : 
    1159                 :             :     /* All (non xlog-switch) records should contain data. */
    1160                 :             :     Assert(size > SizeOfXLogRecord);
    1161                 :             : 
    1162                 :             :     /*
    1163                 :             :      * The duration the spinlock needs to be held is minimized by minimizing
    1164                 :             :      * the calculations that have to be done while holding the lock. The
    1165                 :             :      * current tip of reserved WAL is kept in CurrBytePos, as a byte position
    1166                 :             :      * that only counts "usable" bytes in WAL, that is, it excludes all WAL
    1167                 :             :      * page headers. The mapping between "usable" byte positions and physical
    1168                 :             :      * positions (XLogRecPtrs) can be done outside the locked region, and
    1169                 :             :      * because the usable byte position doesn't include any headers, reserving
    1170                 :             :      * X bytes from WAL is almost as simple as "CurrBytePos += X".
    1171                 :             :      */
    1172                 :    24783411 :     SpinLockAcquire(&Insert->insertpos_lck);
    1173                 :             : 
    1174                 :    24783411 :     startbytepos = Insert->CurrBytePos;
    1175                 :    24783411 :     endbytepos = startbytepos + size;
    1176                 :    24783411 :     prevbytepos = Insert->PrevBytePos;
    1177                 :    24783411 :     Insert->CurrBytePos = endbytepos;
    1178                 :    24783411 :     Insert->PrevBytePos = startbytepos;
    1179                 :             : 
    1180                 :    24783411 :     SpinLockRelease(&Insert->insertpos_lck);
    1181                 :             : 
    1182                 :    24783411 :     *StartPos = XLogBytePosToRecPtr(startbytepos);
    1183                 :    24783411 :     *EndPos = XLogBytePosToEndRecPtr(endbytepos);
    1184                 :    24783411 :     *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
    1185                 :             : 
    1186                 :             :     /*
    1187                 :             :      * Check that the conversions between "usable byte positions" and
    1188                 :             :      * XLogRecPtrs work consistently in both directions.
    1189                 :             :      */
    1190                 :             :     Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
    1191                 :             :     Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
    1192                 :             :     Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
    1193                 :    24783411 : }
    1194                 :             : 
    1195                 :             : /*
    1196                 :             :  * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
    1197                 :             :  *
    1198                 :             :  * A log-switch record is handled slightly differently. The rest of the
    1199                 :             :  * segment will be reserved for this insertion, as indicated by the returned
    1200                 :             :  * *EndPos value. However, if we are already at the beginning of the current
    1201                 :             :  * segment, *StartPos and *EndPos are set to the current location without
    1202                 :             :  * reserving any space, and the function returns false.
    1203                 :             :  */
    1204                 :             : static bool
    1205                 :         818 : ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
    1206                 :             : {
    1207                 :         818 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    1208                 :             :     uint64      startbytepos;
    1209                 :             :     uint64      endbytepos;
    1210                 :             :     uint64      prevbytepos;
    1211                 :         818 :     uint32      size = MAXALIGN(SizeOfXLogRecord);
    1212                 :             :     XLogRecPtr  ptr;
    1213                 :             :     uint32      segleft;
    1214                 :             : 
    1215                 :             :     /*
    1216                 :             :      * These calculations are a bit heavy-weight to be done while holding a
    1217                 :             :      * spinlock, but since we're holding all the WAL insertion locks, there
    1218                 :             :      * are no other inserters competing for it. GetXLogInsertRecPtr() does
    1219                 :             :      * compete for it, but that's not called very frequently.
    1220                 :             :      */
    1221                 :         818 :     SpinLockAcquire(&Insert->insertpos_lck);
    1222                 :             : 
    1223                 :         818 :     startbytepos = Insert->CurrBytePos;
    1224                 :             : 
    1225                 :         818 :     ptr = XLogBytePosToEndRecPtr(startbytepos);
    1226         [ +  + ]:         818 :     if (XLogSegmentOffset(ptr, wal_segment_size) == 0)
    1227                 :             :     {
    1228                 :          59 :         SpinLockRelease(&Insert->insertpos_lck);
    1229                 :          59 :         *EndPos = *StartPos = ptr;
    1230                 :          59 :         return false;
    1231                 :             :     }
    1232                 :             : 
    1233                 :         759 :     endbytepos = startbytepos + size;
    1234                 :         759 :     prevbytepos = Insert->PrevBytePos;
    1235                 :             : 
    1236                 :         759 :     *StartPos = XLogBytePosToRecPtr(startbytepos);
    1237                 :         759 :     *EndPos = XLogBytePosToEndRecPtr(endbytepos);
    1238                 :             : 
    1239                 :         759 :     segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size);
    1240         [ +  - ]:         759 :     if (segleft != wal_segment_size)
    1241                 :             :     {
    1242                 :             :         /* consume the rest of the segment */
    1243                 :         759 :         *EndPos += segleft;
    1244                 :         759 :         endbytepos = XLogRecPtrToBytePos(*EndPos);
    1245                 :             :     }
    1246                 :         759 :     Insert->CurrBytePos = endbytepos;
    1247                 :         759 :     Insert->PrevBytePos = startbytepos;
    1248                 :             : 
    1249                 :         759 :     SpinLockRelease(&Insert->insertpos_lck);
    1250                 :             : 
    1251                 :         759 :     *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
    1252                 :             : 
    1253                 :             :     Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0);
    1254                 :             :     Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
    1255                 :             :     Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
    1256                 :             :     Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
    1257                 :             : 
    1258                 :         759 :     return true;
    1259                 :             : }
    1260                 :             : 
    1261                 :             : /*
    1262                 :             :  * Subroutine of XLogInsertRecord.  Copies a WAL record to an already-reserved
    1263                 :             :  * area in the WAL.
    1264                 :             :  */
    1265                 :             : static void
    1266                 :    24784170 : CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
    1267                 :             :                     XLogRecPtr StartPos, XLogRecPtr EndPos, TimeLineID tli)
    1268                 :             : {
    1269                 :             :     char       *currpos;
    1270                 :             :     int         freespace;
    1271                 :             :     int         written;
    1272                 :             :     XLogRecPtr  CurrPos;
    1273                 :             :     XLogPageHeader pagehdr;
    1274                 :             : 
    1275                 :             :     /*
    1276                 :             :      * Get a pointer to the right place in the right WAL buffer to start
    1277                 :             :      * inserting to.
    1278                 :             :      */
    1279                 :    24784170 :     CurrPos = StartPos;
    1280                 :    24784170 :     currpos = GetXLogBuffer(CurrPos, tli);
    1281         [ +  - ]:    24784170 :     freespace = INSERT_FREESPACE(CurrPos);
    1282                 :             : 
    1283                 :             :     /*
    1284                 :             :      * there should be enough space for at least the first field (xl_tot_len)
    1285                 :             :      * on this page.
    1286                 :             :      */
    1287                 :             :     Assert(freespace >= sizeof(uint32));
    1288                 :             : 
    1289                 :             :     /* Copy record data */
    1290                 :    24784170 :     written = 0;
    1291         [ +  + ]:   113097609 :     while (rdata != NULL)
    1292                 :             :     {
    1293                 :    88313439 :         const char *rdata_data = rdata->data;
    1294                 :    88313439 :         int         rdata_len = rdata->len;
    1295                 :             : 
    1296         [ +  + ]:    90321487 :         while (rdata_len > freespace)
    1297                 :             :         {
    1298                 :             :             /*
    1299                 :             :              * Write what fits on this page, and continue on the next page.
    1300                 :             :              */
    1301                 :             :             Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
    1302                 :     2008048 :             memcpy(currpos, rdata_data, freespace);
    1303                 :     2008048 :             rdata_data += freespace;
    1304                 :     2008048 :             rdata_len -= freespace;
    1305                 :     2008048 :             written += freespace;
    1306                 :     2008048 :             CurrPos += freespace;
    1307                 :             : 
    1308                 :             :             /*
    1309                 :             :              * Get pointer to beginning of next page, and set the xlp_rem_len
    1310                 :             :              * in the page header. Set XLP_FIRST_IS_CONTRECORD.
    1311                 :             :              *
    1312                 :             :              * It's safe to set the contrecord flag and xlp_rem_len without a
    1313                 :             :              * lock on the page. All the other flags were already set when the
    1314                 :             :              * page was initialized, in AdvanceXLInsertBuffer, and we're the
    1315                 :             :              * only backend that needs to set the contrecord flag.
    1316                 :             :              */
    1317                 :     2008048 :             currpos = GetXLogBuffer(CurrPos, tli);
    1318                 :     2008048 :             pagehdr = (XLogPageHeader) currpos;
    1319                 :     2008048 :             pagehdr->xlp_rem_len = write_len - written;
    1320                 :     2008048 :             pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
    1321                 :             : 
    1322                 :             :             /* skip over the page header */
    1323         [ +  + ]:     2008048 :             if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
    1324                 :             :             {
    1325                 :        1283 :                 CurrPos += SizeOfXLogLongPHD;
    1326                 :        1283 :                 currpos += SizeOfXLogLongPHD;
    1327                 :             :             }
    1328                 :             :             else
    1329                 :             :             {
    1330                 :     2006765 :                 CurrPos += SizeOfXLogShortPHD;
    1331                 :     2006765 :                 currpos += SizeOfXLogShortPHD;
    1332                 :             :             }
    1333         [ +  - ]:     2008048 :             freespace = INSERT_FREESPACE(CurrPos);
    1334                 :             :         }
    1335                 :             : 
    1336                 :             :         Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
    1337                 :    88313439 :         memcpy(currpos, rdata_data, rdata_len);
    1338                 :    88313439 :         currpos += rdata_len;
    1339                 :    88313439 :         CurrPos += rdata_len;
    1340                 :    88313439 :         freespace -= rdata_len;
    1341                 :    88313439 :         written += rdata_len;
    1342                 :             : 
    1343                 :    88313439 :         rdata = rdata->next;
    1344                 :             :     }
    1345                 :             :     Assert(written == write_len);
    1346                 :             : 
    1347                 :             :     /*
    1348                 :             :      * If this was an xlog-switch, it's not enough to write the switch record,
    1349                 :             :      * we also have to consume all the remaining space in the WAL segment.  We
    1350                 :             :      * have already reserved that space, but we need to actually fill it.
    1351                 :             :      */
    1352   [ +  +  +  - ]:    24784170 :     if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0)
    1353                 :             :     {
    1354                 :             :         /* An xlog-switch record doesn't contain any data besides the header */
    1355                 :             :         Assert(write_len == SizeOfXLogRecord);
    1356                 :             : 
    1357                 :             :         /* Assert that we did reserve the right amount of space */
    1358                 :             :         Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0);
    1359                 :             : 
    1360                 :             :         /* Use up all the remaining space on the current page */
    1361                 :         759 :         CurrPos += freespace;
    1362                 :             : 
    1363                 :             :         /*
    1364                 :             :          * Cause all remaining pages in the segment to be flushed, leaving the
    1365                 :             :          * XLog position where it should be, at the start of the next segment.
    1366                 :             :          * We do this one page at a time, to make sure we don't deadlock
    1367                 :             :          * against ourselves if wal_buffers < wal_segment_size.
    1368                 :             :          */
    1369         [ +  + ]:      760775 :         while (CurrPos < EndPos)
    1370                 :             :         {
    1371                 :             :             /*
    1372                 :             :              * The minimal action to flush the page would be to call
    1373                 :             :              * WALInsertLockUpdateInsertingAt(CurrPos) followed by
    1374                 :             :              * AdvanceXLInsertBuffer(...).  The page would be left initialized
    1375                 :             :              * mostly to zeros, except for the page header (always the short
    1376                 :             :              * variant, as this is never a segment's first page).
    1377                 :             :              *
    1378                 :             :              * The large vistas of zeros are good for compressibility, but the
    1379                 :             :              * headers interrupting them every XLOG_BLCKSZ (with values that
    1380                 :             :              * differ from page to page) are not.  The effect varies with
    1381                 :             :              * compression tool, but bzip2 for instance compresses about an
    1382                 :             :              * order of magnitude worse if those headers are left in place.
    1383                 :             :              *
    1384                 :             :              * Rather than complicating AdvanceXLInsertBuffer itself (which is
    1385                 :             :              * called in heavily-loaded circumstances as well as this lightly-
    1386                 :             :              * loaded one) with variant behavior, we just use GetXLogBuffer
    1387                 :             :              * (which itself calls the two methods we need) to get the pointer
    1388                 :             :              * and zero most of the page.  Then we just zero the page header.
    1389                 :             :              */
    1390                 :      760016 :             currpos = GetXLogBuffer(CurrPos, tli);
    1391   [ +  -  +  -  :     3040064 :             MemSet(currpos, 0, SizeOfXLogShortPHD);
          +  -  +  -  +  
                      + ]
    1392                 :             : 
    1393                 :      760016 :             CurrPos += XLOG_BLCKSZ;
    1394                 :             :         }
    1395                 :             :     }
    1396                 :             :     else
    1397                 :             :     {
    1398                 :             :         /* Align the end position, so that the next record starts aligned */
    1399                 :    24783411 :         CurrPos = MAXALIGN64(CurrPos);
    1400                 :             :     }
    1401                 :             : 
    1402         [ -  + ]:    24784170 :     if (CurrPos != EndPos)
    1403         [ #  # ]:           0 :         ereport(PANIC,
    1404                 :             :                 errcode(ERRCODE_DATA_CORRUPTED),
    1405                 :             :                 errmsg_internal("space reserved for WAL record does not match what was written"));
    1406                 :    24784170 : }
    1407                 :             : 
    1408                 :             : /*
    1409                 :             :  * Acquire a WAL insertion lock, for inserting to WAL.
    1410                 :             :  */
    1411                 :             : static void
    1412                 :    24792222 : WALInsertLockAcquire(void)
    1413                 :             : {
    1414                 :             :     bool        immed;
    1415                 :             : 
    1416                 :             :     /*
    1417                 :             :      * It doesn't matter which of the WAL insertion locks we acquire, so try
    1418                 :             :      * the one we used last time.  If the system isn't particularly busy, it's
    1419                 :             :      * a good bet that it's still available, and it's good to have some
    1420                 :             :      * affinity to a particular lock so that you don't unnecessarily bounce
    1421                 :             :      * cache lines between processes when there's no contention.
    1422                 :             :      *
    1423                 :             :      * If this is the first time through in this backend, pick a lock
    1424                 :             :      * (semi-)randomly.  This allows the locks to be used evenly if you have a
    1425                 :             :      * lot of very short connections.
    1426                 :             :      */
    1427                 :             :     static int  lockToTry = -1;
    1428                 :             : 
    1429         [ +  + ]:    24792222 :     if (lockToTry == -1)
    1430                 :        9283 :         lockToTry = MyProcNumber % NUM_XLOGINSERT_LOCKS;
    1431                 :    24792222 :     MyLockNo = lockToTry;
    1432                 :             : 
    1433                 :             :     /*
    1434                 :             :      * The insertingAt value is initially set to 0, as we don't know our
    1435                 :             :      * insert location yet.
    1436                 :             :      */
    1437                 :    24792222 :     immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
    1438         [ +  + ]:    24792222 :     if (!immed)
    1439                 :             :     {
    1440                 :             :         /*
    1441                 :             :          * If we couldn't get the lock immediately, try another lock next
    1442                 :             :          * time.  On a system with more insertion locks than concurrent
    1443                 :             :          * inserters, this causes all the inserters to eventually migrate to a
    1444                 :             :          * lock that no-one else is using.  On a system with more inserters
    1445                 :             :          * than locks, it still helps to distribute the inserters evenly
    1446                 :             :          * across the locks.
    1447                 :             :          */
    1448                 :       16536 :         lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
    1449                 :             :     }
    1450                 :    24792222 : }
    1451                 :             : 
    1452                 :             : /*
    1453                 :             :  * Acquire all WAL insertion locks, to prevent other backends from inserting
    1454                 :             :  * to WAL.
    1455                 :             :  */
    1456                 :             : static void
    1457                 :        4847 : WALInsertLockAcquireExclusive(void)
    1458                 :             : {
    1459                 :             :     int         i;
    1460                 :             : 
    1461                 :             :     /*
    1462                 :             :      * When holding all the locks, all but the last lock's insertingAt
    1463                 :             :      * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
    1464                 :             :      * XLogRecPtr value, to make sure that no-one blocks waiting on those.
    1465                 :             :      */
    1466         [ +  + ]:       38776 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
    1467                 :             :     {
    1468                 :       33929 :         LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
    1469                 :       33929 :         LWLockUpdateVar(&WALInsertLocks[i].l.lock,
    1470                 :       33929 :                         &WALInsertLocks[i].l.insertingAt,
    1471                 :             :                         PG_UINT64_MAX);
    1472                 :             :     }
    1473                 :             :     /* Variable value reset to 0 at release */
    1474                 :        4847 :     LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
    1475                 :             : 
    1476                 :        4847 :     holdingAllLocks = true;
    1477                 :        4847 : }
    1478                 :             : 
    1479                 :             : /*
    1480                 :             :  * Release our insertion lock (or locks, if we're holding them all).
    1481                 :             :  *
    1482                 :             :  * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
    1483                 :             :  * next time the lock is acquired.
    1484                 :             :  */
    1485                 :             : static void
    1486                 :    24797069 : WALInsertLockRelease(void)
    1487                 :             : {
    1488         [ +  + ]:    24797069 :     if (holdingAllLocks)
    1489                 :             :     {
    1490                 :             :         int         i;
    1491                 :             : 
    1492         [ +  + ]:       43623 :         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    1493                 :       38776 :             LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
    1494                 :       38776 :                                   &WALInsertLocks[i].l.insertingAt,
    1495                 :             :                                   0);
    1496                 :             : 
    1497                 :        4847 :         holdingAllLocks = false;
    1498                 :             :     }
    1499                 :             :     else
    1500                 :             :     {
    1501                 :    24792222 :         LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
    1502                 :    24792222 :                               &WALInsertLocks[MyLockNo].l.insertingAt,
    1503                 :             :                               0);
    1504                 :             :     }
    1505                 :    24797069 : }
    1506                 :             : 
    1507                 :             : /*
    1508                 :             :  * Update our insertingAt value, to let others know that we've finished
    1509                 :             :  * inserting up to that point.
    1510                 :             :  */
    1511                 :             : static void
    1512                 :     2629755 : WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
    1513                 :             : {
    1514         [ +  + ]:     2629755 :     if (holdingAllLocks)
    1515                 :             :     {
    1516                 :             :         /*
    1517                 :             :          * We use the last lock to mark our actual position, see comments in
    1518                 :             :          * WALInsertLockAcquireExclusive.
    1519                 :             :          */
    1520                 :      754673 :         LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
    1521                 :      754673 :                         &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
    1522                 :             :                         insertingAt);
    1523                 :             :     }
    1524                 :             :     else
    1525                 :     1875082 :         LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
    1526                 :     1875082 :                         &WALInsertLocks[MyLockNo].l.insertingAt,
    1527                 :             :                         insertingAt);
    1528                 :     2629755 : }
    1529                 :             : 
    1530                 :             : /*
    1531                 :             :  * Wait for any WAL insertions < upto to finish.
    1532                 :             :  *
    1533                 :             :  * Returns the location of the oldest insertion that is still in-progress.
    1534                 :             :  * Any WAL prior to that point has been fully copied into WAL buffers, and
    1535                 :             :  * can be flushed out to disk. Because this waits for any insertions older
    1536                 :             :  * than 'upto' to finish, the return value is always >= 'upto'.
    1537                 :             :  *
    1538                 :             :  * Note: When you are about to write out WAL, you must call this function
    1539                 :             :  * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
    1540                 :             :  * need to wait for an insertion to finish (or at least advance to next
    1541                 :             :  * uninitialized page), and the inserter might need to evict an old WAL buffer
    1542                 :             :  * to make room for a new one, which in turn requires WALWriteLock.
    1543                 :             :  */
    1544                 :             : static XLogRecPtr
    1545                 :     2500337 : WaitXLogInsertionsToFinish(XLogRecPtr upto)
    1546                 :             : {
    1547                 :             :     uint64      bytepos;
    1548                 :             :     XLogRecPtr  inserted;
    1549                 :             :     XLogRecPtr  reservedUpto;
    1550                 :             :     XLogRecPtr  finishedUpto;
    1551                 :     2500337 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    1552                 :             :     int         i;
    1553                 :             : 
    1554         [ -  + ]:     2500337 :     if (MyProc == NULL)
    1555         [ #  # ]:           0 :         elog(PANIC, "cannot wait without a PGPROC structure");
    1556                 :             : 
    1557                 :             :     /*
    1558                 :             :      * Check if there's any work to do.  Use a barrier to ensure we get the
    1559                 :             :      * freshest value.
    1560                 :             :      */
    1561                 :     2500337 :     inserted = pg_atomic_read_membarrier_u64(&XLogCtl->logInsertResult);
    1562         [ +  + ]:     2500337 :     if (upto <= inserted)
    1563                 :     1959664 :         return inserted;
    1564                 :             : 
    1565                 :             :     /* Read the current insert position */
    1566                 :      540673 :     SpinLockAcquire(&Insert->insertpos_lck);
    1567                 :      540673 :     bytepos = Insert->CurrBytePos;
    1568                 :      540673 :     SpinLockRelease(&Insert->insertpos_lck);
    1569                 :      540673 :     reservedUpto = XLogBytePosToEndRecPtr(bytepos);
    1570                 :             : 
    1571                 :             :     /*
    1572                 :             :      * No-one should request to flush a piece of WAL that hasn't even been
    1573                 :             :      * reserved yet. However, it can happen if there is a block with a bogus
    1574                 :             :      * LSN on disk, for example. XLogFlush checks for that situation and
    1575                 :             :      * complains, but only after the flush. Here we just assume that to mean
    1576                 :             :      * that all WAL that has been reserved needs to be finished. In this
    1577                 :             :      * corner-case, the return value can be smaller than 'upto' argument.
    1578                 :             :      */
    1579         [ -  + ]:      540673 :     if (upto > reservedUpto)
    1580                 :             :     {
    1581         [ #  # ]:           0 :         ereport(LOG,
    1582                 :             :                 errmsg("request to flush past end of generated WAL; request %X/%08X, current position %X/%08X",
    1583                 :             :                        LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto)));
    1584                 :           0 :         upto = reservedUpto;
    1585                 :             :     }
    1586                 :             : 
    1587                 :             :     /*
    1588                 :             :      * Loop through all the locks, sleeping on any in-progress insert older
    1589                 :             :      * than 'upto'.
    1590                 :             :      *
    1591                 :             :      * finishedUpto is our return value, indicating the point upto which all
    1592                 :             :      * the WAL insertions have been finished. Initialize it to the head of
    1593                 :             :      * reserved WAL, and as we iterate through the insertion locks, back it
    1594                 :             :      * out for any insertion that's still in progress.
    1595                 :             :      */
    1596                 :      540673 :     finishedUpto = reservedUpto;
    1597         [ +  + ]:     4866057 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    1598                 :             :     {
    1599                 :     4325384 :         XLogRecPtr  insertingat = InvalidXLogRecPtr;
    1600                 :             : 
    1601                 :             :         do
    1602                 :             :         {
    1603                 :             :             /*
    1604                 :             :              * See if this insertion is in progress.  LWLockWaitForVar will
    1605                 :             :              * wait for the lock to be released, or for the 'value' to be set
    1606                 :             :              * by a LWLockUpdateVar call.  When a lock is initially acquired,
    1607                 :             :              * its value is 0 (InvalidXLogRecPtr), which means that we don't
    1608                 :             :              * know where it's inserting yet.  We will have to wait for it. If
    1609                 :             :              * it's a small insertion, the record will most likely fit on the
    1610                 :             :              * same page and the inserter will release the lock without ever
    1611                 :             :              * calling LWLockUpdateVar.  But if it has to sleep, it will
    1612                 :             :              * advertise the insertion point with LWLockUpdateVar before
    1613                 :             :              * sleeping.
    1614                 :             :              *
    1615                 :             :              * In this loop we are only waiting for insertions that started
    1616                 :             :              * before WaitXLogInsertionsToFinish was called.  The lack of
    1617                 :             :              * memory barriers in the loop means that we might see locks as
    1618                 :             :              * "unused" that have since become used.  This is fine because
    1619                 :             :              * they only can be used for later insertions that we would not
    1620                 :             :              * want to wait on anyway.  Not taking a lock to acquire the
    1621                 :             :              * current insertingAt value means that we might see older
    1622                 :             :              * insertingAt values.  This is also fine, because if we read a
    1623                 :             :              * value too old, we will add ourselves to the wait queue, which
    1624                 :             :              * contains atomic operations.
    1625                 :             :              */
    1626         [ +  + ]:     4459731 :             if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
    1627                 :     4459731 :                                  &WALInsertLocks[i].l.insertingAt,
    1628                 :             :                                  insertingat, &insertingat))
    1629                 :             :             {
    1630                 :             :                 /* the lock was free, so no insertion in progress */
    1631                 :     3036314 :                 insertingat = InvalidXLogRecPtr;
    1632                 :     3036314 :                 break;
    1633                 :             :             }
    1634                 :             : 
    1635                 :             :             /*
    1636                 :             :              * This insertion is still in progress. Have to wait, unless the
    1637                 :             :              * inserter has proceeded past 'upto'.
    1638                 :             :              */
    1639         [ +  + ]:     1423417 :         } while (insertingat < upto);
    1640                 :             : 
    1641   [ +  +  +  + ]:     4325384 :         if (XLogRecPtrIsValid(insertingat) && insertingat < finishedUpto)
    1642                 :      454408 :             finishedUpto = insertingat;
    1643                 :             :     }
    1644                 :             : 
    1645                 :             :     /*
    1646                 :             :      * Advance the limit we know to have been inserted and return the freshest
    1647                 :             :      * value we know of, which might be beyond what we requested if somebody
    1648                 :             :      * is concurrently doing this with an 'upto' pointer ahead of us.
    1649                 :             :      */
    1650                 :      540673 :     finishedUpto = pg_atomic_monotonic_advance_u64(&XLogCtl->logInsertResult,
    1651                 :             :                                                    finishedUpto);
    1652                 :             : 
    1653                 :      540673 :     return finishedUpto;
    1654                 :             : }
    1655                 :             : 
    1656                 :             : /*
    1657                 :             :  * Get a pointer to the right location in the WAL buffer containing the
    1658                 :             :  * given XLogRecPtr.
    1659                 :             :  *
    1660                 :             :  * If the page is not initialized yet, it is initialized. That might require
    1661                 :             :  * evicting an old dirty buffer from the buffer cache, which means I/O.
    1662                 :             :  *
    1663                 :             :  * The caller must ensure that the page containing the requested location
    1664                 :             :  * isn't evicted yet, and won't be evicted. The way to ensure that is to
    1665                 :             :  * hold onto a WAL insertion lock with the insertingAt position set to
    1666                 :             :  * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
    1667                 :             :  * to evict an old page from the buffer. (This means that once you call
    1668                 :             :  * GetXLogBuffer() with a given 'ptr', you must not access anything before
    1669                 :             :  * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
    1670                 :             :  * later, because older buffers might be recycled already.)
    1671                 :             :  */
    1672                 :             : static char *
    1673                 :    27552245 : GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli)
    1674                 :             : {
    1675                 :             :     int         idx;    /* WAL buffer slot that must hold ptr's page */
    1676                 :             :     XLogRecPtr  endptr; /* value currently stored in xlblocks[idx] */
    1677                 :             :     static uint64 cachedPage = 0;   /* page number (ptr / XLOG_BLCKSZ) of the page last returned */
    1678                 :             :     static char *cachedPos = NULL;  /* start of that page within XLogCtl->pages */
    1679                 :             :     XLogRecPtr  expectedEndPtr; /* end address of the page containing ptr */
    1680                 :             : 
    1681                 :             :     /*
    1682                 :             :      * Fast path for the common case that we need to access the same page
    1683                 :             :      * as last time.
    1684                 :             :      */
    1685         [ +  + ]:    27552245 :     if (ptr / XLOG_BLCKSZ == cachedPage)
    1686                 :             :     {
    1687                 :             :         Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
    1688                 :             :         Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
    1689                 :    24443113 :         return cachedPos + ptr % XLOG_BLCKSZ;
    1690                 :             :     }
    1691                 :             : 
    1692                 :             :     /*
    1693                 :             :      * The XLog buffer cache is organized so that a page is always loaded to a
    1694                 :             :      * particular buffer.  That way we can easily calculate the buffer a given
    1695                 :             :      * page must be loaded into, from the XLogRecPtr alone.
    1696                 :             :      */
    1697                 :     3109132 :     idx = XLogRecPtrToBufIdx(ptr);
    1698                 :             : 
    1699                 :             :     /*
    1700                 :             :      * See what page is loaded in the buffer at the moment. It could be the
    1701                 :             :      * page we're looking for, or something older. It can't be anything newer
    1702                 :             :      * - that would imply the page we're looking for has already been written
    1703                 :             :      * out to disk and evicted, and the caller is responsible for making sure
    1704                 :             :      * that doesn't happen.
    1705                 :             :      *
    1706                 :             :      * We don't hold a lock while we read the value. If someone is just about
    1707                 :             :      * to initialize or has just initialized the page, it's possible that we
    1708                 :             :      * get InvalidXLogRecPtr. That's ok, we'll grab the mapping lock (in
    1709                 :             :      * AdvanceXLInsertBuffer) and retry if we see anything other than the page
    1710                 :             :      * we're looking for.
    1711                 :             :      */
    1712                 :     3109132 :     expectedEndPtr = ptr;
    1713                 :     3109132 :     expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
    1714                 :             : 
    1715                 :     3109132 :     endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
    1716         [ +  + ]:     3109132 :     if (expectedEndPtr != endptr)
    1717                 :             :     {
    1718                 :             :         XLogRecPtr  initializedUpto;
    1719                 :             : 
    1720                 :             :         /*
    1721                 :             :          * Before calling AdvanceXLInsertBuffer(), which can block, let others
    1722                 :             :          * know how far we're finished with inserting the record.
    1723                 :             :          *
    1724                 :             :          * NB: If 'ptr' points to just after the page header, advertise a
    1725                 :             :          * position at the beginning of the page rather than 'ptr' itself. If
    1726                 :             :          * there are no other insertions running, someone might try to flush
    1727                 :             :          * up to our advertised location. If we advertised a position after
    1728                 :             :          * the page header, someone might try to flush the page header, even
    1729                 :             :          * though the page might not actually be initialized yet. As the first
    1730                 :             :          * inserter on the page, we are effectively responsible for making
    1731                 :             :          * sure that it's initialized, before we let insertingAt move past
    1732                 :             :          * the page header.
    1733                 :             :          */
    1734         [ +  + ]:     2629755 :         if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
    1735         [ +  - ]:       11860 :             XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ)
    1736                 :       11860 :             initializedUpto = ptr - SizeOfXLogShortPHD;
    1737         [ +  + ]:     2617895 :         else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
    1738         [ +  + ]:         896 :                  XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ)
    1739                 :         633 :             initializedUpto = ptr - SizeOfXLogLongPHD;
    1740                 :             :         else
    1741                 :     2617262 :             initializedUpto = ptr;
    1742                 :             : 
    1743                 :     2629755 :         WALInsertLockUpdateInsertingAt(initializedUpto);
    1744                 :             : 
    1745                 :     2629755 :         AdvanceXLInsertBuffer(ptr, tli, false); /* false: not opportunistic */
    1746                 :     2629755 :         endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
    1747                 :             : 
    1748         [ -  + ]:     2629755 :         if (expectedEndPtr != endptr)
    1749         [ #  # ]:           0 :             elog(PANIC, "could not find WAL buffer for %X/%08X",
    1750                 :             :                  LSN_FORMAT_ARGS(ptr));
    1751                 :             :     }
    1752                 :             :     else
    1753                 :             :     {
    1754                 :             :         /*
    1755                 :             :          * Make sure the initialization of the page is visible to us, and
    1756                 :             :          * won't arrive later to overwrite the WAL data we write on the page.
    1757                 :             :          */
    1758                 :      479377 :         pg_memory_barrier();
    1759                 :             :     }
    1760                 :             : 
    1761                 :             :     /*
    1762                 :             :      * Found the buffer holding this page. Return a pointer to the right
    1763                 :             :      * offset within the page, and remember the page for the fast path above.
    1764                 :             :      */
    1765                 :     3109132 :     cachedPage = ptr / XLOG_BLCKSZ;
    1766                 :     3109132 :     cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
    1767                 :             : 
    1768                 :             :     Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
    1769                 :             :     Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
    1770                 :             : 
    1771                 :     3109132 :     return cachedPos + ptr % XLOG_BLCKSZ;
    1772                 :             : }
    1773                 :             : 
    1774                 :             : /*
    1775                 :             :  * Read WAL data directly from WAL buffers, if available. Returns the number
    1776                 :             :  * of bytes read successfully.
    1777                 :             :  *
    1778                 :             :  * Fewer than 'count' bytes may be read if some of the requested WAL data has
    1779                 :             :  * already been evicted.
    1780                 :             :  *
    1781                 :             :  * No locks are taken.
    1782                 :             :  *
    1783                 :             :  * Caller should ensure that it reads no further than LogwrtResult.Write
    1784                 :             :  * (which should have been updated by the caller when determining how far to
    1785                 :             :  * read). The 'tli' argument is only used as a convenient safety check so that
    1786                 :             :  * callers do not read from WAL buffers on a historical timeline.
    1787                 :             :  */
    1788                 :             : Size
    1789                 :      106980 : WALReadFromBuffers(char *dstbuf, XLogRecPtr startptr, Size count,
    1790                 :             :                    TimeLineID tli)
    1791                 :             : {
    1792                 :      106980 :     char       *pdst = dstbuf;  /* next write position in dstbuf */
    1793                 :      106980 :     XLogRecPtr  recptr = startptr;  /* next WAL position to copy */
    1794                 :             :     XLogRecPtr  inserted;
    1795                 :      106980 :     Size        nbytes = count; /* bytes still to be copied */
    1796                 :             : 
    1797   [ +  +  +  + ]:      106980 :     if (RecoveryInProgress() || tli != GetWALInsertionTimeLine())
    1798                 :        1085 :         return 0;   /* in recovery, or 'tli' is not the insertion timeline */
    1799                 :             : 
    1800                 :             :     Assert(XLogRecPtrIsValid(startptr));
    1801                 :             : 
    1802                 :             :     /*
    1803                 :             :      * Caller should ensure that the requested data has been inserted into WAL
    1804                 :             :      * buffers before we try to read it.
    1805                 :             :      */
    1806                 :      105895 :     inserted = pg_atomic_read_u64(&XLogCtl->logInsertResult);
    1807         [ -  + ]:      105895 :     if (startptr + count > inserted)
    1808         [ #  # ]:           0 :         ereport(ERROR,
    1809                 :             :                 errmsg("cannot read past end of generated WAL: requested %X/%08X, current position %X/%08X",
    1810                 :             :                        LSN_FORMAT_ARGS(startptr + count),
    1811                 :             :                        LSN_FORMAT_ARGS(inserted)));
    1812                 :             : 
    1813                 :             :     /*
    1814                 :             :      * Loop through the buffers without a lock. For each buffer, atomically
    1815                 :             :      * read and verify the end pointer, then copy the data out, and finally
    1816                 :             :      * re-read and re-verify the end pointer.
    1817                 :             :      *
    1818                 :             :      * Once a page is evicted, it never returns to the WAL buffers, so if the
    1819                 :             :      * end pointer matches the expected end pointer before and after we copy
    1820                 :             :      * the data, then the right page must have been present during the data
    1821                 :             :      * copy. Read barriers are necessary to ensure that the data copy actually
    1822                 :             :      * happens between the two verification steps.
    1823                 :             :      *
    1824                 :             :      * If either verification fails, we simply terminate the loop and return
    1825                 :             :      * with the data that had been already copied out successfully.
    1826                 :             :      */
    1827         [ +  + ]:      130430 :     while (nbytes > 0)
    1828                 :             :     {
    1829                 :      123259 :         uint32      offset = recptr % XLOG_BLCKSZ;  /* position within the page */
    1830                 :      123259 :         int         idx = XLogRecPtrToBufIdx(recptr);
    1831                 :             :         XLogRecPtr  expectedEndPtr;
    1832                 :             :         XLogRecPtr  endptr;
    1833                 :             :         const char *page;
    1834                 :             :         const char *psrc;
    1835                 :             :         Size        npagebytes; /* bytes to copy from this page */
    1836                 :             : 
    1837                 :             :         /*
    1838                 :             :          * Calculate the end pointer we expect in the xlblocks array if the
    1839                 :             :          * correct page is present.
    1840                 :             :          */
    1841                 :      123259 :         expectedEndPtr = recptr + (XLOG_BLCKSZ - offset);
    1842                 :             : 
    1843                 :             :         /*
    1844                 :             :          * First verification step: check that the correct page is present in
    1845                 :             :          * the WAL buffers.
    1846                 :             :          */
    1847                 :      123259 :         endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
    1848         [ +  + ]:      123259 :         if (expectedEndPtr != endptr)
    1849                 :       98723 :             break;  /* page not in the buffers; return what we have so far */
    1850                 :             : 
    1851                 :             :         /*
    1852                 :             :          * The correct page is present (or was at the time the endptr was
    1853                 :             :          * read; must re-verify later). Calculate pointer to source data and
    1854                 :             :          * determine how much data to read from this page.
    1855                 :             :          */
    1856                 :       24536 :         page = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
    1857                 :       24536 :         psrc = page + offset;
    1858                 :       24536 :         npagebytes = Min(nbytes, XLOG_BLCKSZ - offset);
    1859                 :             : 
    1860                 :             :         /*
    1861                 :             :          * Ensure that the data copy and the first verification step are not
    1862                 :             :          * reordered.
    1863                 :             :          */
    1864                 :       24536 :         pg_read_barrier();
    1865                 :             : 
    1866                 :             :         /* data copy */
    1867                 :       24536 :         memcpy(pdst, psrc, npagebytes);
    1868                 :             : 
    1869                 :             :         /*
    1870                 :             :          * Ensure that the data copy and the second verification step are not
    1871                 :             :          * reordered.
    1872                 :             :          */
    1873                 :       24536 :         pg_read_barrier();
    1874                 :             : 
    1875                 :             :         /*
    1876                 :             :          * Second verification step: check that the page we read from wasn't
    1877                 :             :          * evicted while we were copying the data.
    1878                 :             :          */
    1879                 :       24536 :         endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
    1880         [ +  + ]:       24536 :         if (expectedEndPtr != endptr)
    1881                 :           1 :             break;  /* pdst is not advanced, so this page's copy is not counted */
    1882                 :             : 
    1883                 :       24535 :         pdst += npagebytes;
    1884                 :       24535 :         recptr += npagebytes;
    1885                 :       24535 :         nbytes -= npagebytes;
    1886                 :             :     }
    1887                 :             : 
    1888                 :             :     Assert(pdst - dstbuf <= count);
    1889                 :             : 
    1890                 :      105895 :     return pdst - dstbuf;
    1891                 :             : }
    1892                 :             : 
    1893                 :             : /*
    1894                 :             :  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
    1895                 :             :  * is the position starting from the beginning of WAL, excluding all WAL
    1896                 :             :  * page headers.  At a page boundary, the result points just past the header.
    1897                 :             :  */
    1898                 :             : static XLogRecPtr
    1899                 :    49571213 : XLogBytePosToRecPtr(uint64 bytepos)
    1900                 :             : {
    1901                 :             :     uint64      fullsegs;
    1902                 :             :     uint64      fullpages;
    1903                 :             :     uint64      bytesleft;
    1904                 :             :     uint32      seg_offset;
    1905                 :             :     XLogRecPtr  result;
    1906                 :             : 
    1907                 :    49571213 :     fullsegs = bytepos / UsableBytesInSegment;  /* complete segments before bytepos */
    1908                 :    49571213 :     bytesleft = bytepos % UsableBytesInSegment; /* usable bytes into the segment */
    1909                 :             : 
    1910         [ +  + ]:    49571213 :     if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
    1911                 :             :     {
    1912                 :             :         /* fits on first page of segment */
    1913                 :       72018 :         seg_offset = bytesleft + SizeOfXLogLongPHD;
    1914                 :             :     }
    1915                 :             :     else
    1916                 :             :     {
    1917                 :             :         /* skip the first page of the segment, which has a long header */
    1918                 :    49499195 :         seg_offset = XLOG_BLCKSZ;
    1919                 :    49499195 :         bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
    1920                 :             : 
    1921                 :    49499195 :         fullpages = bytesleft / UsableBytesInPage;
    1922                 :    49499195 :         bytesleft = bytesleft % UsableBytesInPage;
    1923                 :             : 
    1924                 :    49499195 :         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
    1925                 :             :     }
    1926                 :             : 
    1927                 :    49571213 :     XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
    1928                 :             : 
    1929                 :    49571213 :     return result;
    1930                 :             : }
    1931                 :             : 
    1932                 :             : /*
    1933                 :             :  * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
    1934                 :             :  * returns a pointer to the beginning of the page (i.e. before the page header),
    1935                 :             :  * not to where the first xlog record on that page would go. This is used
    1936                 :             :  * when converting a pointer to the end of a record.
    1937                 :             :  */
    1938                 :             : static XLogRecPtr
    1939                 :    25327337 : XLogBytePosToEndRecPtr(uint64 bytepos)
    1940                 :             : {
    1941                 :             :     uint64      fullsegs;
    1942                 :             :     uint64      fullpages;
    1943                 :             :     uint64      bytesleft;
    1944                 :             :     uint32      seg_offset;
    1945                 :             :     XLogRecPtr  result;
    1946                 :             : 
    1947                 :    25327337 :     fullsegs = bytepos / UsableBytesInSegment;  /* complete segments before bytepos */
    1948                 :    25327337 :     bytesleft = bytepos % UsableBytesInSegment; /* usable bytes into the segment */
    1949                 :             : 
    1950         [ +  + ]:    25327337 :     if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
    1951                 :             :     {
    1952                 :             :         /* fits on first page of segment */
    1953         [ +  + ]:      110470 :         if (bytesleft == 0)
    1954                 :       73028 :             seg_offset = 0; /* exactly at the start of the segment */
    1955                 :             :         else
    1956                 :       37442 :             seg_offset = bytesleft + SizeOfXLogLongPHD;
    1957                 :             :     }
    1958                 :             :     else
    1959                 :             :     {
    1960                 :             :         /* skip the first page of the segment, which has a long header */
    1961                 :    25216867 :         seg_offset = XLOG_BLCKSZ;
    1962                 :    25216867 :         bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
    1963                 :             : 
    1964                 :    25216867 :         fullpages = bytesleft / UsableBytesInPage;
    1965                 :    25216867 :         bytesleft = bytesleft % UsableBytesInPage;
    1966                 :             : 
    1967         [ +  + ]:    25216867 :         if (bytesleft == 0)
    1968                 :       24381 :             seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;  /* page start: no header added */
    1969                 :             :         else
    1970                 :    25192486 :             seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
    1971                 :             :     }
    1972                 :             : 
    1973                 :    25327337 :     XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
    1974                 :             : 
    1975                 :    25327337 :     return result;
    1976                 :             : }
    1977                 :             : 
    1978                 :             : /*
    1979                 :             :  * Convert an XLogRecPtr to a "usable byte position", i.e. a position that
    1979                 :             :  * does not count the bytes taken up by WAL page headers.
    1980                 :             :  */
    1981                 :             : static uint64
    1982                 :        2795 : XLogRecPtrToBytePos(XLogRecPtr ptr)
    1983                 :             : {
    1984                 :             :     uint64      fullsegs;
    1985                 :             :     uint32      fullpages;
    1986                 :             :     uint32      offset;
    1987                 :             :     uint64      result;
    1988                 :             : 
    1989                 :        2795 :     XLByteToSeg(ptr, fullsegs, wal_segment_size);
    1990                 :             : 
    1991                 :        2795 :     fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;   /* whole pages before ptr in its segment */
    1992                 :        2795 :     offset = ptr % XLOG_BLCKSZ; /* byte offset within the page */
    1993                 :             : 
    1994         [ +  + ]:        2795 :     if (fullpages == 0)  /* ptr is on the first page of its segment */
    1995                 :             :     {
    1996                 :        1071 :         result = fullsegs * UsableBytesInSegment;
    1997         [ +  + ]:        1071 :         if (offset > 0)  /* offset 0 is the page start and adds nothing */
    1998                 :             :         {
    1999                 :             :             Assert(offset >= SizeOfXLogLongPHD);
    2000                 :         289 :             result += offset - SizeOfXLogLongPHD;
    2001                 :             :         }
    2002                 :             :     }
    2003                 :             :     else
    2004                 :             :     {
    2005                 :        1724 :         result = fullsegs * UsableBytesInSegment +
    2006                 :        1724 :             (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
    2007                 :        1724 :             (fullpages - 1) * UsableBytesInPage;    /* full pages */
    2008         [ +  + ]:        1724 :         if (offset > 0)  /* offset 0 is the page start and adds nothing */
    2009                 :             :         {
    2010                 :             :             Assert(offset >= SizeOfXLogShortPHD);
    2011                 :        1714 :             result += offset - SizeOfXLogShortPHD;
    2012                 :             :         }
    2013                 :             :     }
    2014                 :             : 
    2015                 :        2795 :     return result;
    2016                 :             : }
    2017                 :             : 
    2018                 :             : /*
    2019                 :             :  * Initialize XLOG buffers, writing out old buffers if they still contain
    2020                 :             :  * unwritten data, up to the page containing 'upto'. Or if 'opportunistic' is
    2021                 :             :  * true, initialize as many pages as we can without having to write out
    2022                 :             :  * unwritten data. Any new pages are initialized to zeros, with page headers
    2023                 :             :  * initialized properly.
    2024                 :             :  */
    2025                 :             : static void
    2026                 :     2635218 : AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
    2027                 :             : {
    2028                 :             :     int         nextidx;
    2029                 :             :     XLogRecPtr  OldPageRqstPtr;
    2030                 :             :     XLogwrtRqst WriteRqst;
    2031                 :     2635218 :     XLogRecPtr  NewPageEndPtr = InvalidXLogRecPtr;
    2032                 :             :     XLogRecPtr  NewPageBeginPtr;
    2033                 :             :     XLogPageHeader NewPage;
    2034                 :     2635218 :     int         npages pg_attribute_unused() = 0;
    2035                 :             : 
    2036                 :     2635218 :     LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
    2037                 :             : 
    2038                 :             :     /*
    2039                 :             :      * Now that we have the lock, check if someone initialized the page
    2040                 :             :      * already.
    2041                 :             :      */
    2042   [ +  +  +  + ]:     7778362 :     while (upto >= XLogCtl->InitializedUpTo || opportunistic)
    2043                 :             :     {
    2044                 :     5148607 :         nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
    2045                 :             : 
    2046                 :             :         /*
    2047                 :             :          * Get ending-offset of the buffer page we need to replace (this may
    2048                 :             :          * be zero if the buffer hasn't been used yet).  Fall through if it's
    2049                 :             :          * already written out.
    2050                 :             :          */
    2051                 :     5148607 :         OldPageRqstPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]);
    2052         [ +  + ]:     5148607 :         if (LogwrtResult.Write < OldPageRqstPtr)
    2053                 :             :         {
    2054                 :             :             /*
    2055                 :             :              * Nope, got work to do. If we just want to pre-initialize as much
    2056                 :             :              * as we can without flushing, give up now.
    2057                 :             :              */
    2058         [ +  + ]:     2341752 :             if (opportunistic)
    2059                 :        5463 :                 break;
    2060                 :             : 
    2061                 :             :             /* Advance shared memory write request position */
    2062                 :     2336289 :             SpinLockAcquire(&XLogCtl->info_lck);
    2063         [ +  + ]:     2336289 :             if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
    2064                 :      711015 :                 XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
    2065                 :     2336289 :             SpinLockRelease(&XLogCtl->info_lck);
    2066                 :             : 
    2067                 :             :             /*
    2068                 :             :              * Acquire an up-to-date LogwrtResult value and see if we still
    2069                 :             :              * need to write it or if someone else already did.
    2070                 :             :              */
    2071                 :     2336289 :             RefreshXLogWriteResult(LogwrtResult);
    2072         [ +  + ]:     2336289 :             if (LogwrtResult.Write < OldPageRqstPtr)
    2073                 :             :             {
    2074                 :             :                 /*
    2075                 :             :                  * Must acquire write lock. Release WALBufMappingLock first,
    2076                 :             :                  * to make sure that all insertions that we need to wait for
    2077                 :             :                  * can finish (up to this same position). Otherwise we risk
    2078                 :             :                  * deadlock.
    2079                 :             :                  */
    2080                 :     2324823 :                 LWLockRelease(WALBufMappingLock);
    2081                 :             : 
    2082                 :     2324823 :                 WaitXLogInsertionsToFinish(OldPageRqstPtr);
    2083                 :             : 
    2084                 :     2324823 :                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
    2085                 :             : 
    2086                 :     2324823 :                 RefreshXLogWriteResult(LogwrtResult);
    2087         [ +  + ]:     2324823 :                 if (LogwrtResult.Write >= OldPageRqstPtr)
    2088                 :             :                 {
    2089                 :             :                     /* OK, someone wrote it already */
    2090                 :      122918 :                     LWLockRelease(WALWriteLock);
    2091                 :             :                 }
    2092                 :             :                 else
    2093                 :             :                 {
    2094                 :             :                     /* Have to write it ourselves */
    2095                 :             :                     TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
    2096                 :     2201905 :                     WriteRqst.Write = OldPageRqstPtr;
    2097                 :     2201905 :                     WriteRqst.Flush = InvalidXLogRecPtr;
    2098                 :     2201905 :                     XLogWrite(WriteRqst, tli, false);
    2099                 :     2201905 :                     LWLockRelease(WALWriteLock);
    2100                 :     2201905 :                     pgWalUsage.wal_buffers_full++;
    2101                 :             :                     TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
    2102                 :             : 
    2103                 :             :                     /*
    2104                 :             :                      * Required for the flush of pending stats WAL data, per
    2105                 :             :                      * update of pgWalUsage.
    2106                 :             :                      */
    2107                 :     2201905 :                     pgstat_report_fixed = true;
    2108                 :             :                 }
    2109                 :             :                 /* Re-acquire WALBufMappingLock and retry */
    2110                 :     2324823 :                 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
    2111                 :     2324823 :                 continue;
    2112                 :             :             }
    2113                 :             :         }
    2114                 :             : 
    2115                 :             :         /*
    2116                 :             :          * Now the next buffer slot is free and we can set it up to be the
    2117                 :             :          * next output page.
    2118                 :             :          */
    2119                 :     2818321 :         NewPageBeginPtr = XLogCtl->InitializedUpTo;
    2120                 :     2818321 :         NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
    2121                 :             : 
    2122                 :             :         Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
    2123                 :             : 
    2124                 :     2818321 :         NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
    2125                 :             : 
    2126                 :             :         /*
    2127                 :             :          * Mark the xlblock with InvalidXLogRecPtr and issue a write barrier
    2128                 :             :          * before initializing. Otherwise, the old page may be partially
    2129                 :             :          * zeroed but look valid.
    2130                 :             :          */
    2131                 :     2818321 :         pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], InvalidXLogRecPtr);
    2132                 :     2818321 :         pg_write_barrier();
    2133                 :             : 
    2134                 :             :         /*
    2135                 :             :          * Be sure to re-zero the buffer so that bytes beyond what we've
    2136                 :             :          * written will look like zeroes and not valid XLOG records...
    2137                 :             :          */
    2138   [ +  -  +  -  :     2818321 :         MemSet(NewPage, 0, XLOG_BLCKSZ);
          +  -  -  +  -  
                      - ]
    2139                 :             : 
    2140                 :             :         /*
    2141                 :             :          * Fill the new page's header
    2142                 :             :          */
    2143                 :     2818321 :         NewPage->xlp_magic = XLOG_PAGE_MAGIC;
    2144                 :             : 
    2145                 :             :         /* NewPage->xlp_info = 0; */ /* done by memset */
    2146                 :     2818321 :         NewPage->xlp_tli = tli;
    2147                 :     2818321 :         NewPage->xlp_pageaddr = NewPageBeginPtr;
    2148                 :             : 
    2149                 :             :         /* NewPage->xlp_rem_len = 0; */  /* done by memset */
    2150                 :             : 
    2151                 :             :         /*
    2152                 :             :          * If first page of an XLOG segment file, make it a long header.
    2153                 :             :          */
    2154         [ +  + ]:     2818321 :         if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
    2155                 :             :         {
    2156                 :        1935 :             XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
    2157                 :             : 
    2158                 :        1935 :             NewLongPage->xlp_sysid = ControlFile->system_identifier;
    2159                 :        1935 :             NewLongPage->xlp_seg_size = wal_segment_size;
    2160                 :        1935 :             NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
    2161                 :        1935 :             NewPage->xlp_info |= XLP_LONG_HEADER;
    2162                 :             :         }
    2163                 :             : 
    2164                 :             :         /*
    2165                 :             :          * Make sure the initialization of the page becomes visible to others
    2166                 :             :          * before the xlblocks update. GetXLogBuffer() reads xlblocks without
    2167                 :             :          * holding a lock.
    2168                 :             :          */
    2169                 :     2818321 :         pg_write_barrier();
    2170                 :             : 
    2171                 :     2818321 :         pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], NewPageEndPtr);
    2172                 :     2818321 :         XLogCtl->InitializedUpTo = NewPageEndPtr;
    2173                 :             : 
    2174                 :     2818321 :         npages++;
    2175                 :             :     }
    2176                 :     2635218 :     LWLockRelease(WALBufMappingLock);
    2177                 :             : 
    2178                 :             : #ifdef WAL_DEBUG
    2179                 :             :     if (XLOG_DEBUG && npages > 0)
    2180                 :             :     {
    2181                 :             :         elog(DEBUG1, "initialized %d pages, up to %X/%08X",
    2182                 :             :              npages, LSN_FORMAT_ARGS(NewPageEndPtr));
    2183                 :             :     }
    2184                 :             : #endif
    2185                 :     2635218 : }
    2186                 :             : 
    2187                 :             : /*
    2188                 :             :  * Calculate CheckPointSegments based on max_wal_size_mb and
    2189                 :             :  * checkpoint_completion_target.
    2190                 :             :  */
    2191                 :             : static void
    2192                 :        9727 : CalculateCheckpointSegments(void)
    2193                 :             : {
    2194                 :             :     double      target;
    2195                 :             : 
    2196                 :             :     /*-------
    2197                 :             :      * Calculate the distance at which to trigger a checkpoint, to avoid
    2198                 :             :      * exceeding max_wal_size_mb. This is based on two assumptions:
    2199                 :             :      *
    2200                 :             :      * a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
    2201                 :             :      *    WAL for two checkpoint cycles to allow us to recover from the
    2202                 :             :      *    secondary checkpoint if the first checkpoint failed, though we
    2203                 :             :      *    only did this on the primary anyway, not on standby. Keeping just
    2204                 :             :      *    one checkpoint simplifies processing and reduces disk space in
    2205                 :             :      *    many smaller databases.)
    2206                 :             :      * b) during checkpoint, we consume checkpoint_completion_target *
    2207                 :             :      *    number of segments consumed between checkpoints.
    2208                 :             :      *-------
    2209                 :             :      */
    2210                 :        9727 :     target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
    2211                 :        9727 :         (1.0 + CheckPointCompletionTarget);
    2212                 :             : 
    2213                 :             :     /* round down */
    2214                 :        9727 :     CheckPointSegments = (int) target;
    2215                 :             : 
    2216         [ +  + ]:        9727 :     if (CheckPointSegments < 1)
    2217                 :          10 :         CheckPointSegments = 1;
    2218                 :        9727 : }
    2219                 :             : 
    2220                 :             : void
    2221                 :        7288 : assign_max_wal_size(int newval, void *extra)
    2222                 :             : {
    2223                 :        7288 :     max_wal_size_mb = newval;
    2224                 :        7288 :     CalculateCheckpointSegments();
    2225                 :        7288 : }
    2226                 :             : 
    2227                 :             : void
    2228                 :        1291 : assign_checkpoint_completion_target(double newval, void *extra)
    2229                 :             : {
    2230                 :        1291 :     CheckPointCompletionTarget = newval;
    2231                 :        1291 :     CalculateCheckpointSegments();
    2232                 :        1291 : }
    2233                 :             : 
    2234                 :             : bool
    2235                 :        2497 : check_wal_segment_size(int *newval, void **extra, GucSource source)
    2236                 :             : {
    2237   [ +  -  +  -  :        2497 :     if (!IsValidWalSegSize(*newval))
             +  -  -  + ]
    2238                 :             :     {
    2239                 :           0 :         GUC_check_errdetail("The WAL segment size must be a power of two between 1 MB and 1 GB.");
    2240                 :           0 :         return false;
    2241                 :             :     }
    2242                 :             : 
    2243                 :        2497 :     return true;
    2244                 :             : }
    2245                 :             : 
    2246                 :             : /*
    2247                 :             :  * At a checkpoint, how many WAL segments to recycle as preallocated future
    2248                 :             :  * XLOG segments? Returns the highest segment that should be preallocated.
    2249                 :             :  */
    2250                 :             : static XLogSegNo
    2251                 :        1944 : XLOGfileslop(XLogRecPtr lastredoptr)
    2252                 :             : {
    2253                 :             :     XLogSegNo   minSegNo;
    2254                 :             :     XLogSegNo   maxSegNo;
    2255                 :             :     double      distance;
    2256                 :             :     XLogSegNo   recycleSegNo;
    2257                 :             : 
    2258                 :             :     /*
    2259                 :             :      * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
    2260                 :             :      * correspond to. Always recycle enough segments to meet the minimum, and
    2261                 :             :      * remove enough segments to stay below the maximum.
    2262                 :             :      */
    2263                 :        1944 :     minSegNo = lastredoptr / wal_segment_size +
    2264                 :        1944 :         ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1;
    2265                 :        1944 :     maxSegNo = lastredoptr / wal_segment_size +
    2266                 :        1944 :         ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1;
    2267                 :             : 
    2268                 :             :     /*
    2269                 :             :      * Between those limits, recycle enough segments to get us through to the
    2270                 :             :      * estimated end of next checkpoint.
    2271                 :             :      *
    2272                 :             :      * To estimate where the next checkpoint will finish, assume that the
    2273                 :             :      * system runs steadily consuming CheckPointDistanceEstimate bytes between
    2274                 :             :      * every checkpoint.
    2275                 :             :      */
    2276                 :        1944 :     distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
    2277                 :             :     /* add 10% for good measure. */
    2278                 :        1944 :     distance *= 1.10;
    2279                 :             : 
    2280                 :        1944 :     recycleSegNo = (XLogSegNo) ceil(((double) lastredoptr + distance) /
    2281                 :             :                                     wal_segment_size);
    2282                 :             : 
    2283         [ +  + ]:        1944 :     if (recycleSegNo < minSegNo)
    2284                 :        1367 :         recycleSegNo = minSegNo;
    2285         [ +  + ]:        1944 :     if (recycleSegNo > maxSegNo)
    2286                 :         425 :         recycleSegNo = maxSegNo;
    2287                 :             : 
    2288                 :        1944 :     return recycleSegNo;
    2289                 :             : }
    2290                 :             : 
    2291                 :             : /*
    2292                 :             :  * Check whether we've consumed enough xlog space that a checkpoint is needed.
    2293                 :             :  *
    2294                 :             :  * new_segno indicates a log file that has just been filled up (or read
    2295                 :             :  * during recovery). We measure the distance from RedoRecPtr to new_segno
    2296                 :             :  * and see if that exceeds CheckPointSegments.
    2297                 :             :  *
    2298                 :             :  * Note: it is the caller's responsibility to ensure RedoRecPtr is up-to-date.
    2299                 :             :  */
    2300                 :             : bool
    2301                 :        5017 : XLogCheckpointNeeded(XLogSegNo new_segno)
    2302                 :             : {
    2303                 :             :     XLogSegNo   old_segno;
    2304                 :             : 
    2305                 :        5017 :     XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size);
    2306                 :             : 
    2307         [ +  + ]:        5017 :     if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
    2308                 :        3066 :         return true;
    2309                 :        1951 :     return false;
    2310                 :             : }
    2311                 :             : 
    2312                 :             : /*
    2313                 :             :  * Write and/or fsync the log at least as far as WriteRqst indicates.
    2314                 :             :  *
    2315                 :             :  * If flexible == true, we don't have to write as far as WriteRqst, but
    2316                 :             :  * may stop at any convenient boundary (such as a cache or logfile boundary).
    2317                 :             :  * This option allows us to avoid uselessly issuing multiple writes when a
    2318                 :             :  * single one would do.
    2319                 :             :  *
    2320                 :             :  * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
    2321                 :             :  * must be called before grabbing the lock, to make sure the data is ready to
    2322                 :             :  * write.
    2323                 :             :  */
    2324                 :             : static void
    2325                 :     2370756 : XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
    2326                 :             : {
    2327                 :             :     bool        ispartialpage;
    2328                 :             :     bool        last_iteration;
    2329                 :             :     bool        finishing_seg;
    2330                 :             :     int         curridx;
    2331                 :             :     int         npages;
    2332                 :             :     int         startidx;
    2333                 :             :     uint32      startoffset;
    2334                 :             : 
    2335                 :             :     /* We should always be inside a critical section here */
    2336                 :             :     Assert(CritSectionCount > 0);
    2337                 :             : 
    2338                 :             :     /*
    2339                 :             :      * Update local LogwrtResult (caller probably did this already, but...)
    2340                 :             :      */
    2341                 :     2370756 :     RefreshXLogWriteResult(LogwrtResult);
    2342                 :             : 
    2343                 :             :     /*
    2344                 :             :      * Since successive pages in the xlog cache are consecutively allocated,
    2345                 :             :      * we can usually gather multiple pages together and issue just one
    2346                 :             :      * write() call.  npages is the number of pages we have determined can be
    2347                 :             :      * written together; startidx is the cache block index of the first one,
    2348                 :             :      * and startoffset is the file offset at which it should go. The latter
    2349                 :             :      * two variables are only valid when npages > 0, but we must initialize
    2350                 :             :      * all of them to keep the compiler quiet.
    2351                 :             :      */
    2352                 :     2370756 :     npages = 0;
    2353                 :     2370756 :     startidx = 0;
    2354                 :     2370756 :     startoffset = 0;
    2355                 :             : 
    2356                 :             :     /*
    2357                 :             :      * Within the loop, curridx is the cache block index of the page to
    2358                 :             :      * consider writing.  Begin at the buffer containing the next unwritten
    2359                 :             :      * page, or last partially written page.
    2360                 :             :      */
    2361                 :     2370756 :     curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
    2362                 :             : 
    2363         [ +  + ]:     5144770 :     while (LogwrtResult.Write < WriteRqst.Write)
    2364                 :             :     {
    2365                 :             :         /*
    2366                 :             :          * Make sure we're not ahead of the insert process.  This could happen
    2367                 :             :          * if we're passed a bogus WriteRqst.Write that is past the end of the
    2368                 :             :          * last page that's been initialized by AdvanceXLInsertBuffer.
    2369                 :             :          */
    2370                 :     2937078 :         XLogRecPtr  EndPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[curridx]);
    2371                 :             : 
    2372         [ -  + ]:     2937078 :         if (LogwrtResult.Write >= EndPtr)
    2373         [ #  # ]:           0 :             elog(PANIC, "xlog write request %X/%08X is past end of log %X/%08X",
    2374                 :             :                  LSN_FORMAT_ARGS(LogwrtResult.Write),
    2375                 :             :                  LSN_FORMAT_ARGS(EndPtr));
    2376                 :             : 
    2377                 :             :         /* Advance LogwrtResult.Write to end of current buffer page */
    2378                 :     2937078 :         LogwrtResult.Write = EndPtr;
    2379                 :     2937078 :         ispartialpage = WriteRqst.Write < LogwrtResult.Write;
    2380                 :             : 
    2381         [ +  + ]:     2937078 :         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
    2382                 :             :                              wal_segment_size))
    2383                 :             :         {
    2384                 :             :             /*
    2385                 :             :              * Switch to new logfile segment.  We cannot have any pending
    2386                 :             :              * pages here (since we dump what we have at segment end).
    2387                 :             :              */
    2388                 :             :             Assert(npages == 0);
    2389         [ +  + ]:       15067 :             if (openLogFile >= 0)
    2390                 :        6701 :                 XLogFileClose();
    2391                 :       15067 :             XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
    2392                 :             :                             wal_segment_size);
    2393                 :       15067 :             openLogTLI = tli;
    2394                 :             : 
    2395                 :             :             /* create/use new log file */
    2396                 :       15067 :             openLogFile = XLogFileInit(openLogSegNo, tli);
    2397                 :       15067 :             ReserveExternalFD();
    2398                 :             :         }
    2399                 :             : 
    2400                 :             :         /* Make sure we have the current logfile open */
    2401         [ -  + ]:     2937078 :         if (openLogFile < 0)
    2402                 :             :         {
    2403                 :           0 :             XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
    2404                 :             :                             wal_segment_size);
    2405                 :           0 :             openLogTLI = tli;
    2406                 :           0 :             openLogFile = XLogFileOpen(openLogSegNo, tli);
    2407                 :           0 :             ReserveExternalFD();
    2408                 :             :         }
    2409                 :             : 
    2410                 :             :         /* Add current page to the set of pending pages-to-dump */
    2411         [ +  + ]:     2937078 :         if (npages == 0)
    2412                 :             :         {
    2413                 :             :             /* first of group */
    2414                 :     2386219 :             startidx = curridx;
    2415                 :     2386219 :             startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ,
    2416                 :             :                                             wal_segment_size);
    2417                 :             :         }
    2418                 :     2937078 :         npages++;
    2419                 :             : 
    2420                 :             :         /*
    2421                 :             :          * Dump the set if this will be the last loop iteration, or if we are
    2422                 :             :          * at the last page of the cache area (since the next page won't be
    2423                 :             :          * contiguous in memory), or if we are at the end of the logfile
    2424                 :             :          * segment.
    2425                 :             :          */
    2426                 :     2937078 :         last_iteration = WriteRqst.Write <= LogwrtResult.Write;
    2427                 :             : 
    2428         [ +  + ]:     5715540 :         finishing_seg = !ispartialpage &&
    2429         [ +  + ]:     2778462 :             (startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;
    2430                 :             : 
    2431         [ +  + ]:     2937078 :         if (last_iteration ||
    2432   [ +  +  -  + ]:      569236 :             curridx == XLogCtl->XLogCacheBlck ||
    2433                 :             :             finishing_seg)
    2434                 :             :         {
    2435                 :             :             char       *from;
    2436                 :             :             Size        nbytes;
    2437                 :             :             Size        nleft;
    2438                 :             :             ssize_t     written;
    2439                 :             :             instr_time  start;
    2440                 :             : 
    2441                 :             :             /* OK to write the page(s) */
    2442                 :     2386219 :             from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
    2443                 :     2386219 :             nbytes = npages * (Size) XLOG_BLCKSZ;
    2444                 :     2386219 :             nleft = nbytes;
    2445                 :             :             do
    2446                 :             :             {
    2447                 :     2386219 :                 errno = 0;
    2448                 :             : 
    2449                 :             :                 /*
    2450                 :             :                  * Measure I/O timing to write WAL data, for pg_stat_io.
    2451                 :             :                  */
    2452                 :     2386219 :                 start = pgstat_prepare_io_time(track_wal_io_timing);
    2453                 :             : 
    2454                 :     2386219 :                 pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
    2455                 :     2386219 :                 written = pg_pwrite(openLogFile, from, nleft, startoffset);
    2456                 :     2386219 :                 pgstat_report_wait_end();
    2457                 :             : 
    2458         [ -  + ]:     2386219 :                 if (written <= 0)
    2459                 :             :                 {
    2460                 :             :                     char        xlogfname[MAXFNAMELEN];
    2461                 :             :                     int         save_errno;
    2462                 :             : 
    2463         [ #  # ]:           0 :                     if (errno == EINTR)
    2464                 :           0 :                         continue;
    2465                 :             : 
    2466                 :           0 :                     save_errno = errno;
    2467                 :           0 :                     XLogFileName(xlogfname, tli, openLogSegNo,
    2468                 :             :                                  wal_segment_size);
    2469                 :           0 :                     errno = save_errno;
    2470         [ #  # ]:           0 :                     ereport(PANIC,
    2471                 :             :                             (errcode_for_file_access(),
    2472                 :             :                              errmsg("could not write to log file \"%s\" at offset %u, length %zu: %m",
    2473                 :             :                                     xlogfname, startoffset, nleft)));
    2474                 :             :                 }
    2475                 :             : 
    2476                 :     2386219 :                 pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL,
    2477                 :             :                                         IOOP_WRITE, start, 1, written);
    2478                 :     2386219 :                 nleft -= written;
    2479                 :     2386219 :                 from += written;
    2480                 :     2386219 :                 startoffset += written;
    2481         [ -  + ]:     2386219 :             } while (nleft > 0);
    2482                 :             : 
    2483                 :     2386219 :             npages = 0;
    2484                 :             : 
    2485                 :             :             /*
    2486                 :             :              * If we just wrote the whole last page of a logfile segment,
    2487                 :             :              * fsync the segment immediately.  This avoids having to go back
    2488                 :             :              * and re-open prior segments when an fsync request comes along
    2489                 :             :              * later. Doing it here ensures that one and only one backend will
    2490                 :             :              * perform this fsync.
    2491                 :             :              *
    2492                 :             :              * This is also the right place to notify the Archiver that the
    2493                 :             :              * segment is ready to copy to archival storage, and to update the
    2494                 :             :              * timer for archive_timeout, and to signal for a checkpoint if
    2495                 :             :              * too many logfile segments have been used since the last
    2496                 :             :              * checkpoint.
    2497                 :             :              */
    2498         [ +  + ]:     2386219 :             if (finishing_seg)
    2499                 :             :             {
    2500                 :        2063 :                 issue_xlog_fsync(openLogFile, openLogSegNo, tli);
    2501                 :             : 
    2502                 :             :                 /* signal that we need to wakeup walsenders later */
    2503                 :        2063 :                 WalSndWakeupRequest();
    2504                 :             : 
    2505                 :        2063 :                 LogwrtResult.Flush = LogwrtResult.Write;    /* end of page */
    2506                 :             : 
    2507         [ +  + ]:        2063 :                 if (XLogArchivingActive())
    2508                 :         416 :                     XLogArchiveNotifySeg(openLogSegNo, tli);
    2509                 :             : 
    2510                 :        2063 :                 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
    2511                 :        2063 :                 XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;
    2512                 :             : 
    2513                 :             :                 /*
    2514                 :             :                  * Request a checkpoint if we've consumed too much xlog since
    2515                 :             :                  * the last one.  For speed, we first check using the local
    2516                 :             :                  * copy of RedoRecPtr, which might be out of date; if it looks
    2517                 :             :                  * like a checkpoint is needed, forcibly update RedoRecPtr and
    2518                 :             :                  * recheck.
    2519                 :             :                  */
    2520   [ +  +  +  + ]:        2063 :                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
    2521                 :             :                 {
    2522                 :         279 :                     (void) GetRedoRecPtr();
    2523         [ +  + ]:         279 :                     if (XLogCheckpointNeeded(openLogSegNo))
    2524                 :         221 :                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
    2525                 :             :                 }
    2526                 :             :             }
    2527                 :             :         }
    2528                 :             : 
    2529         [ +  + ]:     2937078 :         if (ispartialpage)
    2530                 :             :         {
    2531                 :             :             /* Only asked to write a partial page */
    2532                 :      158616 :             LogwrtResult.Write = WriteRqst.Write;
    2533                 :      158616 :             break;
    2534                 :             :         }
    2535         [ +  + ]:     2778462 :         curridx = NextBufIdx(curridx);
    2536                 :             : 
    2537                 :             :         /* If flexible, break out of loop as soon as we wrote something */
    2538   [ +  +  +  + ]:     2778462 :         if (flexible && npages == 0)
    2539                 :        4448 :             break;
    2540                 :             :     }
    2541                 :             : 
    2542                 :             :     Assert(npages == 0);
    2543                 :             : 
    2544                 :             :     /*
    2545                 :             :      * If asked to flush, do so
    2546                 :             :      */
    2547         [ +  + ]:     2370756 :     if (LogwrtResult.Flush < WriteRqst.Flush &&
    2548         [ +  + ]:      168066 :         LogwrtResult.Flush < LogwrtResult.Write)
    2549                 :             :     {
    2550                 :             :         /*
    2551                 :             :          * Could get here without iterating above loop, in which case we might
    2552                 :             :          * have no open file or the wrong one.  However, we do not need to
    2553                 :             :          * fsync more than one file.
    2554                 :             :          */
    2555         [ +  - ]:      167982 :         if (wal_sync_method != WAL_SYNC_METHOD_OPEN &&
    2556         [ +  - ]:      167982 :             wal_sync_method != WAL_SYNC_METHOD_OPEN_DSYNC)
    2557                 :             :         {
    2558         [ +  + ]:      167982 :             if (openLogFile >= 0 &&
    2559         [ +  + ]:      167955 :                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
    2560                 :             :                                  wal_segment_size))
    2561                 :         180 :                 XLogFileClose();
    2562         [ +  + ]:      167982 :             if (openLogFile < 0)
    2563                 :             :             {
    2564                 :         207 :                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
    2565                 :             :                                 wal_segment_size);
    2566                 :         207 :                 openLogTLI = tli;
    2567                 :         207 :                 openLogFile = XLogFileOpen(openLogSegNo, tli);
    2568                 :         207 :                 ReserveExternalFD();
    2569                 :             :             }
    2570                 :             : 
    2571                 :      167982 :             issue_xlog_fsync(openLogFile, openLogSegNo, tli);
    2572                 :             :         }
    2573                 :             : 
    2574                 :             :         /* signal that we need to wakeup walsenders later */
    2575                 :      167982 :         WalSndWakeupRequest();
    2576                 :             : 
    2577                 :      167982 :         LogwrtResult.Flush = LogwrtResult.Write;
    2578                 :             :     }
    2579                 :             : 
    2580                 :             :     /*
    2581                 :             :      * Update shared-memory status
    2582                 :             :      *
    2583                 :             :      * We make sure that the shared 'request' values do not fall behind the
    2584                 :             :      * 'result' values.  This is not absolutely essential, but it saves some
    2585                 :             :      * code in a couple of places.
    2586                 :             :      */
    2587                 :     2370756 :     SpinLockAcquire(&XLogCtl->info_lck);
    2588         [ +  + ]:     2370756 :     if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
    2589                 :      145674 :         XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
    2590         [ +  + ]:     2370756 :     if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
    2591                 :      169625 :         XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
    2592                 :     2370756 :     SpinLockRelease(&XLogCtl->info_lck);
    2593                 :             : 
    2594                 :             :     /*
    2595                 :             :      * We store Write first, issue a write barrier, then store Flush.  Readers
    2596                 :             :      * must do the opposite (read Flush, a matching read barrier, then Write),
    2597                 :             :      * so that the Flush value they see never runs ahead of the Write value seen.
    2598                 :             :      */
    2599                 :     2370756 :     pg_atomic_write_u64(&XLogCtl->logWriteResult, LogwrtResult.Write);
    2600                 :     2370756 :     pg_write_barrier();
    2601                 :     2370756 :     pg_atomic_write_u64(&XLogCtl->logFlushResult, LogwrtResult.Flush);
    2602                 :             : 
    2603                 :             : #ifdef USE_ASSERT_CHECKING
    2604                 :             :     {
    2605                 :             :         XLogRecPtr  Flush;
    2606                 :             :         XLogRecPtr  Write;
    2607                 :             :         XLogRecPtr  Insert;
    2608                 :             : 
    2609                 :             :         Flush = pg_atomic_read_u64(&XLogCtl->logFlushResult);
    2610                 :             :         pg_read_barrier();
    2611                 :             :         Write = pg_atomic_read_u64(&XLogCtl->logWriteResult);
    2612                 :             :         pg_read_barrier();
    2613                 :             :         Insert = pg_atomic_read_u64(&XLogCtl->logInsertResult);
    2614                 :             : 
    2615                 :             :         /* WAL written to disk is always ahead of WAL flushed */
    2616                 :             :         Assert(Write >= Flush);
    2617                 :             : 
    2618                 :             :         /* WAL inserted to buffers is always ahead of WAL written */
    2619                 :             :         Assert(Insert >= Write);
    2620                 :             :     }
    2621                 :             : #endif
    2622                 :     2370756 : }
    2623                 :             : 
    2624                 :             : /*
    2625                 :             :  * Record the LSN for an asynchronous transaction commit/abort
    2626                 :             :  * and nudge the WALWriter if there is work for it to do.
    2627                 :             :  * (This should not be called for synchronous commits.)
                         :             :  *
                         :             :  * 'asyncXactLSN' is the LSN to record.  While holding info_lck, the shared
                         :             :  * XLogCtl->asyncXactLSN is overwritten only when the new value is greater
                         :             :  * than the stored one, so this function never moves it backwards.
                         :             :  *
                         :             :  * When a wakeup is warranted, it is delivered by setting the procLatch of
                         :             :  * the process recorded in ProcGlobal->walwriterProc.  If that field is
                         :             :  * INVALID_PROC_NUMBER, no latch is set and the function simply returns.
    2628                 :             :  */
    2629                 :             : void
    2630                 :       62264 : XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
    2631                 :             : {
    2632                 :       62264 :     XLogRecPtr  WriteRqstPtr = asyncXactLSN;
    2633                 :             :     bool        sleeping;
    2634                 :       62264 :     bool        wakeup = false;
    2635                 :             :     XLogRecPtr  prevAsyncXactLSN;
    2636                 :             : 
    2637                 :       62264 :     SpinLockAcquire(&XLogCtl->info_lck);
    2638                 :       62264 :     sleeping = XLogCtl->WalWriterSleeping;
    2639                 :       62264 :     prevAsyncXactLSN = XLogCtl->asyncXactLSN;
    2640         [ +  + ]:       62264 :     if (XLogCtl->asyncXactLSN < asyncXactLSN)
    2641                 :       61668 :         XLogCtl->asyncXactLSN = asyncXactLSN;
    2642                 :       62264 :     SpinLockRelease(&XLogCtl->info_lck);
    2643                 :             : 
    2644                 :             :     /*
    2645                 :             :      * If somebody else already called this function with a more aggressive
    2646                 :             :      * LSN, they will have done what we needed (and perhaps more).
                         :             :      *
                         :             :      * The comparison uses the value read before our own update above, and
                         :             :      * an equal LSN is treated as already handled as well.
    2647                 :             :      */
    2648         [ +  + ]:       62264 :     if (asyncXactLSN <= prevAsyncXactLSN)
    2649                 :         596 :         return;
    2650                 :             : 
    2651                 :             :     /*
    2652                 :             :      * If the WALWriter is sleeping, kick it to make it come out of low-power
    2653                 :             :      * mode, so that this async commit will reach disk within the expected
    2654                 :             :      * amount of time.  Otherwise, determine whether it has enough WAL
    2655                 :             :      * available to flush, the same way that XLogBackgroundFlush() does.
                         :             :      *
                         :             :      * Concretely: after refreshing LogwrtResult, count the XLOG_BLCKSZ-sized
                         :             :      * blocks between the flushed position and the requested LSN, and wake
                         :             :      * the walwriter if WalWriterFlushAfter is zero or the count has reached
                         :             :      * WalWriterFlushAfter.
    2656                 :             :      */
    2657         [ +  + ]:       61668 :     if (sleeping)
    2658                 :          29 :         wakeup = true;
    2659                 :             :     else
    2660                 :             :     {
    2661                 :             :         int         flushblocks;
    2662                 :             : 
    2663                 :       61639 :         RefreshXLogWriteResult(LogwrtResult);
    2664                 :             : 
    2665                 :       61639 :         flushblocks =
    2666                 :       61639 :             WriteRqstPtr / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
    2667                 :             : 
    2668   [ +  -  +  + ]:       61639 :         if (WalWriterFlushAfter == 0 || flushblocks >= WalWriterFlushAfter)
    2669                 :        5132 :             wakeup = true;
    2670                 :             :     }
    2671                 :             : 
    2672         [ +  + ]:       61668 :     if (wakeup)
    2673                 :             :     {
    2674                 :        5161 :         volatile PROC_HDR *procglobal = ProcGlobal;
    2675                 :        5161 :         ProcNumber  walwriterProc = procglobal->walwriterProc;
    2676                 :             : 
    2677         [ +  + ]:        5161 :         if (walwriterProc != INVALID_PROC_NUMBER)
    2678                 :         861 :             SetLatch(&GetPGProcByNumber(walwriterProc)->procLatch);
    2679                 :             :     }
    2680                 :             : }
    2681                 :             : 
    2682                 :             : /*
    2683                 :             :  * Record the LSN up to which we can remove WAL because it's not required by
    2684                 :             :  * any replication slot.
                         :             :  *
                         :             :  * 'lsn' is stored into XLogCtl->replicationSlotMinLSN while holding
                         :             :  * info_lck.  The store is unconditional: the previous value is neither
                         :             :  * read nor compared, so the caller decides what the new value should be.
    2685                 :             :  */
    2686                 :             : void
    2687                 :       48810 : XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
    2688                 :             : {
    2689                 :       48810 :     SpinLockAcquire(&XLogCtl->info_lck);
    2690                 :       48810 :     XLogCtl->replicationSlotMinLSN = lsn;
    2691                 :       48810 :     SpinLockRelease(&XLogCtl->info_lck);
    2692                 :       48810 : }
    2693                 :             : 
    2694                 :             : 
    2695                 :             : /*
    2696                 :             :  * Return the oldest LSN we must retain to satisfy the needs of some
    2697                 :             :  * replication slot.
                         :             :  *
                         :             :  * The result is the current value of XLogCtl->replicationSlotMinLSN, copied
                         :             :  * out while holding info_lck and returned without further interpretation.
    2698                 :             :  */
    2699                 :             : XLogRecPtr
    2700                 :        2563 : XLogGetReplicationSlotMinimumLSN(void)
    2701                 :             : {
    2702                 :             :     XLogRecPtr  retval;
    2703                 :             : 
    2704                 :        2563 :     SpinLockAcquire(&XLogCtl->info_lck);
    2705                 :        2563 :     retval = XLogCtl->replicationSlotMinLSN;
    2706                 :        2563 :     SpinLockRelease(&XLogCtl->info_lck);
    2707                 :             : 
    2708                 :        2563 :     return retval;
    2709                 :             : }
    2710                 :             : 
    2711                 :             : /*
    2712                 :             :  * Advance minRecoveryPoint in control file.
    2713                 :             :  *
    2714                 :             :  * If we crash during recovery, we must reach this point again before the
    2715                 :             :  * database is consistent.
    2716                 :             :  *
    2717                 :             :  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
    2718                 :             :  * is only updated if it's not already greater than or equal to 'lsn'.
                         :             :  *
                         :             :  * Nothing is done once the updateMinRecoveryPoint flag has been cleared;
                         :             :  * this function clears it itself when it finds an invalid minRecoveryPoint
                         :             :  * (see the comments in the body).
                         :             :  *
                         :             :  * The control file is modified while holding ControlFileLock exclusively,
                         :             :  * and only when the position returned by GetCurrentReplayRecPtr() is ahead
                         :             :  * of the stored minRecoveryPoint, so the stored value is never moved
                         :             :  * backwards here.  LocalMinRecoveryPoint and LocalMinRecoveryPointTLI are
                         :             :  * refreshed from the control file and then set to any newly written value.
    2719                 :             :  */
    2720                 :             : static void
    2721                 :      127586 : UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
    2722                 :             : {
    2723                 :             :     /* Quick check using our local copy of the variable; no lock is taken yet */
    2724   [ +  +  +  +  :      127586 :     if (!updateMinRecoveryPoint || (!force && lsn <= LocalMinRecoveryPoint))
                   +  + ]
    2725                 :      119808 :         return;
    2726                 :             : 
    2727                 :             :     /*
    2728                 :             :      * An invalid minRecoveryPoint means that we need to recover all the WAL,
    2729                 :             :      * i.e., we're doing crash recovery.  We never modify the control file's
    2730                 :             :      * value in that case, so we can short-circuit future checks here too. The
    2731                 :             :      * local values of minRecoveryPoint and minRecoveryPointTLI should not be
    2732                 :             :      * updated until crash recovery finishes.  We only do this for the startup
    2733                 :             :      * process as it should not update its own reference of minRecoveryPoint
    2734                 :             :      * until it has finished crash recovery to make sure that all WAL
    2735                 :             :      * available is replayed in this case.  This also saves from extra locks
    2736                 :             :      * taken on the control file from the startup process.
    2737                 :             :      */
    2738   [ +  +  +  + ]:        7778 :     if (!XLogRecPtrIsValid(LocalMinRecoveryPoint) && InRecovery)
    2739                 :             :     {
    2740                 :          32 :         updateMinRecoveryPoint = false;
    2741                 :          32 :         return;
    2742                 :             :     }
    2743                 :             : 
    2744                 :        7746 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    2745                 :             : 
    2746                 :             :     /* refresh local copies from the control file, now that we hold the lock */
    2747                 :        7746 :     LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    2748                 :        7746 :     LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    2749                 :             : 
    2750         [ +  + ]:        7746 :     if (!XLogRecPtrIsValid(LocalMinRecoveryPoint))
    2751                 :           1 :         updateMinRecoveryPoint = false;
    2752   [ +  +  +  + ]:        7745 :     else if (force || LocalMinRecoveryPoint < lsn)
    2753                 :             :     {
    2754                 :             :         XLogRecPtr  newMinRecoveryPoint;
    2755                 :             :         TimeLineID  newMinRecoveryPointTLI;
    2756                 :             : 
    2757                 :             :         /*
    2758                 :             :          * To avoid having to update the control file too often, we update it
    2759                 :             :          * all the way to the last record being replayed, even though 'lsn'
    2760                 :             :          * would suffice for correctness.  This also allows the 'force' case
    2761                 :             :          * to not need a valid 'lsn' value.
    2762                 :             :          *
    2763                 :             :          * Another important reason for doing it this way is that the passed
    2764                 :             :          * 'lsn' value could be bogus, i.e., past the end of available WAL, if
    2765                 :             :          * the caller got it from a corrupted heap page.  Accepting such a
    2766                 :             :          * value as the min recovery point would prevent us from coming up at
    2767                 :             :          * all.  Instead, we just log a warning and continue with recovery.
    2768                 :             :          * (See also the comments about corrupt LSNs in XLogFlush.)
    2769                 :             :          */
    2770                 :        6085 :         newMinRecoveryPoint = GetCurrentReplayRecPtr(&newMinRecoveryPointTLI);
    2771   [ +  +  -  + ]:        6085 :         if (!force && newMinRecoveryPoint < lsn)
    2772         [ #  # ]:           0 :             elog(WARNING,
    2773                 :             :                  "xlog min recovery request %X/%08X is past current point %X/%08X",
    2774                 :             :                  LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(newMinRecoveryPoint));
    2775                 :             : 
    2776                 :             :         /* update control file, but never move minRecoveryPoint backwards */
    2777         [ +  + ]:        6085 :         if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
    2778                 :             :         {
    2779                 :        5730 :             ControlFile->minRecoveryPoint = newMinRecoveryPoint;
    2780                 :        5730 :             ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
    2781                 :        5730 :             UpdateControlFile();
    2782                 :        5730 :             LocalMinRecoveryPoint = newMinRecoveryPoint;
    2783                 :        5730 :             LocalMinRecoveryPointTLI = newMinRecoveryPointTLI;
    2784                 :             : 
    2785         [ +  + ]:        5730 :             ereport(DEBUG2,
    2786                 :             :                     errmsg_internal("updated min recovery point to %X/%08X on timeline %u",
    2787                 :             :                                     LSN_FORMAT_ARGS(newMinRecoveryPoint),
    2788                 :             :                                     newMinRecoveryPointTLI));
    2789                 :             :         }
    2790                 :             :     }
    2791                 :        7746 :     LWLockRelease(ControlFileLock);
    2792                 :             : }
    2793                 :             : 
    2794                 :             : /*
    2795                 :             :  * Ensure that all XLOG data through the given position is flushed to disk.
    2796                 :             :  *
    2797                 :             :  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
    2798                 :             :  * already held, and we try to avoid acquiring it if possible.
    2799                 :             :  */
    2800                 :             : void
    2801                 :      885914 : XLogFlush(XLogRecPtr record)
    2802                 :             : {
    2803                 :             :     XLogRecPtr  WriteRqstPtr;
    2804                 :             :     XLogwrtRqst WriteRqst;
    2805                 :      885914 :     TimeLineID  insertTLI = XLogCtl->InsertTimeLineID;
    2806                 :             : 
    2807                 :             :     /*
    2808                 :             :      * During REDO, we are reading not writing WAL.  Therefore, instead of
    2809                 :             :      * trying to flush the WAL, we should update minRecoveryPoint instead. We
    2810                 :             :      * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
    2811                 :             :      * to act this way too, and because when it tries to write the
    2812                 :             :      * end-of-recovery checkpoint, it should indeed flush.
    2813                 :             :      */
    2814         [ +  + ]:      885914 :     if (!XLogInsertAllowed())
    2815                 :             :     {
    2816                 :      127110 :         UpdateMinRecoveryPoint(record, false);
    2817                 :      705649 :         return;
    2818                 :             :     }
    2819                 :             : 
    2820                 :             :     /* Quick exit if already known flushed */
    2821         [ +  + ]:      758804 :     if (record <= LogwrtResult.Flush)
    2822                 :      578539 :         return;
    2823                 :             : 
    2824                 :             : #ifdef WAL_DEBUG
    2825                 :             :     if (XLOG_DEBUG)
    2826                 :             :         elog(LOG, "xlog flush request %X/%08X; write %X/%08X; flush %X/%08X",
    2827                 :             :              LSN_FORMAT_ARGS(record),
    2828                 :             :              LSN_FORMAT_ARGS(LogwrtResult.Write),
    2829                 :             :              LSN_FORMAT_ARGS(LogwrtResult.Flush));
    2830                 :             : #endif
    2831                 :             : 
    2832                 :      180265 :     START_CRIT_SECTION();
    2833                 :             : 
    2834                 :             :     /*
    2835                 :             :      * Since fsync is usually a horribly expensive operation, we try to
    2836                 :             :      * piggyback as much data as we can on each fsync: if we see any more data
    2837                 :             :      * entered into the xlog buffer, we'll write and fsync that too, so that
    2838                 :             :      * the final value of LogwrtResult.Flush is as large as possible. This
    2839                 :             :      * gives us some chance of avoiding another fsync immediately after.
    2840                 :             :      */
    2841                 :             : 
    2842                 :             :     /* initialize to given target; may increase below */
    2843                 :      180265 :     WriteRqstPtr = record;
    2844                 :             : 
    2845                 :             :     /*
    2846                 :             :      * Now wait until we get the write lock, or someone else does the flush
    2847                 :             :      * for us.
    2848                 :             :      */
    2849                 :             :     for (;;)
    2850                 :        2418 :     {
    2851                 :             :         XLogRecPtr  insertpos;
    2852                 :             : 
    2853                 :             :         /* done already? */
    2854                 :      182683 :         RefreshXLogWriteResult(LogwrtResult);
    2855         [ +  + ]:      182683 :         if (record <= LogwrtResult.Flush)
    2856                 :       12632 :             break;
    2857                 :             : 
    2858                 :             :         /*
    2859                 :             :          * Before actually performing the write, wait for all in-flight
    2860                 :             :          * insertions to the pages we're about to write to finish.
    2861                 :             :          */
    2862                 :      170051 :         SpinLockAcquire(&XLogCtl->info_lck);
    2863         [ +  + ]:      170051 :         if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
    2864                 :       12700 :             WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
    2865                 :      170051 :         SpinLockRelease(&XLogCtl->info_lck);
    2866                 :      170051 :         insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
    2867                 :             : 
    2868                 :             :         /*
    2869                 :             :          * Try to get the write lock. If we can't get it immediately, wait
    2870                 :             :          * until it's released, and recheck if we still need to do the flush
    2871                 :             :          * or if the backend that held the lock did it for us already. This
    2872                 :             :          * helps to maintain a good rate of group committing when the system
    2873                 :             :          * is bottlenecked by the speed of fsyncing.
    2874                 :             :          */
    2875         [ +  + ]:      170051 :         if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
    2876                 :             :         {
    2877                 :             :             /*
    2878                 :             :              * The lock is now free, but we didn't acquire it yet. Before we
    2879                 :             :              * do, loop back to check if someone else flushed the record for
    2880                 :             :              * us already.
    2881                 :             :              */
    2882                 :        2418 :             continue;
    2883                 :             :         }
    2884                 :             : 
    2885                 :             :         /* Got the lock; recheck whether request is satisfied */
    2886                 :      167633 :         RefreshXLogWriteResult(LogwrtResult);
    2887         [ +  + ]:      167633 :         if (record <= LogwrtResult.Flush)
    2888                 :             :         {
    2889                 :        4143 :             LWLockRelease(WALWriteLock);
    2890                 :        4143 :             break;
    2891                 :             :         }
    2892                 :             : 
    2893                 :             :         /*
    2894                 :             :          * Sleep before flush! By adding a delay here, we may give further
    2895                 :             :          * backends the opportunity to join the backlog of group commit
    2896                 :             :          * followers; this can significantly improve transaction throughput,
    2897                 :             :          * at the risk of increasing transaction latency.
    2898                 :             :          *
    2899                 :             :          * We do not sleep if enableFsync is not turned on, nor if there are
    2900                 :             :          * fewer than CommitSiblings other backends with active transactions.
    2901                 :             :          */
    2902   [ -  +  -  -  :      163490 :         if (CommitDelay > 0 && enableFsync &&
                   -  - ]
    2903                 :           0 :             MinimumActiveBackends(CommitSiblings))
    2904                 :             :         {
    2905                 :           0 :             pgstat_report_wait_start(WAIT_EVENT_COMMIT_DELAY);
    2906                 :           0 :             pg_usleep(CommitDelay);
    2907                 :           0 :             pgstat_report_wait_end();
    2908                 :             : 
    2909                 :             :             /*
    2910                 :             :              * Re-check how far we can now flush the WAL. It's generally not
    2911                 :             :              * safe to call WaitXLogInsertionsToFinish while holding
    2912                 :             :              * WALWriteLock, because an in-progress insertion might need to
    2913                 :             :              * also grab WALWriteLock to make progress. But we know that all
    2914                 :             :              * the insertions up to insertpos have already finished, because
    2915                 :             :              * that's what the earlier WaitXLogInsertionsToFinish() returned.
    2916                 :             :              * We're only calling it again to allow insertpos to be moved
    2917                 :             :              * further forward, not to actually wait for anyone.
    2918                 :             :              */
    2919                 :           0 :             insertpos = WaitXLogInsertionsToFinish(insertpos);
    2920                 :             :         }
    2921                 :             : 
    2922                 :             :         /* try to write/flush later additions to XLOG as well */
    2923                 :      163490 :         WriteRqst.Write = insertpos;
    2924                 :      163490 :         WriteRqst.Flush = insertpos;
    2925                 :             : 
    2926                 :      163490 :         XLogWrite(WriteRqst, insertTLI, false);
    2927                 :             : 
    2928                 :      163490 :         LWLockRelease(WALWriteLock);
    2929                 :             :         /* done */
    2930                 :      163490 :         break;
    2931                 :             :     }
    2932                 :             : 
    2933                 :      180265 :     END_CRIT_SECTION();
    2934                 :             : 
    2935                 :             :     /* wake up walsenders now that we've released heavily contended locks */
    2936                 :      180265 :     WalSndWakeupProcessRequests(true, !RecoveryInProgress());
    2937                 :             : 
    2938                 :             :     /*
    2939                 :             :      * Wake up processes waiting for primary flush LSN to reach current flush
    2940                 :             :      * position.
    2941                 :             :      */
    2942                 :      180265 :     WaitLSNWakeup(WAIT_LSN_TYPE_PRIMARY_FLUSH, LogwrtResult.Flush);
    2943                 :             : 
    2944                 :             :     /*
    2945                 :             :      * If we still haven't flushed to the request point then we have a
    2946                 :             :      * problem; most likely, the requested flush point is past end of XLOG.
    2947                 :             :      * This has been seen to occur when a disk page has a corrupted LSN.
    2948                 :             :      *
    2949                 :             :      * Formerly we treated this as a PANIC condition, but that hurts the
    2950                 :             :      * system's robustness rather than helping it: we do not want to take down
    2951                 :             :      * the whole system due to corruption on one data page.  In particular, if
    2952                 :             :      * the bad page is encountered again during recovery then we would be
    2953                 :             :      * unable to restart the database at all!  (This scenario actually
    2954                 :             :      * happened in the field several times with 7.1 releases.)  As of 8.4, bad
    2955                 :             :      * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
    2956                 :             :      * the only time we can reach here during recovery is while flushing the
    2957                 :             :      * end-of-recovery checkpoint record, and we don't expect that to have a
    2958                 :             :      * bad LSN.
    2959                 :             :      *
    2960                 :             :      * Note that for calls from xact.c, the ERROR will be promoted to PANIC
    2961                 :             :      * since xact.c calls this routine inside a critical section.  However,
    2962                 :             :      * calls from bufmgr.c are not within critical sections and so we will not
    2963                 :             :      * force a restart for a bad LSN on a data page.
    2964                 :             :      */
    2965         [ -  + ]:      180265 :     if (LogwrtResult.Flush < record)
    2966         [ #  # ]:           0 :         elog(ERROR,
    2967                 :             :              "xlog flush request %X/%08X is not satisfied --- flushed only to %X/%08X",
    2968                 :             :              LSN_FORMAT_ARGS(record),
    2969                 :             :              LSN_FORMAT_ARGS(LogwrtResult.Flush));
    2970                 :             : 
    2971                 :             :     /*
    2972                 :             :      * Cross-check XLogNeedsFlush().  Some of the checks of XLogFlush() and
    2973                 :             :      * XLogNeedsFlush() are duplicated, and this assertion ensures that these
    2974                 :             :      * remain consistent.
    2975                 :             :      */
    2976                 :             :     Assert(!XLogNeedsFlush(record));
    2977                 :             : }
    2978                 :             : 
    2979                 :             : /*
    2980                 :             :  * Write & flush xlog, but without specifying exactly where to.
    2981                 :             :  *
    2982                 :             :  * We normally write only completed blocks; but if there is nothing to do on
    2983                 :             :  * that basis, we check for unwritten async commits in the current incomplete
    2984                 :             :  * block, and write through the latest one of those.  Thus, if async commits
    2985                 :             :  * are not being used, we will write complete blocks only.
    2986                 :             :  *
    2987                 :             :  * If, based on the above, there's anything to write we do so immediately. But
    2988                 :             :  * to avoid calling fsync, fdatasync et al. at a rate that'd impact
    2989                 :             :  * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
    2990                 :             :  * more than wal_writer_flush_after unflushed blocks.
    2991                 :             :  *
    2992                 :             :  * We can guarantee that async commits reach disk after at most three
    2993                 :             :  * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
    2994                 :             :  * to write "flexibly", meaning it can stop at the end of the buffer ring;
    2995                 :             :  * this makes a difference only with very high load or long wal_writer_delay,
    2996                 :             :  * but imposes one extra cycle for the worst case for async commits.)
    2997                 :             :  *
    2998                 :             :  * This routine is invoked periodically by the background walwriter process.
    2999                 :             :  *
    3000                 :             :  * Returns true if there was work to do (even if flushing was skipped due to
    3001                 :             :  * wal_writer_delay/wal_writer_flush_after); false in recovery or if already flushed.
    3002                 :             :  */
    3003                 :             : bool
    3004                 :       15706 : XLogBackgroundFlush(void)
    3005                 :             : {
    3006                 :             :     XLogwrtRqst WriteRqst;
    3007                 :       15706 :     bool        flexible = true;
    3008                 :             :     static TimestampTz lastflush;   /* when we last requested a flush; 0 until then */
    3009                 :             :     TimestampTz now;
    3010                 :             :     int         flushblocks;
    3011                 :             :     TimeLineID  insertTLI;
    3012                 :             : 
    3013                 :             :     /* XLOG doesn't need flushing during recovery */
    3014         [ -  + ]:       15706 :     if (RecoveryInProgress())
    3015                 :           0 :         return false;
    3016                 :             : 
    3017                 :             :     /*
    3018                 :             :      * Since we're not in recovery, InsertTimeLineID is set and can't change,
    3019                 :             :      * so we can read it without a lock.
    3020                 :             :      */
    3021                 :       15706 :     insertTLI = XLogCtl->InsertTimeLineID;
    3022                 :             : 
    3023                 :             :     /* read the shared LogwrtRqst under info_lck */
    3024                 :       15706 :     SpinLockAcquire(&XLogCtl->info_lck);
    3025                 :       15706 :     WriteRqst = XLogCtl->LogwrtRqst;
    3026                 :       15706 :     SpinLockRelease(&XLogCtl->info_lck);
    3027                 :             : 
    3028                 :             :     /* round Write down to the last completed XLOG_BLCKSZ page boundary */
    3029                 :       15706 :     WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
    3030                 :             : 
    3031                 :             :     /* if we have already flushed that far, consider async commit records */
    3032                 :       15706 :     RefreshXLogWriteResult(LogwrtResult);
    3033         [ +  + ]:       15706 :     if (WriteRqst.Write <= LogwrtResult.Flush)
    3034                 :             :     {
    3035                 :       11052 :         SpinLockAcquire(&XLogCtl->info_lck);
    3036                 :       11052 :         WriteRqst.Write = XLogCtl->asyncXactLSN;
    3037                 :       11052 :         SpinLockRelease(&XLogCtl->info_lck);
    3038                 :       11052 :         flexible = false;       /* ensure it all gets written */
    3039                 :             :     }
    3040                 :             : 
    3041                 :             :     /*
    3042                 :             :      * If already known flushed, we're done. Just need to check if we are
    3043                 :             :      * holding an open file handle to a logfile that's no longer in use,
    3044                 :             :      * preventing the file from being deleted.
    3045                 :             :      */
    3046         [ +  + ]:       15706 :     if (WriteRqst.Write <= LogwrtResult.Flush)
    3047                 :             :     {
    3048         [ +  + ]:       10243 :         if (openLogFile >= 0)
    3049                 :             :         {
    3050         [ +  + ]:        6791 :             if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
    3051                 :             :                                  wal_segment_size))
    3052                 :             :             {
    3053                 :         201 :                 XLogFileClose();
    3054                 :             :             }
    3055                 :             :         }
    3056                 :       10243 :         return false;
    3057                 :             :     }
    3058                 :             : 
    3059                 :             :     /*
    3060                 :             :      * Determine how far to flush WAL, based on the wal_writer_delay and
    3061                 :             :      * wal_writer_flush_after GUCs.
    3062                 :             :      *
    3063                 :             :      * Note that XLogSetAsyncXactLSN() performs similar calculation based on
    3064                 :             :      * wal_writer_flush_after, to decide when to wake us up.  Make sure the
    3065                 :             :      * logic is the same in both places if you change this.
    3066                 :             :      */
    3067                 :        5463 :     now = GetCurrentTimestamp();
    3068                 :        5463 :     flushblocks =
    3069                 :        5463 :         WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
    3070                 :             : 
    3071   [ +  -  +  + ]:        5463 :     if (WalWriterFlushAfter == 0 || lastflush == 0)
    3072                 :             :     {
    3073                 :             :         /* first call (lastflush still zero), or block-based limits disabled */
    3074                 :         310 :         WriteRqst.Flush = WriteRqst.Write;
    3075                 :         310 :         lastflush = now;
    3076                 :             :     }
    3077         [ +  + ]:        5153 :     else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
    3078                 :             :     {
    3079                 :             :         /*
    3080                 :             :          * Flush the writes at least every WalWriterDelay ms. This is
    3081                 :             :          * important to bound the amount of time it takes for an asynchronous
    3082                 :             :          * commit to hit disk.
    3083                 :             :          */
    3084                 :        4907 :         WriteRqst.Flush = WriteRqst.Write;
    3085                 :        4907 :         lastflush = now;
    3086                 :             :     }
    3087         [ +  + ]:         246 :     else if (flushblocks >= WalWriterFlushAfter)
    3088                 :             :     {
    3089                 :             :         /* reached wal_writer_flush_after unflushed blocks, flush */
    3090                 :         223 :         WriteRqst.Flush = WriteRqst.Write;
    3091                 :         223 :         lastflush = now;
    3092                 :             :     }
    3093                 :             :     else
    3094                 :             :     {
    3095                 :             :         /* write only; no flushing this time round */
    3096                 :          23 :         WriteRqst.Flush = InvalidXLogRecPtr;
    3097                 :             :     }
    3098                 :             : 
    3099                 :             : #ifdef WAL_DEBUG
    3100                 :             :     if (XLOG_DEBUG)
    3101                 :             :         elog(LOG, "xlog bg flush request write %X/%08X; flush: %X/%08X, current is write %X/%08X; flush %X/%08X",
    3102                 :             :              LSN_FORMAT_ARGS(WriteRqst.Write),
    3103                 :             :              LSN_FORMAT_ARGS(WriteRqst.Flush),
    3104                 :             :              LSN_FORMAT_ARGS(LogwrtResult.Write),
    3105                 :             :              LSN_FORMAT_ARGS(LogwrtResult.Flush));
    3106                 :             : #endif
    3107                 :             : 
    3108                 :        5463 :     START_CRIT_SECTION();
    3109                 :             : 
    3110                 :             :     /* now wait for any in-progress insertions to finish and get write lock */
    3111                 :        5463 :     WaitXLogInsertionsToFinish(WriteRqst.Write);
    3112                 :        5463 :     LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
    3113                 :        5463 :     RefreshXLogWriteResult(LogwrtResult);
    3114         [ +  + ]:        5463 :     if (WriteRqst.Write > LogwrtResult.Write ||
    3115         [ +  + ]:         218 :         WriteRqst.Flush > LogwrtResult.Flush)
    3116                 :             :     {
    3117                 :        5361 :         XLogWrite(WriteRqst, insertTLI, flexible);
    3118                 :             :     }
    3119                 :        5463 :     LWLockRelease(WALWriteLock);
    3120                 :             : 
    3121                 :        5463 :     END_CRIT_SECTION();
    3122                 :             : 
    3123                 :             :     /* wake up walsenders now that we've released heavily contended locks */
    3124                 :        5463 :     WalSndWakeupProcessRequests(true, !RecoveryInProgress());
    3125                 :             : 
    3126                 :             :     /*
    3127                 :             :      * Wake up processes waiting for primary flush LSN to reach current flush
    3128                 :             :      * position.
    3129                 :             :      */
    3130                 :        5463 :     WaitLSNWakeup(WAIT_LSN_TYPE_PRIMARY_FLUSH, LogwrtResult.Flush);
    3131                 :             : 
    3132                 :             :     /*
    3133                 :             :      * Great, done. To take some work off the critical path, try to initialize
    3134                 :             :      * as many of the no-longer-needed WAL buffers for future use as we can.
    3135                 :             :      */
    3136                 :        5463 :     AdvanceXLInsertBuffer(InvalidXLogRecPtr, insertTLI, true);
    3137                 :             : 
    3138                 :             :     /*
    3139                 :             :      * If we determined that we need to write data, but somebody else
    3140                 :             :      * wrote/flushed already, it should be considered as being active, to
    3141                 :             :      * avoid hibernating too early.
    3142                 :             :      */
    3143                 :        5463 :     return true;
    3144                 :             : }
    3145                 :             : 
    3146                 :             : /*
    3147                 :             :  * Test whether XLOG data has been flushed up to (at least) the given
    3148                 :             :  * position, or whether the minimum recovery point has been updated past
    3149                 :             :  * the given position.
    3150                 :             :  *
    3151                 :             :  * Returns true if a flush is still needed, or if the minimum recovery point
    3152                 :             :  * must be updated.
    3153                 :             :  *
    3154                 :             :  * It is possible that someone else is already in the process of flushing
    3155                 :             :  * that far, or has updated the minimum recovery point up to the given
    3156                 :             :  * position.
    3157                 :             :  */
    3158                 :             : bool
    3159                 :    16312631 : XLogNeedsFlush(XLogRecPtr record)
    3160                 :             : {
    3161                 :             :     /*
    3162                 :             :      * During recovery, we don't flush WAL but update minRecoveryPoint
    3163                 :             :      * instead. So "needs flush" is taken to mean whether minRecoveryPoint
    3164                 :             :      * would need to be updated.
    3165                 :             :      *
    3166                 :             :      * Using XLogInsertAllowed() rather than RecoveryInProgress() matters for
    3167                 :             :      * the case of an end-of-recovery checkpoint, where WAL data is flushed.
    3168                 :             :      * This check should be consistent with the one in XLogFlush().
    3169                 :             :      */
    3170         [ +  + ]:    16312631 :     if (!XLogInsertAllowed())
    3171                 :             :     {
    3172                 :             :         /* Quick exit if already known to be updated or cannot be updated */
    3173   [ +  +  +  + ]:      557764 :         if (!updateMinRecoveryPoint || record <= LocalMinRecoveryPoint)
    3174                 :      551055 :             return false;
    3175                 :             : 
    3176                 :             :         /*
    3177                 :             :          * An invalid minRecoveryPoint means that we need to recover all the
    3178                 :             :          * WAL, i.e., we're doing crash recovery.  We never modify the control
    3179                 :             :          * file's value in that case, so we can short-circuit future checks
    3180                 :             :          * here too.  This triggers a quick exit path for the startup process,
    3181                 :             :          * which cannot update its local copy of minRecoveryPoint as long as
    3182                 :             :          * it has not replayed all WAL available when doing crash recovery.
    3183                 :             :          */
    3184   [ +  +  -  + ]:        6709 :         if (!XLogRecPtrIsValid(LocalMinRecoveryPoint) && InRecovery)
    3185                 :             :         {
    3186                 :           0 :             updateMinRecoveryPoint = false;
    3187                 :           0 :             return false;
    3188                 :             :         }
    3189                 :             : 
    3190                 :             :         /*
    3191                 :             :          * Update local copy of minRecoveryPoint. But if the lock is busy,
    3192                 :             :          * just return a conservative guess.
    3193                 :             :          */
    3194         [ -  + ]:        6709 :         if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
    3195                 :           0 :             return true;
    3196                 :        6709 :         LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    3197                 :        6709 :         LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    3198                 :        6709 :         LWLockRelease(ControlFileLock);
    3199                 :             : 
    3200                 :             :         /*
    3201                 :             :          * This check covers processes other than the startup process: an
    3202                 :             :          * invalid minRecoveryPoint means crash recovery is still running, so
    3203                 :             :          * the control file value should not be updated.
    3204                 :             :          */
    3205         [ +  + ]:        6709 :         if (!XLogRecPtrIsValid(LocalMinRecoveryPoint))
    3206                 :           2 :             updateMinRecoveryPoint = false;
    3207                 :             : 
    3208                 :             :         /* check again, now using the refreshed local copy */
    3209   [ +  +  +  + ]:        6709 :         if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
    3210                 :          97 :             return false;
    3211                 :             :         else
    3212                 :        6612 :             return true;
    3213                 :             :     }
    3214                 :             : 
    3215                 :             :     /* Quick exit if already known flushed according to our local state */
    3216         [ +  + ]:    15754867 :     if (record <= LogwrtResult.Flush)
    3217                 :    15528849 :         return false;
    3218                 :             : 
    3219                 :             :     /* read LogwrtResult and update local state */
    3220                 :      226018 :     RefreshXLogWriteResult(LogwrtResult);
    3221                 :             : 
    3222                 :             :     /* check again against the refreshed flush position */
    3223         [ +  + ]:      226018 :     if (record <= LogwrtResult.Flush)
    3224                 :        3356 :         return false;
    3225                 :             : 
    3226                 :      222662 :     return true;
    3227                 :             : }
    3228                 :             : 
    3229                 :             : /*
    3230                 :             :  * Try to make a given XLOG file segment exist.
    3231                 :             :  *
    3232                 :             :  * logsegno: identify segment.
    3233                 :             :  *
    3234                 :             :  * *added: on return, true if this call raised the number of extant segments.
    3235                 :             :  *
    3236                 :             :  * path: on return, this char[MAXPGPATH] has the path to the logsegno file.
    3237                 :             :  *
    3238                 :             :  * Returns -1 or FD of opened file.  A -1 here is not an error; a caller
    3239                 :             :  * wanting an open segment should attempt to open "path", which usually will
    3240                 :             :  * succeed.  (This is weird, but it's efficient for the callers.)
    3241                 :             :  */
    3242                 :             : static int
    3243                 :       16247 : XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
    3244                 :             :                      bool *added, char *path)
    3245                 :             : {
    3246                 :             :     char        tmppath[MAXPGPATH];
    3247                 :             :     XLogSegNo   installed_segno;
    3248                 :             :     XLogSegNo   max_segno;
    3249                 :             :     int         fd;
    3250                 :             :     int         save_errno;
    3251                 :       16247 :     int         open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY;
    3252                 :             :     instr_time  io_start;
    3253                 :             : 
    3254                 :             :     Assert(logtli != 0);
    3255                 :             : 
    3256                 :       16247 :     XLogFilePath(path, logtli, logsegno, wal_segment_size);
    3257                 :             : 
    3258                 :             :     /*
    3259                 :             :      * Try to use existent file (checkpoint maker may have created it already)
    3260                 :             :      */
    3261                 :       16247 :     *added = false;
    3262                 :       16247 :     fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
    3263                 :       16247 :                        get_sync_bit(wal_sync_method));
    3264         [ +  + ]:       16247 :     if (fd < 0)
    3265                 :             :     {
    3266         [ -  + ]:        1505 :         if (errno != ENOENT)
    3267         [ #  # ]:           0 :             ereport(ERROR,
    3268                 :             :                     (errcode_for_file_access(),
    3269                 :             :                      errmsg("could not open file \"%s\": %m", path)));
    3270                 :             :     }
    3271                 :             :     else
    3272                 :       14742 :         return fd;
    3273                 :             : 
    3274                 :             :     /*
    3275                 :             :      * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
    3276                 :             :      * another process is doing the same thing.  If so, we will end up
    3277                 :             :      * pre-creating an extra log segment.  That seems OK, and better than
    3278                 :             :      * holding the lock throughout this lengthy process.
    3279                 :             :      */
    3280         [ +  + ]:        1505 :     elog(DEBUG2, "creating and filling new WAL file");
    3281                 :             : 
    3282                 :        1505 :     snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
    3283                 :             : 
    3284                 :        1505 :     unlink(tmppath);
    3285                 :             : 
    3286         [ -  + ]:        1505 :     if (io_direct_flags & IO_DIRECT_WAL_INIT)
    3287                 :           0 :         open_flags |= PG_O_DIRECT;
    3288                 :             : 
    3289                 :             :     /* do not use get_sync_bit() here --- want to fsync only at end of fill */
    3290                 :        1505 :     fd = BasicOpenFile(tmppath, open_flags);
    3291         [ -  + ]:        1505 :     if (fd < 0)
    3292         [ #  # ]:           0 :         ereport(ERROR,
    3293                 :             :                 (errcode_for_file_access(),
    3294                 :             :                  errmsg("could not create file \"%s\": %m", tmppath)));
    3295                 :             : 
    3296                 :             :     /* Measure I/O timing when initializing segment */
    3297                 :        1505 :     io_start = pgstat_prepare_io_time(track_wal_io_timing);
    3298                 :             : 
    3299                 :        1505 :     pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
    3300                 :        1505 :     save_errno = 0;
    3301         [ +  - ]:        1505 :     if (wal_init_zero)
    3302                 :             :     {
    3303                 :             :         ssize_t     rc;
    3304                 :             : 
    3305                 :             :         /*
    3306                 :             :          * Zero-fill the file.  With this setting, we do this the hard way to
    3307                 :             :          * ensure that all the file space has really been allocated.  On
    3308                 :             :          * platforms that allow "holes" in files, just seeking to the end
    3309                 :             :          * doesn't allocate intermediate space.  This way, we know that we
    3310                 :             :          * have all the space and (after the fsync below) that all the
    3311                 :             :          * indirect blocks are down on disk.  Therefore, fdatasync(2) or
    3312                 :             :          * O_DSYNC will be sufficient to sync future writes to the log file.
    3313                 :             :          */
    3314                 :        1505 :         rc = pg_pwrite_zeros(fd, wal_segment_size, 0);
    3315                 :             : 
    3316         [ -  + ]:        1505 :         if (rc < 0)
    3317                 :           0 :             save_errno = errno;
    3318                 :             :     }
    3319                 :             :     else
    3320                 :             :     {
    3321                 :             :         /*
    3322                 :             :          * Otherwise, seeking to the end and writing a solitary byte is
    3323                 :             :          * enough.
    3324                 :             :          */
    3325                 :           0 :         errno = 0;
    3326         [ #  # ]:           0 :         if (pg_pwrite(fd, "\0", 1, wal_segment_size - 1) != 1)
    3327                 :             :         {
    3328                 :             :             /* if write didn't set errno, assume no disk space */
    3329         [ #  # ]:           0 :             save_errno = errno ? errno : ENOSPC;
    3330                 :             :         }
    3331                 :             :     }
    3332                 :        1505 :     pgstat_report_wait_end();
    3333                 :             : 
    3334         [ -  + ]:        1505 :     if (save_errno)
    3335                 :             :     {
    3336                 :             :         /*
    3337                 :             :          * If we fail to make the file, delete it to release disk space
    3338                 :             :          */
    3339                 :           0 :         unlink(tmppath);
    3340                 :             : 
    3341                 :           0 :         close(fd);
    3342                 :             : 
    3343                 :           0 :         errno = save_errno;
    3344                 :             : 
    3345         [ #  # ]:           0 :         ereport(ERROR,
    3346                 :             :                 (errcode_for_file_access(),
    3347                 :             :                  errmsg("could not write to file \"%s\": %m", tmppath)));
    3348                 :             :     }
    3349                 :             : 
    3350                 :             :     /*
    3351                 :             :      * A full segment worth of data is written when using wal_init_zero. One
    3352                 :             :      * byte is written when not using it.
    3353                 :             :      */
    3354                 :        1505 :     pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT, IOOP_WRITE,
    3355                 :             :                             io_start, 1,
    3356         [ +  - ]:        1505 :                             wal_init_zero ? wal_segment_size : 1);
    3357                 :             : 
    3358                 :             :     /* Measure I/O timing when flushing segment */
    3359                 :        1505 :     io_start = pgstat_prepare_io_time(track_wal_io_timing);
    3360                 :             : 
    3361                 :        1505 :     pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
    3362         [ -  + ]:        1505 :     if (pg_fsync(fd) != 0)
    3363                 :             :     {
    3364                 :           0 :         save_errno = errno;
    3365                 :           0 :         close(fd);
    3366                 :           0 :         errno = save_errno;
    3367         [ #  # ]:           0 :         ereport(ERROR,
    3368                 :             :                 (errcode_for_file_access(),
    3369                 :             :                  errmsg("could not fsync file \"%s\": %m", tmppath)));
    3370                 :             :     }
    3371                 :        1505 :     pgstat_report_wait_end();
    3372                 :             : 
    3373                 :        1505 :     pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT,
    3374                 :             :                             IOOP_FSYNC, io_start, 1, 0);
    3375                 :             : 
    3376         [ -  + ]:        1505 :     if (close(fd) != 0)
    3377         [ #  # ]:           0 :         ereport(ERROR,
    3378                 :             :                 (errcode_for_file_access(),
    3379                 :             :                  errmsg("could not close file \"%s\": %m", tmppath)));
    3380                 :             : 
    3381                 :             :     /*
    3382                 :             :      * Now move the segment into place with its final name.  Cope with
    3383                 :             :      * possibility that someone else has created the file while we were
    3384                 :             :      * filling ours: if so, use ours to pre-create a future log segment.
    3385                 :             :      */
    3386                 :        1505 :     installed_segno = logsegno;
    3387                 :             : 
    3388                 :             :     /*
    3389                 :             :      * XXX: What should we use as max_segno? We used to use XLOGfileslop when
    3390                 :             :      * that was a constant, but that was always a bit dubious: normally, at a
    3391                 :             :      * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
    3392                 :             :      * here, it was the offset from the insert location. We can't do the
    3393                 :             :      * normal XLOGfileslop calculation here because we don't have access to
    3394                 :             :      * the prior checkpoint's redo location. So somewhat arbitrarily, just use
    3395                 :             :      * CheckPointSegments.
    3396                 :             :      */
    3397                 :        1505 :     max_segno = logsegno + CheckPointSegments;
    3398         [ +  - ]:        1505 :     if (InstallXLogFileSegment(&installed_segno, tmppath, true, max_segno,
    3399                 :             :                                logtli))
    3400                 :             :     {
    3401                 :        1505 :         *added = true;
    3402         [ +  + ]:        1505 :         elog(DEBUG2, "done creating and filling new WAL file");
    3403                 :             :     }
    3404                 :             :     else
    3405                 :             :     {
    3406                 :             :         /*
    3407                 :             :          * No need for any more future segments, or InstallXLogFileSegment()
    3408                 :             :          * failed to rename the file into place. If the rename failed, a
    3409                 :             :          * caller opening the file may fail.
    3410                 :             :          */
    3411                 :           0 :         unlink(tmppath);
    3412         [ #  # ]:           0 :         elog(DEBUG2, "abandoned new WAL file");
    3413                 :             :     }
    3414                 :             : 
    3415                 :        1505 :     return -1;
    3416                 :             : }
    3417                 :             : 
    3418                 :             : /*
    3419                 :             :  * Create a new XLOG file segment, or open a pre-existing one.
    3420                 :             :  *
    3421                 :             :  * logsegno: identify segment to be created/opened.
    3422                 :             :  *
    3423                 :             :  * Returns FD of opened file.
    3424                 :             :  *
    3425                 :             :  * Note: errors here are ERROR not PANIC because we might or might not be
    3426                 :             :  * inside a critical section (eg, during checkpoint there is no reason to
    3427                 :             :  * take down the system on failure).  They will promote to PANIC if we are
    3428                 :             :  * in a critical section.
    3429                 :             :  */
    3430                 :             : int
    3431                 :       16032 : XLogFileInit(XLogSegNo logsegno, TimeLineID logtli)
    3432                 :             : {
    3433                 :             :     bool        ignore_added;
    3434                 :             :     char        path[MAXPGPATH];
    3435                 :             :     int         fd;
    3436                 :             : 
    3437                 :             :     Assert(logtli != 0);
    3438                 :             : 
    3439                 :       16032 :     fd = XLogFileInitInternal(logsegno, logtli, &ignore_added, path);
    3440         [ +  + ]:       16032 :     if (fd >= 0)
    3441                 :       14591 :         return fd;
    3442                 :             : 
    3443                 :             :     /* Now open original target segment (might not be file I just made) */
    3444                 :        1441 :     fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
    3445                 :        1441 :                        get_sync_bit(wal_sync_method));
    3446         [ -  + ]:        1441 :     if (fd < 0)
    3447         [ #  # ]:           0 :         ereport(ERROR,
    3448                 :             :                 (errcode_for_file_access(),
    3449                 :             :                  errmsg("could not open file \"%s\": %m", path)));
    3450                 :        1441 :     return fd;
    3451                 :             : }
    3452                 :             : 
    3453                 :             : /*
    3454                 :             :  * Create a new XLOG file segment by copying a pre-existing one.
    3455                 :             :  *
    3456                 :             :  * destsegno: identify segment to be created.
    3457                 :             :  *
    3458                 :             :  * srcTLI, srcsegno: identify segment to be copied (could be from
    3459                 :             :  *      a different timeline)
    3460                 :             :  *
    3461                 :             :  * upto: how much of the source file to copy (the rest is filled with
    3462                 :             :  *      zeros)
    3463                 :             :  *
    3464                 :             :  * Currently this is only used during recovery, and so there are no locking
    3465                 :             :  * considerations.  But we should be just as tense as XLogFileInit to avoid
    3466                 :             :  * emplacing a bogus file.
    3467                 :             :  */
    3468                 :             : static void
    3469                 :          47 : XLogFileCopy(TimeLineID destTLI, XLogSegNo destsegno,
    3470                 :             :              TimeLineID srcTLI, XLogSegNo srcsegno,
    3471                 :             :              int upto)
    3472                 :             : {
    3473                 :             :     char        path[MAXPGPATH];
    3474                 :             :     char        tmppath[MAXPGPATH];
    3475                 :             :     PGAlignedXLogBlock buffer;
    3476                 :             :     int         srcfd;
    3477                 :             :     int         fd;
    3478                 :             :     int         nbytes;
    3479                 :             : 
    3480                 :             :     /*
    3481                 :             :      * Open the source file
    3482                 :             :      */
    3483                 :          47 :     XLogFilePath(path, srcTLI, srcsegno, wal_segment_size);
    3484                 :          47 :     srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
    3485         [ -  + ]:          47 :     if (srcfd < 0)
    3486         [ #  # ]:           0 :         ereport(ERROR,
    3487                 :             :                 (errcode_for_file_access(),
    3488                 :             :                  errmsg("could not open file \"%s\": %m", path)));
    3489                 :             : 
    3490                 :             :     /*
    3491                 :             :      * Copy into a temp file name.
    3492                 :             :      */
    3493                 :          47 :     snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
    3494                 :             : 
    3495                 :          47 :     unlink(tmppath);
    3496                 :             : 
    3497                 :             :     /* do not use get_sync_bit() here --- want to fsync only at end of fill */
    3498                 :          47 :     fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
    3499         [ -  + ]:          47 :     if (fd < 0)
    3500         [ #  # ]:           0 :         ereport(ERROR,
    3501                 :             :                 (errcode_for_file_access(),
    3502                 :             :                  errmsg("could not create file \"%s\": %m", tmppath)));
    3503                 :             : 
    3504                 :             :     /*
    3505                 :             :      * Do the data copying.
    3506                 :             :      */
    3507         [ +  + ]:       96303 :     for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer))
    3508                 :             :     {
    3509                 :             :         int         nread;
    3510                 :             : 
    3511                 :       96256 :         nread = upto - nbytes;
    3512                 :             : 
    3513                 :             :         /*
    3514                 :             :          * The part that is not read from the source file is filled with
    3515                 :             :          * zeros.
    3516                 :             :          */
    3517         [ +  + ]:       96256 :         if (nread < sizeof(buffer))
    3518                 :          47 :             memset(buffer.data, 0, sizeof(buffer));
    3519                 :             : 
    3520         [ +  + ]:       96256 :         if (nread > 0)
    3521                 :             :         {
    3522                 :             :             int         r;
    3523                 :             : 
    3524         [ +  + ]:        4425 :             if (nread > sizeof(buffer))
    3525                 :        4378 :                 nread = sizeof(buffer);
    3526                 :        4425 :             pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
    3527                 :        4425 :             r = read(srcfd, buffer.data, nread);
    3528         [ -  + ]:        4425 :             if (r != nread)
    3529                 :             :             {
    3530         [ #  # ]:           0 :                 if (r < 0)
    3531         [ #  # ]:           0 :                     ereport(ERROR,
    3532                 :             :                             (errcode_for_file_access(),
    3533                 :             :                              errmsg("could not read file \"%s\": %m",
    3534                 :             :                                     path)));
    3535                 :             :                 else
    3536         [ #  # ]:           0 :                     ereport(ERROR,
    3537                 :             :                             (errcode(ERRCODE_DATA_CORRUPTED),
    3538                 :             :                              errmsg("could not read file \"%s\": read %d of %zu",
    3539                 :             :                                     path, r, (Size) nread)));
    3540                 :             :             }
    3541                 :        4425 :             pgstat_report_wait_end();
    3542                 :             :         }
    3543                 :       96256 :         errno = 0;
    3544                 :       96256 :         pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
    3545         [ -  + ]:       96256 :         if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer))
    3546                 :             :         {
    3547                 :           0 :             int         save_errno = errno;
    3548                 :             : 
    3549                 :             :             /*
    3550                 :             :              * If we fail to make the file, delete it to release disk space
    3551                 :             :              */
    3552                 :           0 :             unlink(tmppath);
    3553                 :             :             /* if write didn't set errno, assume problem is no disk space */
    3554         [ #  # ]:           0 :             errno = save_errno ? save_errno : ENOSPC;
    3555                 :             : 
    3556         [ #  # ]:           0 :             ereport(ERROR,
    3557                 :             :                     (errcode_for_file_access(),
    3558                 :             :                      errmsg("could not write to file \"%s\": %m", tmppath)));
    3559                 :             :         }
    3560                 :       96256 :         pgstat_report_wait_end();
    3561                 :             :     }
    3562                 :             : 
    3563                 :          47 :     pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
    3564         [ -  + ]:          47 :     if (pg_fsync(fd) != 0)
    3565         [ #  # ]:           0 :         ereport(data_sync_elevel(ERROR),
    3566                 :             :                 (errcode_for_file_access(),
    3567                 :             :                  errmsg("could not fsync file \"%s\": %m", tmppath)));
    3568                 :          47 :     pgstat_report_wait_end();
    3569                 :             : 
    3570         [ -  + ]:          47 :     if (CloseTransientFile(fd) != 0)
    3571         [ #  # ]:           0 :         ereport(ERROR,
    3572                 :             :                 (errcode_for_file_access(),
    3573                 :             :                  errmsg("could not close file \"%s\": %m", tmppath)));
    3574                 :             : 
    3575         [ -  + ]:          47 :     if (CloseTransientFile(srcfd) != 0)
    3576         [ #  # ]:           0 :         ereport(ERROR,
    3577                 :             :                 (errcode_for_file_access(),
    3578                 :             :                  errmsg("could not close file \"%s\": %m", path)));
    3579                 :             : 
    3580                 :             :     /*
    3581                 :             :      * Now move the segment into place with its final name.
    3582                 :             :      */
    3583         [ -  + ]:          47 :     if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, destTLI))
    3584         [ #  # ]:           0 :         elog(ERROR, "InstallXLogFileSegment should not have failed");
    3585                 :          47 : }
    3586                 :             : 
    3587                 :             : /*
    3588                 :             :  * Install a new XLOG segment file as a current or future log segment.
    3589                 :             :  *
    3590                 :             :  * This is used both to install a newly-created segment (which has a temp
    3591                 :             :  * filename while it's being created) and to recycle an old segment.
    3592                 :             :  *
    3593                 :             :  * *segno: identify segment to install as (or first possible target).
    3594                 :             :  * When find_free is true, this is modified on return to indicate the
    3595                 :             :  * actual installation location or last segment searched.
    3596                 :             :  *
    3597                 :             :  * tmppath: initial name of file to install.  It will be renamed into place.
    3598                 :             :  *
    3599                 :             :  * find_free: if true, install the new segment at the first empty segno
    3600                 :             :  * number at or after the passed numbers.  If false, install the new segment
    3601                 :             :  * exactly where specified, deleting any existing segment file there.
    3602                 :             :  *
    3603                 :             :  * max_segno: maximum segment number to install the new file as.  Fail if no
    3604                 :             :  * free slot is found between *segno and max_segno. (Ignored when find_free
    3605                 :             :  * is false.)
    3606                 :             :  *
    3607                 :             :  * tli: The timeline on which the new segment should be installed.
    3608                 :             :  *
    3609                 :             :  * Returns true if the file was installed successfully.  false indicates that
    3610                 :             :  * max_segno limit was exceeded, the startup process has disabled this
    3611                 :             :  * function for now, or an error occurred while renaming the file into place.
    3612                 :             :  */
    3613                 :             : static bool
    3614                 :        3280 : InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
    3615                 :             :                        bool find_free, XLogSegNo max_segno, TimeLineID tli)
    3616                 :             : {
    3617                 :             :     char        path[MAXPGPATH];
    3618                 :             :     struct stat stat_buf;
    3619                 :             : 
    3620                 :             :     Assert(tli != 0);
    3621                 :             : 
    3622                 :        3280 :     XLogFilePath(path, tli, *segno, wal_segment_size);
    3623                 :             : 
    3624                 :        3280 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    3625         [ -  + ]:        3280 :     if (!XLogCtl->InstallXLogFileSegmentActive)
    3626                 :             :     {
    3627                 :           0 :         LWLockRelease(ControlFileLock);
    3628                 :           0 :         return false;
    3629                 :             :     }
    3630                 :             : 
    3631         [ +  + ]:        3280 :     if (!find_free)
    3632                 :             :     {
    3633                 :             :         /* Force installation: get rid of any pre-existing segment file */
    3634                 :          47 :         durable_unlink(path, DEBUG1);
    3635                 :             :     }
    3636                 :             :     else
    3637                 :             :     {
    3638                 :             :         /* Find a free slot to put it in */
    3639         [ +  + ]:        4541 :         while (stat(path, &stat_buf) == 0)
    3640                 :             :         {
    3641         [ +  + ]:        1472 :             if ((*segno) >= max_segno)
    3642                 :             :             {
    3643                 :             :                 /* Failed to find a free slot within specified range */
    3644                 :         164 :                 LWLockRelease(ControlFileLock);
    3645                 :         164 :                 return false;
    3646                 :             :             }
    3647                 :        1308 :             (*segno)++;
    3648                 :        1308 :             XLogFilePath(path, tli, *segno, wal_segment_size);
    3649                 :             :         }
    3650                 :             :     }
    3651                 :             : 
    3652                 :             :     Assert(access(path, F_OK) != 0 && errno == ENOENT);
    3653         [ -  + ]:        3116 :     if (durable_rename(tmppath, path, LOG) != 0)
    3654                 :             :     {
    3655                 :           0 :         LWLockRelease(ControlFileLock);
    3656                 :             :         /* durable_rename already emitted log message */
    3657                 :           0 :         return false;
    3658                 :             :     }
    3659                 :             : 
    3660                 :        3116 :     LWLockRelease(ControlFileLock);
    3661                 :             : 
    3662                 :        3116 :     return true;
    3663                 :             : }
    3664                 :             : 
    3665                 :             : /*
    3666                 :             :  * Open a pre-existing logfile segment for writing.
    3667                 :             :  */
    3668                 :             : int
    3669                 :         207 : XLogFileOpen(XLogSegNo segno, TimeLineID tli)
    3670                 :             : {
    3671                 :             :     char        path[MAXPGPATH];
    3672                 :             :     int         fd;
    3673                 :             : 
    3674                 :         207 :     XLogFilePath(path, tli, segno, wal_segment_size);
    3675                 :             : 
    3676                 :         207 :     fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
    3677                 :         207 :                        get_sync_bit(wal_sync_method));
    3678         [ -  + ]:         207 :     if (fd < 0)
    3679         [ #  # ]:           0 :         ereport(PANIC,
    3680                 :             :                 (errcode_for_file_access(),
    3681                 :             :                  errmsg("could not open file \"%s\": %m", path)));
    3682                 :             : 
    3683                 :         207 :     return fd;
    3684                 :             : }
    3685                 :             : 
    3686                 :             : /*
    3687                 :             :  * Close the current logfile segment for writing.
    3688                 :             :  */
    3689                 :             : static void
    3690                 :        7082 : XLogFileClose(void)
    3691                 :             : {
    3692                 :             :     Assert(openLogFile >= 0);
    3693                 :             : 
    3694                 :             :     /*
    3695                 :             :      * WAL segment files will not be re-read in normal operation, so we advise
    3696                 :             :      * the OS to release any cached pages.  But do not do so if WAL archiving
    3697                 :             :      * or streaming is active, because archiver and walsender process could
    3698                 :             :      * use the cache to read the WAL segment.
    3699                 :             :      */
    3700                 :             : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
    3701   [ +  +  +  - ]:        7082 :     if (!XLogIsNeeded() && (io_direct_flags & IO_DIRECT_WAL) == 0)
    3702                 :         139 :         (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
    3703                 :             : #endif
    3704                 :             : 
    3705         [ -  + ]:        7082 :     if (close(openLogFile) != 0)
    3706                 :             :     {
    3707                 :             :         char        xlogfname[MAXFNAMELEN];
    3708                 :           0 :         int         save_errno = errno;
    3709                 :             : 
    3710                 :           0 :         XLogFileName(xlogfname, openLogTLI, openLogSegNo, wal_segment_size);
    3711                 :           0 :         errno = save_errno;
    3712         [ #  # ]:           0 :         ereport(PANIC,
    3713                 :             :                 (errcode_for_file_access(),
    3714                 :             :                  errmsg("could not close file \"%s\": %m", xlogfname)));
    3715                 :             :     }
    3716                 :             : 
    3717                 :        7082 :     openLogFile = -1;
    3718                 :        7082 :     ReleaseExternalFD();
    3719                 :        7082 : }
    3720                 :             : 
    3721                 :             : /*
    3722                 :             :  * Preallocate log files beyond the specified log endpoint.
    3723                 :             :  *
    3724                 :             :  * XXX this is currently extremely conservative, since it forces only one
    3725                 :             :  * future log segment to exist, and even that only if we are 75% done with
    3726                 :             :  * the current one.  This is only appropriate for very low-WAL-volume systems.
    3727                 :             :  * High-volume systems will be OK once they've built up a sufficient set of
    3728                 :             :  * recycled log segments, but the startup transient is likely to include
    3729                 :             :  * a lot of segment creations by foreground processes, which is not so good.
    3730                 :             :  *
    3731                 :             :  * XLogFileInitInternal() can ereport(ERROR).  All known causes indicate big
    3732                 :             :  * trouble; for example, a full filesystem is one cause.  The checkpoint WAL
    3733                 :             :  * and/or ControlFile updates already completed.  If a RequestCheckpoint()
    3734                 :             :  * initiated the present checkpoint and an ERROR ends this function, the
    3735                 :             :  * command that called RequestCheckpoint() fails.  That's not ideal, but it's
    3736                 :             :  * not worth contorting more functions to use caller-specified elevel values.
    3737                 :             :  * (With or without RequestCheckpoint(), an ERROR forestalls some inessential
    3738                 :             :  * reporting and resource reclamation.)
    3739                 :             :  */
    3740                 :             : static void
    3741                 :        2226 : PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli)
    3742                 :             : {
    3743                 :             :     XLogSegNo   _logSegNo;
    3744                 :             :     int         lf;
    3745                 :             :     bool        added;
    3746                 :             :     char        path[MAXPGPATH];
    3747                 :             :     uint64      offset;
    3748                 :             : 
    3749         [ +  + ]:        2226 :     if (!XLogCtl->InstallXLogFileSegmentActive)
    3750                 :          10 :         return;                 /* unlocked check says no */
    3751                 :             : 
    3752                 :        2216 :     XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
    3753                 :        2216 :     offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
    3754         [ +  + ]:        2216 :     if (offset >= (uint32) (0.75 * wal_segment_size))
    3755                 :             :     {
    3756                 :         215 :         _logSegNo++;
    3757                 :         215 :         lf = XLogFileInitInternal(_logSegNo, tli, &added, path);
    3758         [ +  + ]:         215 :         if (lf >= 0)
    3759                 :         151 :             close(lf);
    3760         [ +  + ]:         215 :         if (added)
    3761                 :          64 :             CheckpointStats.ckpt_segs_added++;
    3762                 :             :     }
    3763                 :             : }
    3764                 :             : 
    3765                 :             : /*
    3766                 :             :  * Throws an error if the given log segment has already been removed or
    3767                 :             :  * recycled. The caller should only pass a segment that it knows to have
    3768                 :             :  * existed while the server has been running, as this function always
    3769                 :             :  * succeeds if no WAL segments have been removed since startup.
    3770                 :             :  * 'tli' is only used in the error message.
    3771                 :             :  *
    3772                 :             :  * Note: this function guarantees to keep errno unchanged on return.
    3773                 :             :  * This supports callers that use this to possibly deliver a better
    3774                 :             :  * error message about a missing file, while still being able to throw
    3775                 :             :  * a normal file-access error afterwards, if this does return.
    3776                 :             :  */
    3777                 :             : void
    3778                 :      127759 : CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
    3779                 :             : {
    3780                 :      127759 :     int         save_errno = errno;
    3781                 :             :     XLogSegNo   lastRemovedSegNo;
    3782                 :             : 
    3783                 :      127759 :     SpinLockAcquire(&XLogCtl->info_lck);
    3784                 :      127759 :     lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
    3785                 :      127759 :     SpinLockRelease(&XLogCtl->info_lck);
    3786                 :             : 
    3787         [ -  + ]:      127759 :     if (segno <= lastRemovedSegNo)
    3788                 :             :     {
    3789                 :             :         char        filename[MAXFNAMELEN];
    3790                 :             : 
    3791                 :           0 :         XLogFileName(filename, tli, segno, wal_segment_size);
    3792                 :           0 :         errno = save_errno;
    3793         [ #  # ]:           0 :         ereport(ERROR,
    3794                 :             :                 (errcode_for_file_access(),
    3795                 :             :                  errmsg("requested WAL segment %s has already been removed",
    3796                 :             :                         filename)));
    3797                 :             :     }
    3798                 :      127759 :     errno = save_errno;
    3799                 :      127759 : }
    3800                 :             : 
    3801                 :             : /*
    3802                 :             :  * Return the last WAL segment removed, or 0 if no segment has been removed
    3803                 :             :  * since startup.
    3804                 :             :  *
    3805                 :             :  * NB: the result can be out of date arbitrarily fast, the caller has to deal
    3806                 :             :  * with that.
    3807                 :             :  */
    3808                 :             : XLogSegNo
    3809                 :        1281 : XLogGetLastRemovedSegno(void)
    3810                 :             : {
    3811                 :             :     XLogSegNo   lastRemovedSegNo;
    3812                 :             : 
    3813                 :        1281 :     SpinLockAcquire(&XLogCtl->info_lck);
    3814                 :        1281 :     lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
    3815                 :        1281 :     SpinLockRelease(&XLogCtl->info_lck);
    3816                 :             : 
    3817                 :        1281 :     return lastRemovedSegNo;
    3818                 :             : }
    3819                 :             : 
    3820                 :             : /*
    3821                 :             :  * Return the oldest WAL segment on the given TLI that still exists in
    3822                 :             :  * XLOGDIR, or 0 if none.
    3823                 :             :  */
    3824                 :             : XLogSegNo
    3825                 :           7 : XLogGetOldestSegno(TimeLineID tli)
    3826                 :             : {
    3827                 :             :     DIR        *xldir;
    3828                 :             :     struct dirent *xlde;
    3829                 :           7 :     XLogSegNo   oldest_segno = 0;
    3830                 :             : 
    3831                 :           7 :     xldir = AllocateDir(XLOGDIR);
    3832         [ +  + ]:          50 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    3833                 :             :     {
    3834                 :             :         TimeLineID  file_tli;
    3835                 :             :         XLogSegNo   file_segno;
    3836                 :             : 
    3837                 :             :         /* Ignore files that are not XLOG segments. */
    3838         [ +  + ]:          43 :         if (!IsXLogFileName(xlde->d_name))
    3839                 :          29 :             continue;
    3840                 :             : 
    3841                 :             :         /* Parse filename to get TLI and segno. */
    3842                 :          14 :         XLogFromFileName(xlde->d_name, &file_tli, &file_segno,
    3843                 :             :                          wal_segment_size);
    3844                 :             : 
    3845                 :             :         /* Ignore anything that's not from the TLI of interest. */
    3846         [ -  + ]:          14 :         if (tli != file_tli)
    3847                 :           0 :             continue;
    3848                 :             : 
    3849                 :             :         /* If it's the oldest so far, update oldest_segno. */
    3850   [ +  +  +  + ]:          14 :         if (oldest_segno == 0 || file_segno < oldest_segno)
    3851                 :           9 :             oldest_segno = file_segno;
    3852                 :             :     }
    3853                 :             : 
    3854                 :           7 :     FreeDir(xldir);
    3855                 :           7 :     return oldest_segno;
    3856                 :             : }
    3857                 :             : 
    3858                 :             : /*
    3859                 :             :  * Update the last removed segno pointer in shared memory, to reflect that the
    3860                 :             :  * given XLOG file has been removed.
    3861                 :             :  */
    3862                 :             : static void
    3863                 :        2739 : UpdateLastRemovedPtr(char *filename)
    3864                 :             : {
    3865                 :             :     uint32      tli;
    3866                 :             :     XLogSegNo   segno;
    3867                 :             : 
    3868                 :        2739 :     XLogFromFileName(filename, &tli, &segno, wal_segment_size);
    3869                 :             : 
    3870                 :        2739 :     SpinLockAcquire(&XLogCtl->info_lck);
    3871         [ +  + ]:        2739 :     if (segno > XLogCtl->lastRemovedSegNo)
    3872                 :        1243 :         XLogCtl->lastRemovedSegNo = segno;
    3873                 :        2739 :     SpinLockRelease(&XLogCtl->info_lck);
    3874                 :        2739 : }
    3875                 :             : 
    3876                 :             : /*
    3877                 :             :  * Remove all temporary log files in pg_wal
    3878                 :             :  *
    3879                 :             :  * This is called at the beginning of recovery after a previous crash,
    3880                 :             :  * at a point where no other processes write fresh WAL data.
    3881                 :             :  */
    3882                 :             : static void
    3883                 :         194 : RemoveTempXlogFiles(void)
    3884                 :             : {
    3885                 :             :     DIR        *xldir;
    3886                 :             :     struct dirent *xlde;
    3887                 :             : 
    3888         [ +  + ]:         194 :     elog(DEBUG2, "removing all temporary WAL segments");
    3889                 :             : 
    3890                 :         194 :     xldir = AllocateDir(XLOGDIR);
    3891         [ +  + ]:        1307 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    3892                 :             :     {
    3893                 :             :         char        path[MAXPGPATH];
    3894                 :             : 
    3895         [ +  - ]:        1113 :         if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0)
    3896                 :        1113 :             continue;
    3897                 :             : 
    3898                 :           0 :         snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
    3899                 :           0 :         unlink(path);
    3900         [ #  # ]:           0 :         elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
    3901                 :             :     }
    3902                 :         194 :     FreeDir(xldir);
    3903                 :         194 : }
    3904                 :             : 
    3905                 :             : /*
    3906                 :             :  * Recycle or remove all log files older than or equal to the passed segno.
    3907                 :             :  *
    3908                 :             :  * endptr is current (or recent) end of xlog, and lastredoptr is the
    3909                 :             :  * redo pointer of the last checkpoint. These are used to determine
    3910                 :             :  * whether we want to recycle rather than delete no-longer-wanted log files.
    3911                 :             :  *
    3912                 :             :  * insertTLI is the current timeline for XLOG insertion. Any recycled
    3913                 :             :  * segments should be reused for this timeline.
    3914                 :             :  */
    3915                 :             : static void
    3916                 :        1944 : RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr,
    3917                 :             :                    TimeLineID insertTLI)
    3918                 :             : {
    3919                 :             :     DIR        *xldir;
    3920                 :             :     struct dirent *xlde;
    3921                 :             :     char        lastoff[MAXFNAMELEN];
    3922                 :             :     XLogSegNo   endlogSegNo;
    3923                 :             :     XLogSegNo   recycleSegNo;
    3924                 :             : 
    3925                 :             :     /* Initialize info about where to try to recycle to */
    3926                 :        1944 :     XLByteToSeg(endptr, endlogSegNo, wal_segment_size);
    3927                 :        1944 :     recycleSegNo = XLOGfileslop(lastredoptr);
    3928                 :             : 
    3929                 :             :     /*
    3930                 :             :      * Construct a filename of the last segment to be kept. The timeline ID
    3931                 :             :      * doesn't matter, we ignore that in the comparison. (During recovery,
    3932                 :             :      * InsertTimeLineID isn't set, so we can't use that.)
    3933                 :             :      */
    3934                 :        1944 :     XLogFileName(lastoff, 0, segno, wal_segment_size);
    3935                 :             : 
    3936         [ +  + ]:        1944 :     elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
    3937                 :             :          lastoff);
    3938                 :             : 
    3939                 :        1944 :     xldir = AllocateDir(XLOGDIR);
    3940                 :             : 
    3941         [ +  + ]:       46119 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    3942                 :             :     {
    3943                 :             :         /* Ignore files that are not XLOG segments */
    3944         [ +  + ]:       44175 :         if (!IsXLogFileName(xlde->d_name) &&
    3945         [ +  + ]:        8232 :             !IsPartialXLogFileName(xlde->d_name))
    3946                 :        8230 :             continue;
    3947                 :             : 
    3948                 :             :         /*
    3949                 :             :          * We ignore the timeline part of the XLOG segment identifiers in
    3950                 :             :          * deciding whether a segment is still needed.  This ensures that we
    3951                 :             :          * won't prematurely remove a segment from a parent timeline. We could
    3952                 :             :          * probably be a little more proactive about removing segments of
    3953                 :             :          * non-parent timelines, but that would be a whole lot more
    3954                 :             :          * complicated.
    3955                 :             :          *
    3956                 :             :          * We use the alphanumeric sorting property of the filenames to decide
    3957                 :             :          * which ones are earlier than the lastoff segment.
    3958                 :             :          */
    3959         [ +  + ]:       35945 :         if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
    3960                 :             :         {
    3961         [ +  + ]:       29194 :             if (XLogArchiveCheckDone(xlde->d_name))
    3962                 :             :             {
    3963                 :             :                 /* Update the last removed location in shared memory first */
    3964                 :        2739 :                 UpdateLastRemovedPtr(xlde->d_name);
    3965                 :             : 
    3966                 :        2739 :                 RemoveXlogFile(xlde, recycleSegNo, &endlogSegNo, insertTLI);
    3967                 :             :             }
    3968                 :             :         }
    3969                 :             :     }
    3970                 :             : 
    3971                 :        1944 :     FreeDir(xldir);
    3972                 :        1944 : }
    3973                 :             : 
    3974                 :             : /*
    3975                 :             :  * Recycle or remove WAL files that are not part of the given timeline's
    3976                 :             :  * history.
    3977                 :             :  *
    3978                 :             :  * This is called during recovery, whenever we switch to follow a new
    3979                 :             :  * timeline, and at the end of recovery when we create a new timeline. We
    3980                 :             :  * wouldn't otherwise care about extra WAL files lying in pg_wal, but they
    3981                 :             :  * might be leftover pre-allocated or recycled WAL segments on the old timeline
    3982                 :             :  * that we haven't used yet, and contain garbage. If we just leave them in
    3983                 :             :  * pg_wal, they will eventually be archived, and we can't let that happen.
    3984                 :             :  * Files that belong to our timeline history are valid, because we have
    3985                 :             :  * successfully replayed them, but from others we can't be sure.
    3986                 :             :  *
    3987                 :             :  * 'switchpoint' is the current point in WAL where we switch to new timeline,
    3988                 :             :  * and 'newTLI' is the new timeline we switch to.
    3989                 :             :  */
    3990                 :             : void
    3991                 :          73 : RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
    3992                 :             : {
    3993                 :             :     DIR        *xldir;
    3994                 :             :     struct dirent *xlde;
    3995                 :             :     char        switchseg[MAXFNAMELEN];
    3996                 :             :     XLogSegNo   endLogSegNo;
    3997                 :             :     XLogSegNo   switchLogSegNo;
    3998                 :             :     XLogSegNo   recycleSegNo;
    3999                 :             : 
    4000                 :             :     /*
    4001                 :             :      * Initialize info about where to begin the work.  This will recycle,
    4002                 :             :      * somewhat arbitrarily, 10 future segments.
    4003                 :             :      */
    4004                 :          73 :     XLByteToPrevSeg(switchpoint, switchLogSegNo, wal_segment_size);
    4005                 :          73 :     XLByteToSeg(switchpoint, endLogSegNo, wal_segment_size);
    4006                 :          73 :     recycleSegNo = endLogSegNo + 10;
    4007                 :             : 
    4008                 :             :     /*
    4009                 :             :      * Construct a filename of the last segment to be kept.
    4010                 :             :      */
    4011                 :          73 :     XLogFileName(switchseg, newTLI, switchLogSegNo, wal_segment_size);
    4012                 :             : 
    4013         [ +  + ]:          73 :     elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
    4014                 :             :          switchseg);
    4015                 :             : 
    4016                 :          73 :     xldir = AllocateDir(XLOGDIR);
    4017                 :             : 
    4018         [ +  + ]:         691 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    4019                 :             :     {
    4020                 :             :         /* Ignore files that are not XLOG segments */
    4021         [ +  + ]:         618 :         if (!IsXLogFileName(xlde->d_name))
    4022                 :         384 :             continue;
    4023                 :             : 
    4024                 :             :         /*
    4025                 :             :          * Remove files that are on a timeline older than the new one we're
    4026                 :             :          * switching to, but with a segment number >= the first segment on the
    4027                 :             :          * new timeline.
    4028                 :             :          */
    4029         [ +  + ]:         234 :         if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
    4030         [ +  + ]:         152 :             strcmp(xlde->d_name + 8, switchseg + 8) > 0)
    4031                 :             :         {
    4032                 :             :             /*
    4033                 :             :              * If the file has already been marked as .ready, however, don't
    4034                 :             :              * remove it yet. It should be OK to remove it - files that are
    4035                 :             :              * not part of our timeline history are not required for recovery
    4036                 :             :              * - but it seems safer to let them be archived and removed later.
    4037                 :             :              */
    4038         [ +  - ]:          16 :             if (!XLogArchiveIsReady(xlde->d_name))
    4039                 :          16 :                 RemoveXlogFile(xlde, recycleSegNo, &endLogSegNo, newTLI);
    4040                 :             :         }
    4041                 :             :     }
    4042                 :             : 
    4043                 :          73 :     FreeDir(xldir);
    4044                 :          73 : }
    4045                 :             : 
    4046                 :             : /*
    4047                 :             :  * Recycle or remove a log file that's no longer needed.
    4048                 :             :  *
    4049                 :             :  * segment_de is the dirent structure of the segment to recycle or remove.
    4050                 :             :  * recycleSegNo is the segment number to recycle up to.  endlogSegNo is
    4051                 :             :  * the segment number of the current (or recent) end of WAL.
    4052                 :             :  *
    4053                 :             :  * endlogSegNo gets incremented if the segment is recycled, so that it is not
    4054                 :             :  * checked again by future callers of this function.
    4055                 :             :  *
    4056                 :             :  * insertTLI is the current timeline for XLOG insertion. Any recycled segments
    4057                 :             :  * should be used for this timeline.
    4058                 :             :  */
    4059                 :             : static void
    4060                 :        2755 : RemoveXlogFile(const struct dirent *segment_de,
    4061                 :             :                XLogSegNo recycleSegNo, XLogSegNo *endlogSegNo,
    4062                 :             :                TimeLineID insertTLI)
    4063                 :             : {
    4064                 :             :     char        path[MAXPGPATH];
    4065                 :             : #ifdef WIN32
    4066                 :             :     char        newpath[MAXPGPATH];
    4067                 :             : #endif
    4068                 :        2755 :     const char *segname = segment_de->d_name;
    4069                 :             : 
    4070                 :        2755 :     snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);
    4071                 :             : 
    4072                 :             :     /*
    4073                 :             :      * Before deleting the file, see if it can be recycled as a future log
    4074                 :             :      * segment. Only recycle normal files, because we don't want to recycle
    4075                 :             :      * symbolic links pointing to a separate archive directory.
    4076                 :             :      */
    4077         [ +  - ]:        2755 :     if (wal_recycle &&
    4078         [ +  + ]:        2755 :         *endlogSegNo <= recycleSegNo &&
    4079   [ +  +  +  - ]:        3782 :         XLogCtl->InstallXLogFileSegmentActive && /* callee rechecks this */
    4080         [ +  + ]:        3456 :         get_dirent_type(path, segment_de, false, DEBUG2) == PGFILETYPE_REG &&
    4081                 :        1728 :         InstallXLogFileSegment(endlogSegNo, path,
    4082                 :             :                                true, recycleSegNo, insertTLI))
    4083                 :             :     {
    4084         [ +  + ]:        1564 :         ereport(DEBUG2,
    4085                 :             :                 (errmsg_internal("recycled write-ahead log file \"%s\"",
    4086                 :             :                                  segname)));
    4087                 :        1564 :         CheckpointStats.ckpt_segs_recycled++;
    4088                 :             :         /* Needn't recheck that slot on future iterations */
    4089                 :        1564 :         (*endlogSegNo)++;
    4090                 :             :     }
    4091                 :             :     else
    4092                 :             :     {
    4093                 :             :         /* No need for any more future segments, or recycling failed ... */
    4094                 :             :         int         rc;
    4095                 :             : 
    4096         [ -  + ]:        1191 :         ereport(DEBUG2,
    4097                 :             :                 (errmsg_internal("removing write-ahead log file \"%s\"",
    4098                 :             :                                  segname)));
    4099                 :             : 
    4100                 :             : #ifdef WIN32
    4101                 :             : 
    4102                 :             :         /*
    4103                 :             :          * On Windows, if another process (e.g. another backend) holds the file
    4104                 :             :          * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
    4105                 :             :          * will still show up in directory listing until the last handle is
    4106                 :             :          * closed. To avoid confusing the lingering deleted file for a live
    4107                 :             :          * WAL file that needs to be archived, rename it before deleting it.
    4108                 :             :          *
    4109                 :             :          * If another process holds the file open without FILE_SHARE_DELETE
    4110                 :             :          * flag, rename will fail. We'll try again at the next checkpoint.
    4111                 :             :          */
    4112                 :             :         snprintf(newpath, MAXPGPATH, "%s.deleted", path);
    4113                 :             :         if (rename(path, newpath) != 0)
    4114                 :             :         {
    4115                 :             :             ereport(LOG,
    4116                 :             :                     (errcode_for_file_access(),
    4117                 :             :                      errmsg("could not rename file \"%s\": %m",
    4118                 :             :                             path)));
    4119                 :             :             return;
    4120                 :             :         }
    4121                 :             :         rc = durable_unlink(newpath, LOG);
    4122                 :             : #else
    4123                 :        1191 :         rc = durable_unlink(path, LOG);
    4124                 :             : #endif
    4125         [ -  + ]:        1191 :         if (rc != 0)
    4126                 :             :         {
    4127                 :             :             /* Message already logged by durable_unlink() */
    4128                 :           0 :             return;
    4129                 :             :         }
    4130                 :        1191 :         CheckpointStats.ckpt_segs_removed++;
    4131                 :             :     }
    4132                 :             : 
    4133                 :        2755 :     XLogArchiveCleanup(segname);
    4134                 :             : }
    4135                 :             : 
    4136                 :             : /*
    4137                 :             :  * Verify whether pg_wal, pg_wal/archive_status, and pg_wal/summaries exist.
    4138                 :             :  * If the latter two do not exist, recreate them.
    4139                 :             :  *
    4140                 :             :  * It is not the goal of this function to verify the contents of these
    4141                 :             :  * directories, but to help in cases where someone has performed a cluster
    4142                 :             :  * copy for PITR purposes but omitted pg_wal from the copy.
    4143                 :             :  *
    4144                 :             :  * We could also recreate pg_wal if it doesn't exist, but a deliberate
    4145                 :             :  * policy decision was made not to.  It is fairly common for pg_wal to be
    4146                 :             :  * a symlink, and if that was the DBA's intent then automatically making a
    4147                 :             :  * plain directory would result in degraded performance with no notice.
    4148                 :             :  */
    4149                 :             : static void
    4150                 :        1088 : ValidateXLOGDirectoryStructure(void)
    4151                 :             : {
    4152                 :             :     char        path[MAXPGPATH];
    4153                 :             :     struct stat stat_buf;
    4154                 :             : 
    4155                 :             :     /* Check for pg_wal; if it doesn't exist, error out */
    4156         [ +  - ]:        1088 :     if (stat(XLOGDIR, &stat_buf) != 0 ||
    4157         [ -  + ]:        1088 :         !S_ISDIR(stat_buf.st_mode))
    4158         [ #  # ]:           0 :         ereport(FATAL,
    4159                 :             :                 (errcode_for_file_access(),
    4160                 :             :                  errmsg("required WAL directory \"%s\" does not exist",
    4161                 :             :                         XLOGDIR)));
    4162                 :             : 
    4163                 :             :     /* Check for archive_status */
    4164                 :        1088 :     snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
    4165         [ +  + ]:        1088 :     if (stat(path, &stat_buf) == 0)
    4166                 :             :     {
    4167                 :             :         /* Check for weird cases where it exists but isn't a directory */
    4168         [ -  + ]:        1087 :         if (!S_ISDIR(stat_buf.st_mode))
    4169         [ #  # ]:           0 :             ereport(FATAL,
    4170                 :             :                     (errcode_for_file_access(),
    4171                 :             :                      errmsg("required WAL directory \"%s\" does not exist",
    4172                 :             :                             path)));
    4173                 :             :     }
    4174                 :             :     else
    4175                 :             :     {
    4176         [ +  - ]:           1 :         ereport(LOG,
    4177                 :             :                 (errmsg("creating missing WAL directory \"%s\"", path)));
    4178         [ -  + ]:           1 :         if (MakePGDirectory(path) < 0)
    4179         [ #  # ]:           0 :             ereport(FATAL,
    4180                 :             :                     (errcode_for_file_access(),
    4181                 :             :                      errmsg("could not create missing directory \"%s\": %m",
    4182                 :             :                             path)));
    4183                 :             :     }
    4184                 :             : 
    4185                 :             :     /* Check for summaries */
    4186                 :        1088 :     snprintf(path, MAXPGPATH, XLOGDIR "/summaries");
    4187         [ +  + ]:        1088 :     if (stat(path, &stat_buf) == 0)
    4188                 :             :     {
    4189                 :             :         /* Check for weird cases where it exists but isn't a directory */
    4190         [ -  + ]:        1087 :         if (!S_ISDIR(stat_buf.st_mode))
    4191         [ #  # ]:           0 :             ereport(FATAL,
    4192                 :             :                     (errmsg("required WAL directory \"%s\" does not exist",
    4193                 :             :                             path)));
    4194                 :             :     }
    4195                 :             :     else
    4196                 :             :     {
    4197         [ +  - ]:           1 :         ereport(LOG,
    4198                 :             :                 (errmsg("creating missing WAL directory \"%s\"", path)));
    4199         [ -  + ]:           1 :         if (MakePGDirectory(path) < 0)
    4200         [ #  # ]:           0 :             ereport(FATAL,
    4201                 :             :                     (errmsg("could not create missing directory \"%s\": %m",
    4202                 :             :                             path)));
    4203                 :             :     }
    4204                 :        1088 : }
    4205                 :             : 
    4206                 :             : /*
    4207                 :             :  * Remove previous backup history files.  This also retries creation of
    4208                 :             :  * .ready files for any backup history files for which XLogArchiveNotify
    4209                 :             :  * failed earlier.
    4210                 :             :  */
    4211                 :             : static void
    4212                 :         162 : CleanupBackupHistory(void)
    4213                 :             : {
    4214                 :             :     DIR        *xldir;
    4215                 :             :     struct dirent *xlde;
    4216                 :             :     char        path[MAXPGPATH + sizeof(XLOGDIR)];
    4217                 :             : 
    4218                 :         162 :     xldir = AllocateDir(XLOGDIR);
    4219                 :             : 
    4220         [ +  + ]:        1665 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    4221                 :             :     {
    4222         [ +  + ]:        1341 :         if (IsBackupHistoryFileName(xlde->d_name))
    4223                 :             :         {
    4224         [ +  + ]:         171 :             if (XLogArchiveCheckDone(xlde->d_name))
    4225                 :             :             {
    4226         [ +  + ]:         135 :                 elog(DEBUG2, "removing WAL backup history file \"%s\"",
    4227                 :             :                      xlde->d_name);
    4228                 :         135 :                 snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
    4229                 :         135 :                 unlink(path);
    4230                 :         135 :                 XLogArchiveCleanup(xlde->d_name);
    4231                 :             :             }
    4232                 :             :         }
    4233                 :             :     }
    4234                 :             : 
    4235                 :         162 :     FreeDir(xldir);
    4236                 :         162 : }
    4237                 :             : 
    4238                 :             : /*
    4239                 :             :  * I/O routines for pg_control
    4240                 :             :  *
    4241                 :             :  * *ControlFile is a buffer in shared memory that holds an image of the
    4242                 :             :  * contents of pg_control.  WriteControlFile() initializes pg_control
    4243                 :             :  * given a preloaded buffer, ReadControlFile() loads the buffer from
    4244                 :             :  * the pg_control file (during postmaster or standalone-backend startup),
    4245                 :             :  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
    4246                 :             :  * InitControlFile() fills the buffer with initial values.
    4247                 :             :  *
    4248                 :             :  * For simplicity, WriteControlFile() initializes the fields of pg_control
    4249                 :             :  * that are related to checking backend/database compatibility, and
    4250                 :             :  * ReadControlFile() verifies they are correct.  We could split out the
    4251                 :             :  * I/O and compatibility-check functions, but there seems no need currently.
    4252                 :             :  */
    4253                 :             : 
    4254                 :             : static void
    4255                 :          57 : InitControlFile(uint64 sysidentifier, uint32 data_checksum_version)
    4256                 :             : {
    4257                 :             :     char        mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
    4258                 :             : 
    4259                 :             :     /*
    4260                 :             :      * Generate a random nonce. This is used for authentication requests that
    4261                 :             :      * will fail because the user does not exist. The nonce is used to create
    4262                 :             :      * a genuine-looking password challenge for the non-existent user, in lieu
    4263                 :             :      * of an actual stored password.
    4264                 :             :      */
    4265         [ -  + ]:          57 :     if (!pg_strong_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
    4266         [ #  # ]:           0 :         ereport(PANIC,
    4267                 :             :                 (errcode(ERRCODE_INTERNAL_ERROR),
    4268                 :             :                  errmsg("could not generate secret authorization token")));
    4269                 :             : 
    4270                 :          57 :     memset(ControlFile, 0, sizeof(ControlFileData));
    4271                 :             :     /* Initialize pg_control status fields */
    4272                 :          57 :     ControlFile->system_identifier = sysidentifier;
    4273                 :          57 :     memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
    4274                 :          57 :     ControlFile->state = DB_SHUTDOWNED;
    4275                 :          57 :     ControlFile->unloggedLSN = FirstNormalUnloggedLSN;
    4276                 :             : 
    4277                 :             :     /* Set important parameter values for use when replaying WAL */
    4278                 :          57 :     ControlFile->MaxConnections = MaxConnections;
    4279                 :          57 :     ControlFile->max_worker_processes = max_worker_processes;
    4280                 :          57 :     ControlFile->max_wal_senders = max_wal_senders;
    4281                 :          57 :     ControlFile->max_prepared_xacts = max_prepared_xacts;
    4282                 :          57 :     ControlFile->max_locks_per_xact = max_locks_per_xact;
    4283                 :          57 :     ControlFile->wal_level = wal_level;
    4284                 :          57 :     ControlFile->wal_log_hints = wal_log_hints;
    4285                 :          57 :     ControlFile->track_commit_timestamp = track_commit_timestamp;
    4286                 :          57 :     ControlFile->data_checksum_version = data_checksum_version;
    4287                 :             : 
    4288                 :             :     /*
    4289                 :             :      * Set the data_checksum_version value into XLogCtl, which is where all
    4290                 :             :      * processes get the current value from.
    4291                 :             :      */
    4292                 :          57 :     XLogCtl->data_checksum_version = data_checksum_version;
    4293                 :          57 : }
    4294                 :             : 
    4295                 :             : static void
    4296                 :          57 : WriteControlFile(void)
    4297                 :             : {
    4298                 :             :     int         fd;
    4299                 :             :     char        buffer[PG_CONTROL_FILE_SIZE];   /* need not be aligned */
    4300                 :             : 
    4301                 :             :     /*
    4302                 :             :      * Initialize version and compatibility-check fields
    4303                 :             :      */
    4304                 :          57 :     ControlFile->pg_control_version = PG_CONTROL_VERSION;
    4305                 :          57 :     ControlFile->catalog_version_no = CATALOG_VERSION_NO;
    4306                 :             : 
    4307                 :          57 :     ControlFile->maxAlign = MAXIMUM_ALIGNOF;
    4308                 :          57 :     ControlFile->floatFormat = FLOATFORMAT_VALUE;
    4309                 :             : 
    4310                 :          57 :     ControlFile->blcksz = BLCKSZ;
    4311                 :          57 :     ControlFile->relseg_size = RELSEG_SIZE;
    4312                 :          57 :     ControlFile->slru_pages_per_segment = SLRU_PAGES_PER_SEGMENT;
    4313                 :          57 :     ControlFile->xlog_blcksz = XLOG_BLCKSZ;
    4314                 :          57 :     ControlFile->xlog_seg_size = wal_segment_size;
    4315                 :             : 
    4316                 :          57 :     ControlFile->nameDataLen = NAMEDATALEN;
    4317                 :          57 :     ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
    4318                 :             : 
    4319                 :          57 :     ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
    4320                 :          57 :     ControlFile->loblksize = LOBLKSIZE;
    4321                 :             : 
    4322                 :          57 :     ControlFile->float8ByVal = true; /* vestigial */
    4323                 :             : 
    4324                 :             :     /*
    4325                 :             :      * Initialize the default 'char' signedness.
    4326                 :             :      *
    4327                 :             :      * The signedness of the char type is implementation-defined. For instance
    4328                 :             :      * on x86 architecture CPUs, the char data type is typically treated as
    4329                 :             :      * signed by default, whereas on aarch architecture CPUs, it is typically
    4330                 :             :      * treated as unsigned by default. In v17 or earlier, we accidentally let
    4331                 :             :      * C implementation signedness affect persistent data. This led to
    4332                 :             :      * inconsistent results when comparing char data across different
    4333                 :             :      * platforms.
    4334                 :             :      *
    4335                 :             :      * This flag can be used as a hint to ensure consistent behavior for
    4336                 :             :      * pre-v18 data files that store data sorted by the 'char' type on disk,
    4337                 :             :      * especially in cross-platform replication scenarios.
    4338                 :             :      *
    4339                 :             :      * Newly created database clusters unconditionally set the default char
    4340                 :             :      * signedness to true. pg_upgrade changes this flag for clusters that were
    4341                 :             :      * initialized on signedness=false platforms. As a result,
    4342                 :             :      * signedness=false setting will become rare over time. If we had known
    4343                 :             :      * about this problem during the last development cycle that forced initdb
    4344                 :             :      * (v8.3), we would have made all clusters signed or all clusters
    4345                 :             :      * unsigned. Making pg_upgrade the only source of signedness=false will
    4346                 :             :      * cause the population of database clusters to converge toward that
    4347                 :             :      * retrospective ideal.
    4348                 :             :      */
    4349                 :          57 :     ControlFile->default_char_signedness = true;
    4350                 :             : 
    4351                 :             :     /* Contents are protected with a CRC */
    4352                 :          57 :     INIT_CRC32C(ControlFile->crc);
    4353                 :          57 :     COMP_CRC32C(ControlFile->crc,
    4354                 :             :                 ControlFile,
    4355                 :             :                 offsetof(ControlFileData, crc));
    4356                 :          57 :     FIN_CRC32C(ControlFile->crc);
    4357                 :             : 
    4358                 :             :     /*
    4359                 :             :      * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
    4360                 :             :      * the excess over sizeof(ControlFileData).  This reduces the odds of
    4361                 :             :      * premature-EOF errors when reading pg_control.  We'll still fail when we
    4362                 :             :      * check the contents of the file, but hopefully with a more specific
    4363                 :             :      * error than "couldn't read pg_control".
    4364                 :             :      */
    4365                 :          57 :     memset(buffer, 0, PG_CONTROL_FILE_SIZE);
    4366                 :          57 :     memcpy(buffer, ControlFile, sizeof(ControlFileData));
    4367                 :             : 
    4368                 :          57 :     fd = BasicOpenFile(XLOG_CONTROL_FILE,
    4369                 :             :                        O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
    4370         [ -  + ]:          57 :     if (fd < 0)
    4371         [ #  # ]:           0 :         ereport(PANIC,
    4372                 :             :                 (errcode_for_file_access(),
    4373                 :             :                  errmsg("could not create file \"%s\": %m",
    4374                 :             :                         XLOG_CONTROL_FILE)));
    4375                 :             : 
    4376                 :          57 :     errno = 0;
    4377                 :          57 :     pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
    4378         [ -  + ]:          57 :     if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
    4379                 :             :     {
    4380                 :             :         /* if write didn't set errno, assume problem is no disk space */
    4381         [ #  # ]:           0 :         if (errno == 0)
    4382                 :           0 :             errno = ENOSPC;
    4383         [ #  # ]:           0 :         ereport(PANIC,
    4384                 :             :                 (errcode_for_file_access(),
    4385                 :             :                  errmsg("could not write to file \"%s\": %m",
    4386                 :             :                         XLOG_CONTROL_FILE)));
    4387                 :             :     }
    4388                 :          57 :     pgstat_report_wait_end();
    4389                 :             : 
    4390                 :          57 :     pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
    4391         [ -  + ]:          57 :     if (pg_fsync(fd) != 0)
    4392         [ #  # ]:           0 :         ereport(PANIC,
    4393                 :             :                 (errcode_for_file_access(),
    4394                 :             :                  errmsg("could not fsync file \"%s\": %m",
    4395                 :             :                         XLOG_CONTROL_FILE)));
    4396                 :          57 :     pgstat_report_wait_end();
    4397                 :             : 
    4398         [ -  + ]:          57 :     if (close(fd) != 0)
    4399         [ #  # ]:           0 :         ereport(PANIC,
    4400                 :             :                 (errcode_for_file_access(),
    4401                 :             :                  errmsg("could not close file \"%s\": %m",
    4402                 :             :                         XLOG_CONTROL_FILE)));
    4403                 :          57 : }
    4404                 :             : 
    4405                 :             : static void
    4406                 :        1148 : ReadControlFile(void)
    4407                 :             : {
    4408                 :             :     pg_crc32c   crc;
    4409                 :             :     int         fd;
    4410                 :             :     char        wal_segsz_str[20];
    4411                 :             :     int         r;
    4412                 :             : 
    4413                 :             :     /*
    4414                 :             :      * Read data...
    4415                 :             :      */
    4416                 :        1148 :     fd = BasicOpenFile(XLOG_CONTROL_FILE,
    4417                 :             :                        O_RDWR | PG_BINARY);
    4418         [ -  + ]:        1148 :     if (fd < 0)
    4419         [ #  # ]:           0 :         ereport(PANIC,
    4420                 :             :                 (errcode_for_file_access(),
    4421                 :             :                  errmsg("could not open file \"%s\": %m",
    4422                 :             :                         XLOG_CONTROL_FILE)));
    4423                 :             : 
    4424                 :        1148 :     pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
    4425                 :        1148 :     r = read(fd, ControlFile, sizeof(ControlFileData));
    4426         [ -  + ]:        1148 :     if (r != sizeof(ControlFileData))
    4427                 :             :     {
    4428         [ #  # ]:           0 :         if (r < 0)
    4429         [ #  # ]:           0 :             ereport(PANIC,
    4430                 :             :                     (errcode_for_file_access(),
    4431                 :             :                      errmsg("could not read file \"%s\": %m",
    4432                 :             :                             XLOG_CONTROL_FILE)));
    4433                 :             :         else
    4434         [ #  # ]:           0 :             ereport(PANIC,
    4435                 :             :                     (errcode(ERRCODE_DATA_CORRUPTED),
    4436                 :             :                      errmsg("could not read file \"%s\": read %d of %zu",
    4437                 :             :                             XLOG_CONTROL_FILE, r, sizeof(ControlFileData))));
    4438                 :             :     }
    4439                 :        1148 :     pgstat_report_wait_end();
    4440                 :             : 
    4441                 :        1148 :     close(fd);
    4442                 :             : 
    4443                 :             :     /*
    4444                 :             :      * Check for expected pg_control format version.  If this is wrong, the
    4445                 :             :      * CRC check will likely fail because we'll be checking the wrong number
    4446                 :             :      * of bytes.  Complaining about wrong version will probably be more
    4447                 :             :      * enlightening than complaining about wrong CRC.
    4448                 :             :      */
    4449                 :             : 
    4450   [ -  +  -  -  :        1148 :     if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
                   -  - ]
    4451         [ #  # ]:           0 :         ereport(FATAL,
    4452                 :             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4453                 :             :                  errmsg("database files are incompatible with server"),
    4454                 :             :                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
    4455                 :             :                            " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
    4456                 :             :                            ControlFile->pg_control_version, ControlFile->pg_control_version,
    4457                 :             :                            PG_CONTROL_VERSION, PG_CONTROL_VERSION),
    4458                 :             :                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
    4459                 :             : 
    4460         [ -  + ]:        1148 :     if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
    4461         [ #  # ]:           0 :         ereport(FATAL,
    4462                 :             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4463                 :             :                  errmsg("database files are incompatible with server"),
    4464                 :             :                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
    4465                 :             :                            " but the server was compiled with PG_CONTROL_VERSION %d.",
    4466                 :             :                            ControlFile->pg_control_version, PG_CONTROL_VERSION),
    4467                 :             :                  errhint("It looks like you need to initdb.")));
    4468                 :             : 
    4469                 :             :     /* Now check the CRC. */
    4470                 :        1148 :     INIT_CRC32C(crc);
    4471                 :        1148 :     COMP_CRC32C(crc,
    4472                 :             :                 ControlFile,
    4473                 :             :                 offsetof(ControlFileData, crc));
    4474                 :        1148 :     FIN_CRC32C(crc);
    4475                 :             : 
    4476         [ -  + ]:        1148 :     if (!EQ_CRC32C(crc, ControlFile->crc))
    4477         [ #  # ]:           0 :         ereport(FATAL,
    4478                 :             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4479                 :             :                  errmsg("incorrect checksum in control file")));
    4480                 :             : 
    4481                 :             :     /*
    4482                 :             :      * Do compatibility checking immediately.  If the database isn't
    4483                 :             :      * compatible with the backend executable, we want to abort before we can
    4484                 :             :      * possibly do any damage.
    4485                 :             :      */
    4486         [ -  + ]:        1148 :     if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
    4487         [ #  # ]:           0 :         ereport(FATAL,
    4488                 :             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4489                 :             :                  errmsg("database files are incompatible with server"),
    4490                 :             :         /* translator: %s is a variable name and %d is its value */
    4491                 :             :                  errdetail("The database cluster was initialized with %s %d,"
    4492                 :             :                            " but the server was compiled with %s %d.",
    4493                 :             :                            "CATALOG_VERSION_NO", ControlFile->catalog_version_no,
    4494                 :             :                            "CATALOG_VERSION_NO", CATALOG_VERSION_NO),
    4495                 :             :                  errhint("It looks like you need to initdb.")));
    4496         [ -  + ]:        1148 :     if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
    4497         [ #  # ]:           0 :         ereport(FATAL,
    4498                 :             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4499                 :             :                  errmsg("database files are incompatible with server"),
    4500                 :             :         /* translator: %s is a variable name and %d is its value */
    4501                 :             :                  errdetail("The database cluster was initialized with %s %d,"
    4502                 :             :                            " but the server was compiled with %s %d.",
    4503                 :             :                            "MAXALIGN", ControlFile->maxAlign,
    4504                 :             :                            "MAXALIGN", MAXIMUM_ALIGNOF),
    4505                 :             :                  errhint("It looks like you need to initdb.")));
    4506         [ -  + ]:        1148 :     if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
    4507         [ #  # ]:           0 :         ereport(FATAL,
    4508                 :             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4509                 :             :                  errmsg("database files are incompatible with server"),
    4510                 :             :                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
    4511                 :             :                  errhint("It looks like you need to initdb.")));
    4512         [ -  + ]:        1148 :     if (ControlFile->blcksz != BLCKSZ)
    4513         [ #  # ]:           0 :         ereport(FATAL,
    4514                 :             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4515                 :             :                  errmsg("database files are incompatible with server"),
    4516                 :             :         /* translator: %s is a variable name and %d is its value */
    4517                 :             :                  errdetail("The database cluster was initialized with %s %d,"
    4518                 :             :                            " but the server was compiled with %s %d.",
    4519                 :             :                            "BLCKSZ", ControlFile->blcksz,
    4520                 :             :                            "BLCKSZ", BLCKSZ),
    4521                 :             :                  errhint("It looks like you need to recompile or initdb.")));
    4522         [ -  + ]:        1148 :     if (ControlFile->relseg_size != RELSEG_SIZE)
    4523         [ #  # ]:           0 :         ereport(FATAL,
    4524                 :             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4525                 :             :                  errmsg("database files are incompatible with server"),
    4526                 :             :         /* translator: %s is a variable name and %d is its value */
    4527                 :             :                  errdetail("The database cluster was initialized with %s %d,"
    4528                 :             :                            " but the server was compiled with %s %d.",
    4529                 :             :                            "RELSEG_SIZE", ControlFile->relseg_size,
    4530                 :             :                            "RELSEG_SIZE", RELSEG_SIZE),
    4531                 :             :                  errhint("It looks like you need to recompile or initdb.")));
    4532         [ -  + ]:        1148 :     if (ControlFile->slru_pages_per_segment != SLRU_PAGES_PER_SEGMENT)
    4533         [ #  # ]:           0 :         ereport(FATAL,
    4534                 :             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4535                 :             :                  errmsg("database files are incompatible with server"),
    4536                 :             :         /* translator: %s is a variable name and %d is its value */
    4537                 :             :                  errdetail("The database cluster was initialized with %s %d,"
    4538                 :             :                            " but the server was compiled with %s %d.",
    4539                 :             :                            "SLRU_PAGES_PER_SEGMENT", ControlFile->slru_pages_per_segment,
    4540                 :             :                            "SLRU_PAGES_PER_SEGMENT", SLRU_PAGES_PER_SEGMENT),
    4541                 :             :                  errhint("It looks like you need to recompile or initdb.")));
    4542         [ -  + ]:        1148 :     if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
    4543         [ #  # ]:           0 :         ereport(FATAL,
    4544                 :             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4545                 :             :                  errmsg("database files are incompatible with server"),
    4546                 :             :         /* translator: %s is a variable name and %d is its value */
    4547                 :             :                  errdetail("The database cluster was initialized with %s %d,"
    4548                 :             :                            " but the server was compiled with %s %d.",
    4549                 :             :                            "XLOG_BLCKSZ", ControlFile->xlog_blcksz,
    4550                 :             :                            "XLOG_BLCKSZ", XLOG_BLCKSZ),
    4551                 :             :                  errhint("It looks like you need to recompile or initdb.")));
    4552         [ -  + ]:        1148 :     if (ControlFile->nameDataLen != NAMEDATALEN)
    4553         [ #  # ]:           0 :         ereport(FATAL,
    4554                 :             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4555                 :             :                  errmsg("database files are incompatible with server"),
    4556                 :             :         /* translator: %s is a variable name and %d is its value */
    4557                 :             :                  errdetail("The database cluster was initialized with %s %d,"
    4558                 :             :                            " but the server was compiled with %s %d.",
    4559                 :             :                            "NAMEDATALEN", ControlFile->nameDataLen,
    4560                 :             :                            "NAMEDATALEN", NAMEDATALEN),
    4561                 :             :                  errhint("It looks like you need to recompile or initdb.")));
    4562         [ -  + ]:        1148 :     if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
    4563         [ #  # ]:           0 :         ereport(FATAL,
    4564                 :             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4565                 :             :                  errmsg("database files are incompatible with server"),
    4566                 :             :         /* translator: %s is a variable name and %d is its value */
    4567                 :             :                  errdetail("The database cluster was initialized with %s %d,"
    4568                 :             :                            " but the server was compiled with %s %d.",
    4569                 :             :                            "INDEX_MAX_KEYS", ControlFile->indexMaxKeys,
    4570                 :             :                            "INDEX_MAX_KEYS", INDEX_MAX_KEYS),
    4571                 :             :                  errhint("It looks like you need to recompile or initdb.")));
    4572         [ -  + ]:        1148 :     if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
    4573         [ #  # ]:           0 :         ereport(FATAL,
    4574                 :             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4575                 :             :                  errmsg("database files are incompatible with server"),
    4576                 :             :         /* translator: %s is a variable name and %d is its value */
    4577                 :             :                  errdetail("The database cluster was initialized with %s %d,"
    4578                 :             :                            " but the server was compiled with %s %d.",
    4579                 :             :                            "TOAST_MAX_CHUNK_SIZE", ControlFile->toast_max_chunk_size,
    4580                 :             :                            "TOAST_MAX_CHUNK_SIZE", (int) TOAST_MAX_CHUNK_SIZE),
    4581                 :             :                  errhint("It looks like you need to recompile or initdb.")));
    4582         [ -  + ]:        1148 :     if (ControlFile->loblksize != LOBLKSIZE)
    4583         [ #  # ]:           0 :         ereport(FATAL,
    4584                 :             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    4585                 :             :                  errmsg("database files are incompatible with server"),
    4586                 :             :         /* translator: %s is a variable name and %d is its value */
    4587                 :             :                  errdetail("The database cluster was initialized with %s %d,"
    4588                 :             :                            " but the server was compiled with %s %d.",
    4589                 :             :                            "LOBLKSIZE", ControlFile->loblksize,
    4590                 :             :                            "LOBLKSIZE", (int) LOBLKSIZE),
    4591                 :             :                  errhint("It looks like you need to recompile or initdb.")));
    4592                 :             : 
    4593                 :             :     Assert(ControlFile->float8ByVal);    /* vestigial, not worth an error msg */
    4594                 :             : 
    4595                 :        1148 :     wal_segment_size = ControlFile->xlog_seg_size;
    4596                 :             : 
    4597   [ +  -  +  -  :        1148 :     if (!IsValidWalSegSize(wal_segment_size))
             +  -  -  + ]
    4598         [ #  # ]:           0 :         ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4599                 :             :                         errmsg_plural("invalid WAL segment size in control file (%d byte)",
    4600                 :             :                                       "invalid WAL segment size in control file (%d bytes)",
    4601                 :             :                                       wal_segment_size,
    4602                 :             :                                       wal_segment_size),
    4603                 :             :                         errdetail("The WAL segment size must be a power of two between 1 MB and 1 GB.")));
    4604                 :             : 
    4605                 :        1148 :     snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
    4606                 :        1148 :     SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
    4607                 :             :                     PGC_S_DYNAMIC_DEFAULT);
    4608                 :             : 
    4609                 :             :     /* check and update variables dependent on wal_segment_size */
    4610         [ -  + ]:        1148 :     if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2)
    4611         [ #  # ]:           0 :         ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4612                 :             :         /* translator: both %s are GUC names */
    4613                 :             :                         errmsg("\"%s\" must be at least twice \"%s\"",
    4614                 :             :                                "min_wal_size", "wal_segment_size")));
    4615                 :             : 
    4616         [ -  + ]:        1148 :     if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2)
    4617         [ #  # ]:           0 :         ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4618                 :             :         /* translator: both %s are GUC names */
    4619                 :             :                         errmsg("\"%s\" must be at least twice \"%s\"",
    4620                 :             :                                "max_wal_size", "wal_segment_size")));
    4621                 :             : 
    4622                 :        1148 :     UsableBytesInSegment =
    4623                 :        1148 :         (wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) -
    4624                 :             :         (SizeOfXLogLongPHD - SizeOfXLogShortPHD);
    4625                 :             : 
    4626                 :        1148 :     CalculateCheckpointSegments();
    4627                 :        1148 : }
    4628                 :             : 
    4629                 :             : /*
    4630                 :             :  * Utility wrapper to update the control file.  Note that the control
    4631                 :             :  * file gets flushed.
                         :             :  *
                         :             :  * Hands DataDir and the in-memory ControlFile to update_controlfile() with
                         :             :  * its last argument fixed to true.  No lock is taken here; the data checksum
                         :             :  * state functions in this file that call this hold ControlFileLock
                         :             :  * exclusively around both the ControlFile modification and this call.
    4632                 :             :  */
    4633                 :             : static void
    4634                 :       10469 : UpdateControlFile(void)
    4635                 :             : {
    4636                 :       10469 :     update_controlfile(DataDir, ControlFile, true);
    4637                 :       10469 : }
    4638                 :             : 
    4639                 :             : /*
    4640                 :             :  * Returns the unique system identifier from control file.
                         :             :  *
                         :             :  * The value is read from ControlFile->system_identifier without taking any
                         :             :  * lock in this function.  ControlFile must already be set; that is only
                         :             :  * checked with an assertion.
    4641                 :             :  */
    4642                 :             : uint64
    4643                 :        1557 : GetSystemIdentifier(void)
    4644                 :             : {
    4645                 :             :     Assert(ControlFile != NULL);
    4646                 :        1557 :     return ControlFile->system_identifier;
    4647                 :             : }
    4648                 :             : 
    4649                 :             : /*
    4650                 :             :  * Returns the random nonce from control file.
                         :             :  *
                         :             :  * The return value is ControlFile->mock_authentication_nonce itself; no copy
                         :             :  * is made here.  ControlFile must already be set; that is only checked with
                         :             :  * an assertion.
    4651                 :             :  */
    4652                 :             : char *
    4653                 :           2 : GetMockAuthenticationNonce(void)
    4654                 :             : {
    4655                 :             :     Assert(ControlFile != NULL);
    4656                 :           2 :     return ControlFile->mock_authentication_nonce;
    4657                 :             : }
    4658                 :             : 
    4659                 :             : /*
    4660                 :             :  * DataChecksumsNeedWrite
    4661                 :             :  *      Returns whether data checksums must be written or not
    4662                 :             :  *
    4663                 :             :  * Returns true if data checksums are enabled, or are in the process of being
    4664                 :             :  * enabled or disabled. During "inprogress-on" and "inprogress-off" states
    4665                 :             :  * checksums must be written even though they are not verified (see
    4666                 :             :  * datachecksum_state.c for a longer discussion).
    4667                 :             :  *
    4668                 :             :  * This function is intended for callsites which are about to write a data page
    4669                 :             :  * to storage, and need to know whether to re-calculate the checksum for the
    4670                 :             :  * page header. Calling this function must be performed as close to the write
    4671                 :             :  * operation as possible to keep the critical section short.
                         :             :  *
                         :             :  * Only the backend-local LocalDataChecksumState is consulted; no lock is
                         :             :  * taken and the shared state in XLogCtl is not read here.
    4672                 :             :  */
    4673                 :             : bool
    4674                 :      860437 : DataChecksumsNeedWrite(void)
    4675                 :             : {
    4676                 :      965948 :     return (LocalDataChecksumState == PG_DATA_CHECKSUM_VERSION ||
    4677   [ +  +  +  + ]:      924411 :             LocalDataChecksumState == PG_DATA_CHECKSUM_INPROGRESS_ON ||
    4678         [ +  + ]:       63974 :             LocalDataChecksumState == PG_DATA_CHECKSUM_INPROGRESS_OFF);
    4679                 :             : }
    4680                 :             : 
    4681                 :             : /*
                         :             :  * DataChecksumsOff
                         :             :  *      Returns whether the shared data checksum state is "off"
                         :             :  *
                         :             :  * Unlike DataChecksumsNeedWrite() and DataChecksumsNeedVerify(), which look
                         :             :  * at the backend-local LocalDataChecksumState, this compares
                         :             :  * XLogCtl->data_checksum_version against PG_DATA_CHECKSUM_OFF while holding
                         :             :  * info_lck.  The lock is released before returning, so the result is a
                         :             :  * snapshot of the shared state at the time of the call.
                         :             :  */
    4682                 :             : bool
    4683                 :           7 : DataChecksumsOff(void)
    4684                 :             : {
    4685                 :             :     bool        ret;
    4686                 :             : 
    4687                 :           7 :     SpinLockAcquire(&XLogCtl->info_lck);
    4688                 :           7 :     ret = (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_OFF);
    4689                 :           7 :     SpinLockRelease(&XLogCtl->info_lck);
    4690                 :             : 
    4691                 :           7 :     return ret;
    4692                 :             : }
    4693                 :             : /*
                         :             :  * DataChecksumsOn
                         :             :  *      Returns whether the shared data checksum state is "on"
                         :             :  *
                         :             :  * Compares XLogCtl->data_checksum_version against PG_DATA_CHECKSUM_VERSION
                         :             :  * while holding info_lck, rather than looking at the backend-local
                         :             :  * LocalDataChecksumState.  The lock is released before returning, so the
                         :             :  * result is a snapshot of the shared state at the time of the call.
                         :             :  */
    4694                 :             : bool
    4695                 :          11 : DataChecksumsOn(void)
    4696                 :             : {
    4697                 :             :     bool        ret;
    4698                 :             : 
    4699                 :          11 :     SpinLockAcquire(&XLogCtl->info_lck);
    4700                 :          11 :     ret = (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_VERSION);
    4701                 :          11 :     SpinLockRelease(&XLogCtl->info_lck);
    4702                 :             : 
    4703                 :          11 :     return ret;
    4704                 :             : }
    4705                 :             : /*
                         :             :  * DataChecksumsInProgressOn
                         :             :  *      Returns whether the shared data checksum state is "inprogress-on"
                         :             :  *
                         :             :  * Compares XLogCtl->data_checksum_version against
                         :             :  * PG_DATA_CHECKSUM_INPROGRESS_ON while holding info_lck, rather than looking
                         :             :  * at the backend-local LocalDataChecksumState.  The lock is released before
                         :             :  * returning, so the result is a snapshot of the shared state at the time of
                         :             :  * the call.
                         :             :  */
    4706                 :             : bool
    4707                 :         166 : DataChecksumsInProgressOn(void)
    4708                 :             : {
    4709                 :             :     bool        ret;
    4710                 :             : 
    4711                 :         166 :     SpinLockAcquire(&XLogCtl->info_lck);
    4712                 :         166 :     ret = (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_ON);
    4713                 :         166 :     SpinLockRelease(&XLogCtl->info_lck);
    4714                 :             : 
    4715                 :         166 :     return ret;
    4716                 :             : }
    4717                 :             : 
    4718                 :             : /*
    4719                 :             :  * DataChecksumsNeedVerify
    4720                 :             :  *      Returns whether data checksums must be verified or not
    4721                 :             :  *
    4722                 :             :  * Data checksums are only verified if they are fully enabled in the cluster.
    4723                 :             :  * During the "inprogress-on" and "inprogress-off" states they are only
    4724                 :             :  * updated, not verified (see datachecksum_state.c for a longer discussion).
    4725                 :             :  *
    4726                 :             :  * This function is intended for callsites which have read data and are about
    4727                 :             :  * to perform checksum validation based on the result of this.  Calling this
    4728                 :             :  * function must be performed as close to the validation call as possible to
    4729                 :             :  * keep the critical section short. This is in order to protect against time of
    4730                 :             :  * check/time of use situations around data checksum validation.
                         :             :  *
                         :             :  * Only the backend-local LocalDataChecksumState is consulted; no lock is
                         :             :  * taken and the shared state in XLogCtl is not read here.
    4731                 :             :  */
    4732                 :             : bool
    4733                 :     2676020 : DataChecksumsNeedVerify(void)
    4734                 :             : {
    4735                 :     2676020 :     return (LocalDataChecksumState == PG_DATA_CHECKSUM_VERSION);
    4736                 :             : }
    4737                 :             : 
    4738                 :             : /*
    4739                 :             :  * SetDataChecksumsOnInProgress
    4740                 :             :  *      Sets the data checksum state to "inprogress-on" to enable checksums
    4741                 :             :  *
    4742                 :             :  * To start the process of enabling data checksums in a running cluster the
    4743                 :             :  * data_checksum_version state must be changed to "inprogress-on". See
    4744                 :             :  * SetDataChecksumsOn below for a description on how this state change works.
    4745                 :             :  * This function blocks until all backends in the cluster have acknowledged the
    4746                 :             :  * state transition.
                         :             :  *
                         :             :  * Unlike SetDataChecksumsOn(), the current state is not inspected before the
                         :             :  * new state is stored, and no checkpoint is requested afterwards.
    4747                 :             :  */
    4748                 :             : void
    4749                 :           9 : SetDataChecksumsOnInProgress(void)
    4750                 :             : {
    4751                 :             :     uint64      barrier;
    4752                 :             : 
    4753                 :             :     /*
    4754                 :             :      * The state transition is performed in a critical section with
    4755                 :             :      * checkpoints held off to provide crash safety.
                         :             :      *
                         :             :      * Inside it the new state is, in order: handed to XLogChecksums()
                         :             :      * (presumably writing a WAL record -- confirm), stored in XLogCtl under
                         :             :      * info_lck, written to the control file under ControlFileLock, and
                         :             :      * announced by emitting a procsignal barrier.  The wait for that barrier
                         :             :      * is done only after the critical section has been left.
    4756                 :             :      */
    4757                 :           9 :     START_CRIT_SECTION();
    4758                 :           9 :     MyProc->delayChkptFlags |= DELAY_CHKPT_START;
    4759                 :             : 
    4760                 :           9 :     XLogChecksums(PG_DATA_CHECKSUM_INPROGRESS_ON);
    4761                 :             : 
    4762                 :           9 :     SpinLockAcquire(&XLogCtl->info_lck);
    4763                 :           9 :     XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_ON;
    4764                 :           9 :     SpinLockRelease(&XLogCtl->info_lck);
    4765                 :             : 
    4766                 :           9 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    4767                 :           9 :     ControlFile->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_ON;
    4768                 :           9 :     UpdateControlFile();
    4769                 :           9 :     LWLockRelease(ControlFileLock);
    4770                 :             : 
    4771                 :           9 :     barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON);
    4772                 :             : 
    4773                 :           9 :     MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
    4774                 :           9 :     END_CRIT_SECTION();
    4775                 :             : 
    4776                 :           9 :     WaitForProcSignalBarrier(barrier);
    4777                 :           9 : }
    4778                 :             : 
    4779                 :             : /*
    4780                 :             :  * SetDataChecksumsOn
    4781                 :             :  *      Set data checksums state to 'on' cluster-wide
    4782                 :             :  *
    4783                 :             :  * Enabling data checksums is performed using two barriers, the first one to
    4784                 :             :  * set the state to "inprogress-on" (done by SetDataChecksumsOnInProgress())
    4785                 :             :  * and the second one to set the state to "on" (done here). Below is a short
    4786                 :             :  * description of the processing, a more detailed write-up can be found in
    4787                 :             :  * datachecksum_state.c.
    4788                 :             :  *
    4789                 :             :  * To start the process of enabling data checksums in a running cluster the
    4790                 :             :  * data_checksum_version state must be changed to "inprogress-on".  This state
    4791                 :             :  * requires data checksums to be written but not verified. This ensures that
    4792                 :             :  * all data pages can be checksummed without the risk of false negatives in
    4793                 :             :  * validation during the process.  When all existing pages are guaranteed to
    4794                 :             :  * have checksums, and all new pages will be initiated with checksums, the
    4795                 :             :  * state can be changed to "on". Once the state is "on" checksums will be both
    4796                 :             :  * written and verified.
    4797                 :             :  *
    4798                 :             :  * This function blocks until all backends in the cluster have acknowledged the
    4799                 :             :  * state transition.
                         :             :  *
                         :             :  * If the shared state is not "inprogress-on" when called, a WARNING is
                         :             :  * emitted and SetDataChecksumsOff() is called instead of enabling checksums.
                         :             :  * On the normal path a checkpoint is requested with CHECKPOINT_FORCE |
                         :             :  * CHECKPOINT_WAIT | CHECKPOINT_FAST after the critical section and before
                         :             :  * the barrier wait.
    4800                 :             :  */
    4801                 :             : void
    4802                 :           7 : SetDataChecksumsOn(void)
    4803                 :             : {
    4804                 :             :     uint64      barrier;
    4805                 :             : 
    4806                 :           7 :     SpinLockAcquire(&XLogCtl->info_lck);
    4807                 :             : 
    4808                 :             :     /*
    4809                 :             :      * The only allowed state transition to "on" is from "inprogress-on" since
    4810                 :             :      * that state ensures that all pages will have data checksums written. Any
    4811                 :             :      * other attempted state transition is likely due to a programmer error.
                         :             :      *
                         :             :      * NOTE(review): info_lck is released after this check and re-acquired
                         :             :      * further down when the new state is stored, so the check and the update
                         :             :      * are not atomic.  Presumably only one process drives these transitions
                         :             :      * at a time -- confirm.
    4812                 :             :      */
    4813         [ -  + ]:           7 :     if (XLogCtl->data_checksum_version != PG_DATA_CHECKSUM_INPROGRESS_ON)
    4814                 :             :     {
    4815                 :           0 :         SpinLockRelease(&XLogCtl->info_lck);
    4816         [ #  # ]:           0 :         elog(WARNING,
    4817                 :             :              "cannot set data checksums to \"on\", current state is not \"inprogress-on\", disabling");
    4818                 :           0 :         SetDataChecksumsOff();
    4819                 :           0 :         return;
    4820                 :             :     }
    4821                 :             : 
    4822                 :           7 :     SpinLockRelease(&XLogCtl->info_lck);
    4823                 :             : 
    4824                 :           7 :     INJECTION_POINT("datachecksums-enable-checksums-delay", NULL);
    4825                 :           7 :     START_CRIT_SECTION();
    4826                 :           7 :     MyProc->delayChkptFlags |= DELAY_CHKPT_START;
    4827                 :             : 
    4828                 :           7 :     XLogChecksums(PG_DATA_CHECKSUM_VERSION);
    4829                 :             : 
    4830                 :           7 :     SpinLockAcquire(&XLogCtl->info_lck);
    4831                 :           7 :     XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_VERSION;
    4832                 :           7 :     SpinLockRelease(&XLogCtl->info_lck);
    4833                 :             : 
    4834                 :             :     /*
    4835                 :             :      * Update the controlfile before waiting since if we have an immediate
    4836                 :             :      * shutdown while waiting we want to come back up with checksums enabled.
    4837                 :             :      */
    4838                 :           7 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    4839                 :           7 :     ControlFile->data_checksum_version = PG_DATA_CHECKSUM_VERSION;
    4840                 :           7 :     UpdateControlFile();
    4841                 :           7 :     LWLockRelease(ControlFileLock);
    4842                 :             : 
    4843                 :           7 :     barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON);
    4844                 :             : 
    4845                 :           7 :     MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
    4846                 :           7 :     END_CRIT_SECTION();
    4847                 :             : 
    4848                 :           7 :     RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST);
    4849                 :           7 :     WaitForProcSignalBarrier(barrier);
    4850                 :             : }
    4851                 :             : 
    4852                 :             : /*
    4853                 :             :  * SetDataChecksumsOff
    4854                 :             :  *      Disables data checksums cluster-wide
    4855                 :             :  *
    4856                 :             :  * Disabling data checksums must be performed with two sets of barriers, each
    4857                 :             :  * carrying a different state. The state is first set to "inprogress-off"
    4858                 :             :  * during which checksums are still written but not verified. This ensures that
    4859                 :             :  * backends which have yet to observe the state change from "on" won't get
    4860                 :             :  * validation errors on concurrently modified pages. Once all backends have
    4861                 :             :  * changed to "inprogress-off", the barrier for moving to "off" can be emitted.
    4862                 :             :  * This function blocks until all backends in the cluster have acknowledged the
    4863                 :             :  * state transition.
                         :             :  *
                         :             :  * Returns immediately if the shared state is already "off".  Starting from
                         :             :  * "on" or "inprogress-on" both transitions are performed; from any other
                         :             :  * state only the final transition to "off" is done.  Each transition is
                         :             :  * followed by a checkpoint request (CHECKPOINT_FORCE | CHECKPOINT_WAIT |
                         :             :  * CHECKPOINT_FAST) and then a wait for the emitted barrier.
                         :             :  *
                         :             :  * NOTE(review): the current state is examined under info_lck, but the lock
                         :             :  * is released before the new state is stored under a fresh acquisition, so
                         :             :  * check and update are not atomic.  Presumably only one process drives these
                         :             :  * transitions at a time -- confirm.
    4864                 :             :  */
    4865                 :             : void
    4866                 :           7 : SetDataChecksumsOff(void)
    4867                 :             : {
    4868                 :             :     uint64      barrier;
    4869                 :             : 
    4870                 :           7 :     SpinLockAcquire(&XLogCtl->info_lck);
    4871                 :             : 
    4872                 :             :     /* If data checksums are already disabled there is nothing to do */
    4873         [ -  + ]:           7 :     if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_OFF)
    4874                 :             :     {
    4875                 :           0 :         SpinLockRelease(&XLogCtl->info_lck);
    4876                 :           0 :         return;
    4877                 :             :     }
    4878                 :             : 
    4879                 :             :     /*
    4880                 :             :      * If data checksums are currently enabled, or in the process of being
    4881                 :             :      * enabled, we first transition to the "inprogress-off" state during which
    4882                 :             :      * backends continue to write checksums without verifying them. When all
    4883                 :             :      * backends are in "inprogress-off" the next transition to "off" can be
    4884                 :             :      * performed, after which all data checksum processing is disabled.
    4885                 :             :      */
    4886         [ +  + ]:           7 :     if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_VERSION ||
    4887         [ +  - ]:           2 :         XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_ON)
    4888                 :             :     {
    4889                 :           7 :         SpinLockRelease(&XLogCtl->info_lck);
    4890                 :             : 
    4891                 :           7 :         START_CRIT_SECTION();
    4892                 :           7 :         MyProc->delayChkptFlags |= DELAY_CHKPT_START;
    4893                 :             : 
    4894                 :           7 :         XLogChecksums(PG_DATA_CHECKSUM_INPROGRESS_OFF);
    4895                 :             : 
    4896                 :           7 :         SpinLockAcquire(&XLogCtl->info_lck);
    4897                 :           7 :         XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_OFF;
    4898                 :           7 :         SpinLockRelease(&XLogCtl->info_lck);
    4899                 :             : 
    4900                 :           7 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    4901                 :           7 :         ControlFile->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_OFF;
    4902                 :           7 :         UpdateControlFile();
    4903                 :           7 :         LWLockRelease(ControlFileLock);
    4904                 :             : 
    4905                 :           7 :         barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF);
    4906                 :             : 
    4907                 :           7 :         MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
    4908                 :           7 :         END_CRIT_SECTION();
    4909                 :             : 
    4910                 :           7 :         RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST);
    4911                 :           7 :         WaitForProcSignalBarrier(barrier);
    4912                 :             : 
    4913                 :             :         /*
    4914                 :             :          * At this point we know that no backends are verifying data checksums
    4915                 :             :          * during reading. Next, we can safely move to state "off" to also
    4916                 :             :          * stop writing checksums.
    4917                 :             :          */
    4918                 :             :     }
    4919                 :             :     else
    4920                 :             :     {
    4921                 :             :         /*
    4922                 :             :          * Ending up here implies that the checksums state is "inprogress-off"
    4923                 :             :          * and we can transition directly to "off" from there.
    4924                 :             :          */
    4925                 :           0 :         SpinLockRelease(&XLogCtl->info_lck);
    4926                 :             :     }
    4927                 :             : 
    4928                 :           7 :     START_CRIT_SECTION();
    4929                 :             :     /* Ensure that we don't incur a checkpoint during disabling checksums */
    4930                 :           7 :     MyProc->delayChkptFlags |= DELAY_CHKPT_START;
    4931                 :             : 
    4932                 :           7 :     XLogChecksums(PG_DATA_CHECKSUM_OFF);
    4933                 :             : 
    4934                 :           7 :     SpinLockAcquire(&XLogCtl->info_lck);
    4935                 :           7 :     XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_OFF;
    4936                 :           7 :     SpinLockRelease(&XLogCtl->info_lck);
    4937                 :             : 
    4938                 :           7 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    4939                 :           7 :     ControlFile->data_checksum_version = PG_DATA_CHECKSUM_OFF;
    4940                 :           7 :     UpdateControlFile();
    4941                 :           7 :     LWLockRelease(ControlFileLock);
    4942                 :             : 
    4943                 :           7 :     barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF);
    4944                 :             : 
    4945                 :           7 :     MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
    4946                 :           7 :     END_CRIT_SECTION();
    4947                 :             : 
    4948                 :           7 :     RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST);
    4949                 :           7 :     WaitForProcSignalBarrier(barrier);
    4950                 :             : }
    4951                 :             : 
    4952                 :             : /*
    4953                 :             :  * InitLocalDataChecksumState
    4954                 :             :  *
    4955                 :             :  * Set up backend local caches of controldata variables which may change at
    4956                 :             :  * any point during runtime and thus require special cased locking. So far
    4957                 :             :  * this only applies to data_checksum_version, but it's intended to be general
    4958                 :             :  * purpose enough to handle future cases.
                         :             :  *
                         :             :  * The caller must have InterruptHoldoffCount > 0 (asserted).  The shared
                         :             :  * XLogCtl->data_checksum_version is read and handed to
                         :             :  * SetLocalDataChecksumState() while info_lck is held; that helper, as
                         :             :  * defined in this file, only performs two assignments.
    4959                 :             :  */
    4960                 :             : void
    4961                 :       24490 : InitLocalDataChecksumState(void)
    4962                 :             : {
    4963                 :             :     Assert(InterruptHoldoffCount > 0);
    4964                 :       24490 :     SpinLockAcquire(&XLogCtl->info_lck);
    4965                 :       24490 :     SetLocalDataChecksumState(XLogCtl->data_checksum_version);
    4966                 :       24490 :     SpinLockRelease(&XLogCtl->info_lck);
    4967                 :       24490 : }
    4968                 :             : 
    4969                 :             : void
    4970                 :       27816 : SetLocalDataChecksumState(uint32 data_checksum_version)
    4971                 :             : {
    4972                 :       27816 :     LocalDataChecksumState = data_checksum_version;
    4973                 :             : 
    4974                 :       27816 :     data_checksums = data_checksum_version;
    4975                 :       27816 : }
    4976                 :             : 
    4977                 :             : /* guc hook */
    4978                 :             : const char *
    4979                 :        1930 : show_data_checksums(void)
    4980                 :             : {
    4981                 :        1930 :     return get_checksum_state_string(LocalDataChecksumState);
    4982                 :             : }
    4983                 :             : 
    4984                 :             : /*
    4985                 :             :  * Return true if the cluster was initialized on a platform where the
    4986                 :             :  * default signedness of char is "signed". This function exists for code
    4987                 :             :  * that deals with pre-v18 data files that store data sorted by the 'char'
    4988                 :             :  * type on disk (e.g., GIN and GiST indexes). See the comments in
    4989                 :             :  * WriteControlFile() for details.
    4990                 :             :  */
    4991                 :             : bool
    4992                 :       89903 : GetDefaultCharSignedness(void)
    4993                 :             : {
    4994                 :       89903 :     return ControlFile->default_char_signedness;
    4995                 :             : }
    4996                 :             : 
    4997                 :             : /*
    4998                 :             :  * Returns a fake LSN for unlogged relations.
    4999                 :             :  *
    5000                 :             :  * Each call generates an LSN that is greater than any previous value
    5001                 :             :  * returned. The current counter value is saved and restored across clean
    5002                 :             :  * shutdowns, but like unlogged relations, does not survive a crash. This can
    5003                 :             :  * be used in lieu of real LSN values returned by XLogInsert, if you need an
    5004                 :             :  * LSN-like increasing sequence of numbers without writing any WAL.
    5005                 :             :  */
    5006                 :             : XLogRecPtr
    5007                 :      202661 : GetFakeLSNForUnloggedRel(void)
    5008                 :             : {
    5009                 :      202661 :     return pg_atomic_fetch_add_u64(&XLogCtl->unloggedLSN, 1);
    5010                 :             : }
    5011                 :             : 
    5012                 :             : /*
    5013                 :             :  * Auto-tune the number of XLOG buffers.
    5014                 :             :  *
    5015                 :             :  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
    5016                 :             :  * a maximum of one XLOG segment (there is little reason to think that more
    5017                 :             :  * is helpful, at least so long as we force an fsync when switching log files)
    5018                 :             :  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
    5019                 :             :  * 9.1, when auto-tuning was added).
    5020                 :             :  *
    5021                 :             :  * This should not be called until NBuffers has received its final value.
    5022                 :             :  */
    5023                 :             : static int
    5024                 :        1244 : XLOGChooseNumBuffers(void)
    5025                 :             : {
    5026                 :             :     int         xbuffers;
    5027                 :             : 
    5028                 :        1244 :     xbuffers = NBuffers / 32;
    5029         [ +  + ]:        1244 :     if (xbuffers > (wal_segment_size / XLOG_BLCKSZ))
    5030                 :          24 :         xbuffers = (wal_segment_size / XLOG_BLCKSZ);
    5031         [ +  + ]:        1244 :     if (xbuffers < 8)
    5032                 :         474 :         xbuffers = 8;
    5033                 :        1244 :     return xbuffers;
    5034                 :             : }
    5035                 :             : 
    5036                 :             : /*
    5037                 :             :  * GUC check_hook for wal_buffers
    5038                 :             :  */
    5039                 :             : bool
    5040                 :        2535 : check_wal_buffers(int *newval, void **extra, GucSource source)
    5041                 :             : {
    5042                 :             :     /*
    5043                 :             :      * -1 indicates a request for auto-tune.
    5044                 :             :      */
    5045         [ +  + ]:        2535 :     if (*newval == -1)
    5046                 :             :     {
    5047                 :             :         /*
    5048                 :             :          * If we haven't yet changed the boot_val default of -1, just let it
    5049                 :             :          * be.  We'll fix it when XLOGShmemRequest is called.
    5050                 :             :          */
    5051         [ +  - ]:        1291 :         if (XLOGbuffers == -1)
    5052                 :        1291 :             return true;
    5053                 :             : 
    5054                 :             :         /* Otherwise, substitute the auto-tune value */
    5055                 :           0 :         *newval = XLOGChooseNumBuffers();
    5056                 :             :     }
    5057                 :             : 
    5058                 :             :     /*
    5059                 :             :      * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
    5060                 :             :      * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
    5061                 :             :      * the case, we just silently treat such values as a request for the
    5062                 :             :      * minimum.  (We could throw an error instead, but that doesn't seem very
    5063                 :             :      * helpful.)
    5064                 :             :      */
    5065         [ -  + ]:        1244 :     if (*newval < 4)
    5066                 :           0 :         *newval = 4;
    5067                 :             : 
    5068                 :        1244 :     return true;
    5069                 :             : }
    5070                 :             : 
    5071                 :             : /*
    5072                 :             :  * GUC check_hook for wal_consistency_checking
    5073                 :             :  */
    5074                 :             : bool
    5075                 :        2277 : check_wal_consistency_checking(char **newval, void **extra, GucSource source)
    5076                 :             : {
    5077                 :             :     char       *rawstring;
    5078                 :             :     List       *elemlist;
    5079                 :             :     ListCell   *l;
    5080                 :             :     bool        newwalconsistency[RM_MAX_ID + 1];
    5081                 :             : 
    5082                 :             :     /* Initialize the array */
    5083   [ +  -  +  -  :       75141 :     MemSet(newwalconsistency, 0, (RM_MAX_ID + 1) * sizeof(bool));
          +  -  +  -  +  
                      + ]
    5084                 :             : 
    5085                 :             :     /* Need a modifiable copy of string */
    5086                 :        2277 :     rawstring = pstrdup(*newval);
    5087                 :             : 
    5088                 :             :     /* Parse string into list of identifiers */
    5089         [ -  + ]:        2277 :     if (!SplitIdentifierString(rawstring, ',', &elemlist))
    5090                 :             :     {
    5091                 :             :         /* syntax error in list */
    5092                 :           0 :         GUC_check_errdetail("List syntax is invalid.");
    5093                 :           0 :         pfree(rawstring);
    5094                 :           0 :         list_free(elemlist);
    5095                 :           0 :         return false;
    5096                 :             :     }
    5097                 :             : 
    5098   [ +  +  +  +  :        2773 :     foreach(l, elemlist)
                   +  + ]
    5099                 :             :     {
    5100                 :         496 :         char       *tok = (char *) lfirst(l);
    5101                 :             :         int         rmid;
    5102                 :             : 
    5103                 :             :         /* Check for 'all'. */
    5104         [ +  + ]:         496 :         if (pg_strcasecmp(tok, "all") == 0)
    5105                 :             :         {
    5106         [ +  + ]:      126958 :             for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
    5107   [ +  +  +  + ]:      126464 :                 if (RmgrIdExists(rmid) && GetRmgr(rmid).rm_mask != NULL)
    5108                 :        4940 :                     newwalconsistency[rmid] = true;
    5109                 :             :         }
    5110                 :             :         else
    5111                 :             :         {
    5112                 :             :             /* Check if the token matches any known resource manager. */
    5113                 :           2 :             bool        found = false;
    5114                 :             : 
    5115         [ +  - ]:          36 :             for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
    5116                 :             :             {
    5117   [ +  -  +  +  :          54 :                 if (RmgrIdExists(rmid) && GetRmgr(rmid).rm_mask != NULL &&
                   +  + ]
    5118                 :          18 :                     pg_strcasecmp(tok, GetRmgr(rmid).rm_name) == 0)
    5119                 :             :                 {
    5120                 :           2 :                     newwalconsistency[rmid] = true;
    5121                 :           2 :                     found = true;
    5122                 :           2 :                     break;
    5123                 :             :                 }
    5124                 :             :             }
    5125         [ -  + ]:           2 :             if (!found)
    5126                 :             :             {
    5127                 :             :                 /*
    5128                 :             :                  * During startup, it might be a not-yet-loaded custom
    5129                 :             :                  * resource manager.  Defer checking until
    5130                 :             :                  * InitializeWalConsistencyChecking().
    5131                 :             :                  */
    5132         [ #  # ]:           0 :                 if (!process_shared_preload_libraries_done)
    5133                 :             :                 {
    5134                 :           0 :                     check_wal_consistency_checking_deferred = true;
    5135                 :             :                 }
    5136                 :             :                 else
    5137                 :             :                 {
    5138                 :           0 :                     GUC_check_errdetail("Unrecognized key word: \"%s\".", tok);
    5139                 :           0 :                     pfree(rawstring);
    5140                 :           0 :                     list_free(elemlist);
    5141                 :           0 :                     return false;
    5142                 :             :                 }
    5143                 :             :             }
    5144                 :             :         }
    5145                 :             :     }
    5146                 :             : 
    5147                 :        2277 :     pfree(rawstring);
    5148                 :        2277 :     list_free(elemlist);
    5149                 :             : 
    5150                 :             :     /* assign new value */
    5151                 :        2277 :     *extra = guc_malloc(LOG, (RM_MAX_ID + 1) * sizeof(bool));
    5152         [ -  + ]:        2277 :     if (!*extra)
    5153                 :           0 :         return false;
    5154                 :        2277 :     memcpy(*extra, newwalconsistency, (RM_MAX_ID + 1) * sizeof(bool));
    5155                 :        2277 :     return true;
    5156                 :             : }
    5157                 :             : 
    5158                 :             : /*
    5159                 :             :  * GUC assign_hook for wal_consistency_checking
    5160                 :             :  */
    5161                 :             : void
    5162                 :        2276 : assign_wal_consistency_checking(const char *newval, void *extra)
    5163                 :             : {
    5164                 :             :     /*
    5165                 :             :      * If some checks were deferred, it's possible that the checks will fail
    5166                 :             :      * later during InitializeWalConsistencyChecking(). But in that case, the
    5167                 :             :      * postmaster will exit anyway, so it's safe to proceed with the
    5168                 :             :      * assignment.
    5169                 :             :      *
    5170                 :             :      * Any built-in resource managers specified are assigned immediately,
    5171                 :             :      * which affects WAL created before shared_preload_libraries are
    5172                 :             :      * processed. Any custom resource managers specified won't be assigned
    5173                 :             :      * until after shared_preload_libraries are processed, but that's OK
    5174                 :             :      * because WAL for a custom resource manager can't be written before the
    5175                 :             :      * module is loaded anyway.
    5176                 :             :      */
    5177                 :        2276 :     wal_consistency_checking = extra;
    5178                 :        2276 : }
    5179                 :             : 
    5180                 :             : /*
    5181                 :             :  * InitializeWalConsistencyChecking: run after loading custom resource managers
    5182                 :             :  *
    5183                 :             :  * If any unknown resource managers were specified in the
    5184                 :             :  * wal_consistency_checking GUC, processing was deferred.  Now that
    5185                 :             :  * shared_preload_libraries have been loaded, process wal_consistency_checking
    5186                 :             :  * again.
    5187                 :             :  */
    5188                 :             : void
    5189                 :        1073 : InitializeWalConsistencyChecking(void)
    5190                 :             : {
    5191                 :             :     Assert(process_shared_preload_libraries_done);
    5192                 :             : 
    5193         [ -  + ]:        1073 :     if (check_wal_consistency_checking_deferred)
    5194                 :             :     {
    5195                 :             :         struct config_generic *guc;
    5196                 :             : 
    5197                 :           0 :         guc = find_option("wal_consistency_checking", false, false, ERROR);
    5198                 :             : 
    5199                 :           0 :         check_wal_consistency_checking_deferred = false;
    5200                 :             : 
    5201                 :           0 :         set_config_option_ext("wal_consistency_checking",
    5202                 :             :                               wal_consistency_checking_string,
    5203                 :             :                               guc->scontext, guc->source, guc->srole,
    5204                 :             :                               GUC_ACTION_SET, true, ERROR, false);
    5205                 :             : 
    5206                 :             :         /* checking should not be deferred again */
    5207                 :             :         Assert(!check_wal_consistency_checking_deferred);
    5208                 :             :     }
    5209                 :        1073 : }
    5210                 :             : 
    5211                 :             : /*
    5212                 :             :  * GUC show_hook for archive_command
    5213                 :             :  */
    5214                 :             : const char *
    5215                 :        1926 : show_archive_command(void)
    5216                 :             : {
    5217         [ +  + ]:        1926 :     if (XLogArchivingActive())
    5218                 :         109 :         return XLogArchiveCommand;
    5219                 :             :     else
    5220                 :        1817 :         return "(disabled)";
    5221                 :             : }
    5222                 :             : 
    5223                 :             : /*
    5224                 :             :  * GUC show_hook for in_hot_standby
    5225                 :             :  */
    5226                 :             : const char *
    5227                 :       17535 : show_in_hot_standby(void)
    5228                 :             : {
    5229                 :             :     /*
    5230                 :             :      * We display the actual state based on shared memory, so that this GUC
    5231                 :             :      * reports up-to-date state if examined intra-query.  The underlying
    5232                 :             :      * variable (in_hot_standby_guc) changes only when we transmit a new value
    5233                 :             :      * to the client.
    5234                 :             :      */
    5235         [ +  + ]:       17535 :     return RecoveryInProgress() ? "on" : "off";
    5236                 :             : }
    5237                 :             : 
    5238                 :             : /*
    5239                 :             :  * GUC show_hook for effective_wal_level
    5240                 :             :  */
    5241                 :             : const char *
    5242                 :        1965 : show_effective_wal_level(void)
    5243                 :             : {
    5244         [ +  + ]:        1965 :     if (wal_level == WAL_LEVEL_MINIMAL)
    5245                 :         239 :         return "minimal";
    5246                 :             : 
    5247                 :             :     /*
    5248                 :             :      * During recovery, effective_wal_level reflects the primary's
    5249                 :             :      * configuration rather than the local wal_level value.
    5250                 :             :      */
    5251         [ +  + ]:        1726 :     if (RecoveryInProgress())
    5252         [ +  + ]:          31 :         return IsXLogLogicalInfoEnabled() ? "logical" : "replica";
    5253                 :             : 
    5254   [ +  +  +  + ]:        1695 :     return XLogLogicalInfoActive() ? "logical" : "replica";
    5255                 :             : }
    5256                 :             : 
    5257                 :             : /*
    5258                 :             :  * Read the control file, set respective GUCs.
    5259                 :             :  *
    5260                 :             :  * This is to be called during startup, including a crash recovery cycle,
    5261                 :             :  * unless in bootstrap mode, where no control file yet exists.  As there's no
    5262                 :             :  * usable shared memory yet (its sizing can depend on the contents of the
    5263                 :             :  * control file!), first store the contents in local memory. XLOGShmemInit()
    5264                 :             :  * will then copy it to shared memory later.
    5265                 :             :  *
    5266                 :             :  * reset just controls whether previous contents are to be expected (in the
    5267                 :             :  * reset case, there's a dangling pointer into old shared memory), or not.
    5268                 :             :  */
    5269                 :             : void
    5270                 :        1091 : LocalProcessControlFile(bool reset)
    5271                 :             : {
    5272                 :             :     Assert(reset || ControlFile == NULL);
    5273                 :        1091 :     LocalControlFile = palloc_object(ControlFileData);
    5274                 :        1091 :     ControlFile = LocalControlFile;
    5275                 :        1091 :     ReadControlFile();
    5276                 :        1091 :     SetLocalDataChecksumState(ControlFile->data_checksum_version);
    5277                 :        1091 : }
    5278                 :             : 
    5279                 :             : /*
    5280                 :             :  * Get the wal_level from the control file. For a standby, this value should be
    5281                 :             :  * considered as its active wal_level, because it may be different from what
    5282                 :             :  * was originally configured on standby.
    5283                 :             :  */
    5284                 :             : WalLevel
    5285                 :           0 : GetActiveWalLevelOnStandby(void)
    5286                 :             : {
    5287                 :           0 :     return ControlFile->wal_level;
    5288                 :             : }
    5289                 :             : 
    5290                 :             : /*
    5291                 :             :  * Register shared memory for XLOG.
    5292                 :             :  */
    5293                 :             : static void
    5294                 :        1249 : XLOGShmemRequest(void *arg)
    5295                 :             : {
    5296                 :             :     Size        size;
    5297                 :             : 
    5298                 :             :     /*
    5299                 :             :      * If the value of wal_buffers is -1, use the preferred auto-tune value.
    5300                 :             :      * This isn't an amazingly clean place to do this, but we must wait till
    5301                 :             :      * NBuffers has received its final value, and must do it before using the
    5302                 :             :      * value of XLOGbuffers to do anything important.
    5303                 :             :      *
    5304                 :             :      * We prefer to report this value's source as PGC_S_DYNAMIC_DEFAULT.
    5305                 :             :      * However, if the DBA explicitly set wal_buffers = -1 in the config file,
    5306                 :             :      * then PGC_S_DYNAMIC_DEFAULT will fail to override that and we must force
    5307                 :             :      * the matter with PGC_S_OVERRIDE.
    5308                 :             :      */
    5309         [ +  + ]:        1249 :     if (XLOGbuffers == -1)
    5310                 :             :     {
    5311                 :             :         char        buf[32];
    5312                 :             : 
    5313                 :        1244 :         snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
    5314                 :        1244 :         SetConfigOption("wal_buffers", buf, PGC_POSTMASTER,
    5315                 :             :                         PGC_S_DYNAMIC_DEFAULT);
    5316         [ -  + ]:        1244 :         if (XLOGbuffers == -1)  /* failed to apply it? */
    5317                 :           0 :             SetConfigOption("wal_buffers", buf, PGC_POSTMASTER,
    5318                 :             :                             PGC_S_OVERRIDE);
    5319                 :             :     }
    5320                 :             :     Assert(XLOGbuffers > 0);
    5321                 :             : 
    5322                 :             :     /* XLogCtl */
    5323                 :        1249 :     size = sizeof(XLogCtlData);
    5324                 :             : 
    5325                 :             :     /* WAL insertion locks, plus alignment */
    5326                 :        1249 :     size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
    5327                 :             :     /* xlblocks array */
    5328                 :        1249 :     size = add_size(size, mul_size(sizeof(pg_atomic_uint64), XLOGbuffers));
    5329                 :             :     /* extra alignment padding for XLOG I/O buffers */
    5330                 :        1249 :     size = add_size(size, Max(XLOG_BLCKSZ, PG_IO_ALIGN_SIZE));
    5331                 :             :     /* and the buffers themselves */
    5332                 :        1249 :     size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
    5333                 :             : 
    5334                 :        1249 :     ShmemRequestStruct(.name = "XLOG Ctl",
    5335                 :             :                        .size = size,
    5336                 :             :                        .ptr = (void **) &XLogCtl,
    5337                 :             :         );
    5338                 :        1249 :     ShmemRequestStruct(.name = "Control File",
    5339                 :             :                        .size = sizeof(ControlFileData),
    5340                 :             :                        .ptr = (void **) &ControlFile,
    5341                 :             :         );
    5342                 :        1249 : }
    5343                 :             : 
    5344                 :             : /*
    5345                 :             :  * XLOGShmemInit - initialize the XLogCtl shared memory area.
    5346                 :             :  */
    5347                 :             : static void
    5348                 :        1246 : XLOGShmemInit(void *arg)
    5349                 :             : {
    5350                 :             :     char       *allocptr;
    5351                 :             :     int         i;
    5352                 :             : 
    5353                 :             : #ifdef WAL_DEBUG
    5354                 :             : 
    5355                 :             :     /*
    5356                 :             :      * Create a memory context for WAL debugging that's exempt from the normal
    5357                 :             :      * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
    5358                 :             :      * an allocation fails, but wal_debug is not for production use anyway.
    5359                 :             :      */
    5360                 :             :     if (walDebugCxt == NULL)
    5361                 :             :     {
    5362                 :             :         walDebugCxt = AllocSetContextCreate(TopMemoryContext,
    5363                 :             :                                             "WAL Debug",
    5364                 :             :                                             ALLOCSET_DEFAULT_SIZES);
    5365                 :             :         MemoryContextAllowInCriticalSection(walDebugCxt, true);
    5366                 :             :     }
    5367                 :             : #endif
    5368                 :             : 
    5369                 :        1246 :     memset(XLogCtl, 0, sizeof(XLogCtlData));
    5370                 :             : 
    5371                 :             :     /*
    5372                 :             :      * Already have read control file locally, unless in bootstrap mode. Move
    5373                 :             :      * contents into shared memory.
    5374                 :             :      */
    5375         [ +  + ]:        1246 :     if (LocalControlFile)
    5376                 :             :     {
    5377                 :        1075 :         memcpy(ControlFile, LocalControlFile, sizeof(ControlFileData));
    5378                 :        1075 :         pfree(LocalControlFile);
    5379                 :        1075 :         LocalControlFile = NULL;
    5380                 :             :     }
    5381                 :             : 
    5382                 :             :     /*
    5383                 :             :      * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
    5384                 :             :      * multiple of the alignment for same, so no extra alignment padding is
    5385                 :             :      * needed here.
    5386                 :             :      */
    5387                 :        1246 :     allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
    5388                 :        1246 :     XLogCtl->xlblocks = (pg_atomic_uint64 *) allocptr;
    5389                 :        1246 :     allocptr += sizeof(pg_atomic_uint64) * XLOGbuffers;
    5390                 :             : 
    5391         [ +  + ]:      362157 :     for (i = 0; i < XLOGbuffers; i++)
    5392                 :             :     {
    5393                 :      360911 :         pg_atomic_init_u64(&XLogCtl->xlblocks[i], InvalidXLogRecPtr);
    5394                 :             :     }
    5395                 :             : 
    5396                 :             :     /* WAL insertion locks. Ensure they're aligned to the full padded size */
    5397                 :        1246 :     allocptr += sizeof(WALInsertLockPadded) -
    5398                 :        1246 :         ((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
    5399                 :        1246 :     WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
    5400                 :             :         (WALInsertLockPadded *) allocptr;
    5401                 :        1246 :     allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;
    5402                 :             : 
    5403         [ +  + ]:       11214 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    5404                 :             :     {
    5405                 :        9968 :         LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
    5406                 :        9968 :         pg_atomic_init_u64(&WALInsertLocks[i].l.insertingAt, InvalidXLogRecPtr);
    5407                 :        9968 :         WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
    5408                 :             :     }
    5409                 :             : 
    5410                 :             :     /*
    5411                 :             :      * Align the start of the page buffers to a full xlog block size boundary.
    5412                 :             :      * This simplifies some calculations in XLOG insertion. It is also
    5413                 :             :      * required for O_DIRECT.
    5414                 :             :      */
    5415                 :        1246 :     allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
    5416                 :        1246 :     XLogCtl->pages = allocptr;
    5417                 :        1246 :     memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
    5418                 :             : 
    5419                 :             :     /*
    5420                 :             :      * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
    5421                 :             :      * in additional info.)
    5422                 :             :      */
    5423                 :        1246 :     XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
    5424                 :        1246 :     XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
    5425                 :        1246 :     XLogCtl->InstallXLogFileSegmentActive = false;
    5426                 :        1246 :     XLogCtl->WalWriterSleeping = false;
    5427                 :             : 
    5428                 :             :     /* Use the checksum info from control file */
    5429                 :        1246 :     XLogCtl->data_checksum_version = ControlFile->data_checksum_version;
    5430                 :        1246 :     SetLocalDataChecksumState(XLogCtl->data_checksum_version);
    5431                 :             : 
    5432                 :        1246 :     SpinLockInit(&XLogCtl->Insert.insertpos_lck);
    5433                 :        1246 :     SpinLockInit(&XLogCtl->info_lck);
    5434                 :        1246 :     pg_atomic_init_u64(&XLogCtl->logInsertResult, InvalidXLogRecPtr);
    5435                 :        1246 :     pg_atomic_init_u64(&XLogCtl->logWriteResult, InvalidXLogRecPtr);
    5436                 :        1246 :     pg_atomic_init_u64(&XLogCtl->logFlushResult, InvalidXLogRecPtr);
    5437                 :        1246 :     pg_atomic_init_u64(&XLogCtl->unloggedLSN, InvalidXLogRecPtr);
    5438                 :        1246 : }
    5439                 :             : 
    5440                 :             : /*
    5441                 :             :  * XLOGShmemAttach - re-establish WALInsertLocks pointer after attaching.
    5442                 :             :  */
    5443                 :             : static void
    5444                 :           0 : XLOGShmemAttach(void *arg)
    5445                 :             : {
    5446                 :           0 :     WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
    5447                 :           0 : }
    5448                 :             : 
    5449                 :             : /*
    5450                 :             :  * This func must be called ONCE on system install.  It creates pg_control
    5451                 :             :  * and the initial XLOG segment.
    5452                 :             :  */
    5453                 :             : void
    5454                 :          57 : BootStrapXLOG(uint32 data_checksum_version)
    5455                 :             : {
    5456                 :             :     CheckPoint  checkPoint;
    5457                 :             :     PGAlignedXLogBlock buffer;
    5458                 :             :     XLogPageHeader page;
    5459                 :             :     XLogLongPageHeader longpage;
    5460                 :             :     XLogRecord *record;
    5461                 :             :     char       *recptr;
    5462                 :             :     uint64      sysidentifier;
    5463                 :             :     struct timeval tv;
    5464                 :             :     pg_crc32c   crc;
    5465                 :             : 
    5466                 :             :     /* allow ordinary WAL segment creation, like StartupXLOG() would */
    5467                 :          57 :     SetInstallXLogFileSegmentActive();
    5468                 :             : 
    5469                 :             :     /*
    5470                 :             :      * Select a hopefully-unique system identifier code for this installation.
    5471                 :             :      * We use the result of gettimeofday(), including the fractional seconds
    5472                 :             :      * field, as being about as unique as we can easily get.  (Think not to
    5473                 :             :      * use random(), since it hasn't been seeded and there's no portable way
    5474                 :             :      * to seed it other than the system clock value...)  The upper half of the
    5475                 :             :      * uint64 value is just the tv_sec part, while the lower half contains the
    5476                 :             :      * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
    5477                 :             :      * PID for a little extra uniqueness.  A person knowing this encoding can
    5478                 :             :      * determine the initialization time of the installation, which could
    5479                 :             :      * perhaps be useful sometimes.
    5480                 :             :      */
    5481                 :          57 :     gettimeofday(&tv, NULL);
    5482                 :          57 :     sysidentifier = ((uint64) tv.tv_sec) << 32;
    5483                 :          57 :     sysidentifier |= ((uint64) tv.tv_usec) << 12;
    5484                 :          57 :     sysidentifier |= getpid() & 0xFFF;
    5485                 :             : 
    5486                 :          57 :     memset(&buffer, 0, sizeof buffer);
    5487                 :          57 :     page = (XLogPageHeader) &buffer;
    5488                 :             : 
    5489                 :             :     /*
    5490                 :             :      * Set up information for the initial checkpoint record
    5491                 :             :      *
    5492                 :             :      * The initial checkpoint record is written to the beginning of the WAL
    5493                 :             :      * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
    5494                 :             :      * used, so that we can use 0/0 to mean "before any valid WAL segment".
    5495                 :             :      */
    5496                 :          57 :     checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD;
    5497                 :          57 :     checkPoint.ThisTimeLineID = BootstrapTimeLineID;
    5498                 :          57 :     checkPoint.PrevTimeLineID = BootstrapTimeLineID;
    5499                 :          57 :     checkPoint.fullPageWrites = fullPageWrites;
    5500                 :          57 :     checkPoint.logicalDecodingEnabled = (wal_level == WAL_LEVEL_LOGICAL);
    5501                 :          57 :     checkPoint.wal_level = wal_level;
    5502                 :             :     checkPoint.nextXid =
    5503                 :          57 :         FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
    5504                 :          57 :     checkPoint.nextOid = FirstGenbkiObjectId;
    5505                 :          57 :     checkPoint.nextMulti = FirstMultiXactId;
    5506                 :          57 :     checkPoint.nextMultiOffset = 1;
    5507                 :          57 :     checkPoint.oldestXid = FirstNormalTransactionId;
    5508                 :          57 :     checkPoint.oldestXidDB = Template1DbOid;
    5509                 :          57 :     checkPoint.oldestMulti = FirstMultiXactId;
    5510                 :          57 :     checkPoint.oldestMultiDB = Template1DbOid;
    5511                 :          57 :     checkPoint.oldestCommitTsXid = InvalidTransactionId;
    5512                 :          57 :     checkPoint.newestCommitTsXid = InvalidTransactionId;
    5513                 :          57 :     checkPoint.time = (pg_time_t) time(NULL);
    5514                 :          57 :     checkPoint.oldestActiveXid = InvalidTransactionId;
    5515                 :          57 :     checkPoint.dataChecksumState = data_checksum_version;
    5516                 :             : 
    5517                 :          57 :     TransamVariables->nextXid = checkPoint.nextXid;
    5518                 :          57 :     TransamVariables->nextOid = checkPoint.nextOid;
    5519                 :          57 :     TransamVariables->oidCount = 0;
    5520                 :          57 :     MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
    5521                 :          57 :     AdvanceOldestClogXid(checkPoint.oldestXid);
    5522                 :          57 :     SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
    5523                 :          57 :     SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
    5524                 :          57 :     SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
    5525                 :             : 
    5526                 :             :     /* Set up the XLOG page header */
    5527                 :          57 :     page->xlp_magic = XLOG_PAGE_MAGIC;
    5528                 :          57 :     page->xlp_info = XLP_LONG_HEADER;
    5529                 :          57 :     page->xlp_tli = BootstrapTimeLineID;
    5530                 :          57 :     page->xlp_pageaddr = wal_segment_size;
    5531                 :          57 :     longpage = (XLogLongPageHeader) page;
    5532                 :          57 :     longpage->xlp_sysid = sysidentifier;
    5533                 :          57 :     longpage->xlp_seg_size = wal_segment_size;
    5534                 :          57 :     longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
    5535                 :             : 
    5536                 :             :     /* Insert the initial checkpoint record */
    5537                 :          57 :     recptr = ((char *) page + SizeOfXLogLongPHD);
    5538                 :          57 :     record = (XLogRecord *) recptr;
    5539                 :          57 :     record->xl_prev = InvalidXLogRecPtr;
    5540                 :          57 :     record->xl_xid = InvalidTransactionId;
    5541                 :          57 :     record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
    5542                 :          57 :     record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
    5543                 :          57 :     record->xl_rmid = RM_XLOG_ID;
    5544                 :          57 :     recptr += SizeOfXLogRecord;
    5545                 :             :     /* fill the XLogRecordDataHeaderShort struct */
    5546                 :          57 :     *(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
    5547                 :          57 :     *(recptr++) = sizeof(checkPoint);
    5548                 :          57 :     memcpy(recptr, &checkPoint, sizeof(checkPoint));
    5549                 :          57 :     recptr += sizeof(checkPoint);
    5550                 :             :     Assert(recptr - (char *) record == record->xl_tot_len);
    5551                 :             : 
    5552                 :          57 :     INIT_CRC32C(crc);
    5553                 :          57 :     COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
    5554                 :          57 :     COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
    5555                 :          57 :     FIN_CRC32C(crc);
    5556                 :          57 :     record->xl_crc = crc;
    5557                 :             : 
    5558                 :             :     /* Create first XLOG segment file */
    5559                 :          57 :     openLogTLI = BootstrapTimeLineID;
    5560                 :          57 :     openLogFile = XLogFileInit(1, BootstrapTimeLineID);
    5561                 :             : 
    5562                 :             :     /*
    5563                 :             :      * We needn't bother with Reserve/ReleaseExternalFD here, since we'll
    5564                 :             :      * close the file again in a moment.
    5565                 :             :      */
    5566                 :             : 
    5567                 :             :     /* Write the first page with the initial record */
    5568                 :          57 :     errno = 0;
    5569                 :          57 :     pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
    5570         [ -  + ]:          57 :     if (write(openLogFile, &buffer, XLOG_BLCKSZ) != XLOG_BLCKSZ)
    5571                 :             :     {
    5572                 :             :         /* if write didn't set errno, assume problem is no disk space */
    5573         [ #  # ]:           0 :         if (errno == 0)
    5574                 :           0 :             errno = ENOSPC;
    5575         [ #  # ]:           0 :         ereport(PANIC,
    5576                 :             :                 (errcode_for_file_access(),
    5577                 :             :                  errmsg("could not write bootstrap write-ahead log file: %m")));
    5578                 :             :     }
    5579                 :          57 :     pgstat_report_wait_end();
    5580                 :             : 
    5581                 :          57 :     pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
    5582         [ -  + ]:          57 :     if (pg_fsync(openLogFile) != 0)
    5583         [ #  # ]:           0 :         ereport(PANIC,
    5584                 :             :                 (errcode_for_file_access(),
    5585                 :             :                  errmsg("could not fsync bootstrap write-ahead log file: %m")));
    5586                 :          57 :     pgstat_report_wait_end();
    5587                 :             : 
    5588         [ -  + ]:          57 :     if (close(openLogFile) != 0)
    5589         [ #  # ]:           0 :         ereport(PANIC,
    5590                 :             :                 (errcode_for_file_access(),
    5591                 :             :                  errmsg("could not close bootstrap write-ahead log file: %m")));
    5592                 :             : 
    5593                 :          57 :     openLogFile = -1;
    5594                 :             : 
    5595                 :             :     /* Now create pg_control */
    5596                 :          57 :     InitControlFile(sysidentifier, data_checksum_version);
    5597                 :          57 :     ControlFile->time = checkPoint.time;
    5598                 :          57 :     ControlFile->checkPoint = checkPoint.redo;
    5599                 :          57 :     ControlFile->checkPointCopy = checkPoint;
    5600                 :             : 
    5601                 :             :     /* some additional ControlFile fields are set in WriteControlFile() */
    5602                 :          57 :     WriteControlFile();
    5603                 :             : 
    5604                 :             :     /* Bootstrap the commit log, too */
    5605                 :          57 :     BootStrapCLOG();
    5606                 :          57 :     BootStrapCommitTs();
    5607                 :          57 :     BootStrapSUBTRANS();
    5608                 :          57 :     BootStrapMultiXact();
    5609                 :             : 
    5610                 :             :     /*
    5611                 :             :      * Force control file to be read - in contrast to normal processing we'd
    5612                 :             :      * otherwise never run the checks and GUC related initializations therein.
    5613                 :             :      */
    5614                 :          57 :     ReadControlFile();
    5615                 :          57 : }
    5616                 :             : 
    5617                 :             : static char *
    5618                 :         964 : str_time(pg_time_t tnow, char *buf, size_t bufsize)
    5619                 :             : {
    5620                 :         964 :     pg_strftime(buf, bufsize,
    5621                 :             :                 "%Y-%m-%d %H:%M:%S %Z",
    5622                 :         964 :                 pg_localtime(&tnow, log_timezone));
    5623                 :             : 
    5624                 :         964 :     return buf;
    5625                 :             : }
    5626                 :             : 
    5627                 :             : /*
    5628                 :             :  * Initialize the first WAL segment on new timeline.
    5629                 :             :  */
    5630                 :             : static void
    5631                 :          59 : XLogInitNewTimeline(TimeLineID endTLI, XLogRecPtr endOfLog, TimeLineID newTLI)
    5632                 :             : {
    5633                 :             :     char        xlogfname[MAXFNAMELEN];
    5634                 :             :     XLogSegNo   endLogSegNo;
    5635                 :             :     XLogSegNo   startLogSegNo;
    5636                 :             : 
    5637                 :             :     /* we always switch to a new timeline after archive recovery */
    5638                 :             :     Assert(endTLI != newTLI);
    5639                 :             : 
    5640                 :             :     /*
    5641                 :             :      * Update min recovery point one last time.
    5642                 :             :      */
    5643                 :          59 :     UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
    5644                 :             : 
    5645                 :             :     /*
    5646                 :             :      * Calculate the last segment on the old timeline, and the first segment
    5647                 :             :      * on the new timeline. If the switch happens in the middle of a segment,
    5648                 :             :      * they are the same, but if the switch happens exactly at a segment
    5649                 :             :      * boundary, startLogSegNo will be endLogSegNo + 1.
    5650                 :             :      */
    5651                 :          59 :     XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size);
    5652                 :          59 :     XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size);
    5653                 :             : 
    5654                 :             :     /*
    5655                 :             :      * Initialize the starting WAL segment for the new timeline. If the switch
    5656                 :             :      * happens in the middle of a segment, copy data from the last WAL segment
    5657                 :             :      * of the old timeline up to the switch point, to the starting WAL segment
    5658                 :             :      * on the new timeline.
    5659                 :             :      */
    5660         [ +  + ]:          59 :     if (endLogSegNo == startLogSegNo)
    5661                 :             :     {
    5662                 :             :         /*
    5663                 :             :          * Make a copy of the file on the new timeline.
    5664                 :             :          *
    5665                 :             :          * Writing WAL isn't allowed yet, so there are no locking
    5666                 :             :          * considerations. But we should be just as tense as XLogFileInit to
    5667                 :             :          * avoid emplacing a bogus file.
    5668                 :             :          */
    5669                 :          47 :         XLogFileCopy(newTLI, endLogSegNo, endTLI, endLogSegNo,
    5670                 :          47 :                      XLogSegmentOffset(endOfLog, wal_segment_size));
    5671                 :             :     }
    5672                 :             :     else
    5673                 :             :     {
    5674                 :             :         /*
    5675                 :             :          * The switch happened at a segment boundary, so just create the next
    5676                 :             :          * segment on the new timeline.
    5677                 :             :          */
    5678                 :             :         int         fd;
    5679                 :             : 
    5680                 :          12 :         fd = XLogFileInit(startLogSegNo, newTLI);
    5681                 :             : 
    5682         [ -  + ]:          12 :         if (close(fd) != 0)
    5683                 :             :         {
    5684                 :           0 :             int         save_errno = errno;
    5685                 :             : 
    5686                 :           0 :             XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size);
    5687                 :           0 :             errno = save_errno;
    5688         [ #  # ]:           0 :             ereport(ERROR,
    5689                 :             :                     (errcode_for_file_access(),
    5690                 :             :                      errmsg("could not close file \"%s\": %m", xlogfname)));
    5691                 :             :         }
    5692                 :             :     }
    5693                 :             : 
    5694                 :             :     /*
    5695                 :             :      * Let's just make real sure there are not .ready or .done flags posted
    5696                 :             :      * for the new segment.
    5697                 :             :      */
    5698                 :          59 :     XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size);
    5699                 :          59 :     XLogArchiveCleanup(xlogfname);
    5700                 :          59 : }
    5701                 :             : 
    5702                 :             : /*
    5703                 :             :  * Perform cleanup actions at the conclusion of archive recovery.
    5704                 :             :  */
    5705                 :             : static void
    5706                 :          59 : CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog,
    5707                 :             :                             TimeLineID newTLI)
    5708                 :             : {
    5709                 :             :     /*
    5710                 :             :      * Execute the recovery_end_command, if any.
    5711                 :             :      */
    5712   [ +  -  +  + ]:          59 :     if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0)
    5713                 :           2 :         ExecuteRecoveryCommand(recoveryEndCommand,
    5714                 :             :                                "recovery_end_command",
    5715                 :             :                                true,
    5716                 :             :                                WAIT_EVENT_RECOVERY_END_COMMAND);
    5717                 :             : 
    5718                 :             :     /*
    5719                 :             :      * We switched to a new timeline. Clean up segments on the old timeline.
    5720                 :             :      *
    5721                 :             :      * If there are any higher-numbered segments on the old timeline, remove
    5722                 :             :      * them. They might contain valid WAL, but they might also be
    5723                 :             :      * pre-allocated files containing garbage. In any case, they are not part
    5724                 :             :      * of the new timeline's history so we don't need them.
    5725                 :             :      */
    5726                 :          59 :     RemoveNonParentXlogFiles(EndOfLog, newTLI);
    5727                 :             : 
    5728                 :             :     /*
    5729                 :             :      * If the switch happened in the middle of a segment, what to do with the
    5730                 :             :      * last, partial segment on the old timeline? If we don't archive it, and
    5731                 :             :      * the server that created the WAL never archives it either (e.g. because
    5732                 :             :      * it was hit by a meteor), it will never make it to the archive. That's
    5733                 :             :      * OK from our point of view, because the new segment that we created with
    5734                 :             :      * the new TLI contains all the WAL from the old timeline up to the switch
    5735                 :             :      * point. But if you later try to do PITR to the "missing" WAL on the old
    5736                 :             :      * timeline, recovery won't find it in the archive. It's physically
    5737                 :             :      * present in the new file with new TLI, but recovery won't look there
    5738                 :             :      * when it's recovering to the older timeline. On the other hand, if we
    5739                 :             :      * archive the partial segment, and the original server on that timeline
    5740                 :             :      * is still running and archives the completed version of the same segment
    5741                 :             :      * later, it will fail. (We used to do that in 9.4 and below, and it
    5742                 :             :      * caused such problems).
    5743                 :             :      *
    5744                 :             :      * As a compromise, we rename the last segment with the .partial suffix,
    5745                 :             :      * and archive it. Archive recovery will never try to read .partial
    5746                 :             :      * segments, so they will normally go unused. But in the odd PITR case,
    5747                 :             :      * the administrator can copy them manually to the pg_wal directory
    5748                 :             :      * (removing the suffix). They can be useful in debugging, too.
    5749                 :             :      *
    5750                 :             :      * If a .done or .ready file already exists for the old timeline, however,
    5751                 :             :      * we had already determined that the segment is complete, so we can let
    5752                 :             :      * it be archived normally. (In particular, if it was restored from the
    5753                 :             :      * archive to begin with, it's expected to have a .done file).
    5754                 :             :      */
    5755   [ +  +  +  + ]:          59 :     if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
    5756                 :             :         XLogArchivingActive())
    5757                 :             :     {
    5758                 :             :         char        origfname[MAXFNAMELEN];
    5759                 :             :         XLogSegNo   endLogSegNo;
    5760                 :             : 
    5761                 :          10 :         XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
    5762                 :          10 :         XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
    5763                 :             : 
    5764         [ +  + ]:          10 :         if (!XLogArchiveIsReadyOrDone(origfname))
    5765                 :             :         {
    5766                 :             :             char        origpath[MAXPGPATH];
    5767                 :             :             char        partialfname[MAXFNAMELEN];
    5768                 :             :             char        partialpath[MAXPGPATH];
    5769                 :             : 
    5770                 :             :             /*
    5771                 :             :              * If we're summarizing WAL, we can't rename the partial file
    5772                 :             :              * until the summarizer finishes with it, else it will fail.
    5773                 :             :              */
    5774         [ +  + ]:           6 :             if (summarize_wal)
    5775                 :           1 :                 WaitForWalSummarization(EndOfLog);
    5776                 :             : 
    5777                 :           6 :             XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
    5778                 :           6 :             snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
    5779                 :           6 :             snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
    5780                 :             : 
    5781                 :             :             /*
    5782                 :             :              * Make sure there's no .done or .ready file for the .partial
    5783                 :             :              * file.
    5784                 :             :              */
    5785                 :           6 :             XLogArchiveCleanup(partialfname);
    5786                 :             : 
    5787                 :           6 :             durable_rename(origpath, partialpath, ERROR);
    5788                 :           6 :             XLogArchiveNotify(partialfname);
    5789                 :             :         }
    5790                 :             :     }
    5791                 :          59 : }
    5792                 :             : 
    5793                 :             : /*
    5794                 :             :  * Check to see if required parameters are set high enough on this server
    5795                 :             :  * for various aspects of recovery operation.
    5796                 :             :  *
    5797                 :             :  * Note that all the parameters which this function tests need to be
    5798                 :             :  * listed in Administrator's Overview section in high-availability.sgml.
    5799                 :             :  * If you change them, don't forget to update the list.
    5800                 :             :  */
    5801                 :             : static void
    5802                 :         271 : CheckRequiredParameterValues(void)
    5803                 :             : {
    5804                 :             :     /*
    5805                 :             :      * For archive recovery, the WAL must be generated with at least 'replica'
    5806                 :             :      * wal_level.
    5807                 :             :      */
    5808   [ +  +  +  + ]:         271 :     if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
    5809                 :             :     {
    5810         [ +  - ]:           2 :         ereport(FATAL,
    5811                 :             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    5812                 :             :                  errmsg("WAL was generated with \"wal_level=minimal\", cannot continue recovering"),
    5813                 :             :                  errdetail("This happens if you temporarily set \"wal_level=minimal\" on the server."),
    5814                 :             :                  errhint("Use a backup taken after setting \"wal_level\" to higher than \"minimal\".")));
    5815                 :             :     }
    5816                 :             : 
    5817                 :             :     /*
    5818                 :             :      * For Hot Standby, the WAL must be generated with 'replica' mode, and we
    5819                 :             :      * must have at least as many backend slots as the primary.
    5820                 :             :      */
    5821   [ +  +  +  + ]:         269 :     if (ArchiveRecoveryRequested && EnableHotStandby)
    5822                 :             :     {
    5823                 :             :         /* We ignore autovacuum_worker_slots when we make this test. */
    5824                 :         145 :         RecoveryRequiresIntParameter("max_connections",
    5825                 :             :                                      MaxConnections,
    5826                 :         145 :                                      ControlFile->MaxConnections);
    5827                 :         145 :         RecoveryRequiresIntParameter("max_worker_processes",
    5828                 :             :                                      max_worker_processes,
    5829                 :         145 :                                      ControlFile->max_worker_processes);
    5830                 :         145 :         RecoveryRequiresIntParameter("max_wal_senders",
    5831                 :             :                                      max_wal_senders,
    5832                 :         145 :                                      ControlFile->max_wal_senders);
    5833                 :         145 :         RecoveryRequiresIntParameter("max_prepared_transactions",
    5834                 :             :                                      max_prepared_xacts,
    5835                 :         145 :                                      ControlFile->max_prepared_xacts);
    5836                 :         145 :         RecoveryRequiresIntParameter("max_locks_per_transaction",
    5837                 :             :                                      max_locks_per_xact,
    5838                 :         145 :                                      ControlFile->max_locks_per_xact);
    5839                 :             :     }
    5840                 :         269 : }
    5841                 :             : 
    5842                 :             : /*
    5843                 :             :  * This must be called ONCE during postmaster or standalone-backend startup
    5844                 :             :  */
    5845                 :             : void
    5846                 :        1088 : StartupXLOG(void)
    5847                 :             : {
    5848                 :             :     XLogCtlInsert *Insert;
    5849                 :             :     CheckPoint  checkPoint;
    5850                 :             :     bool        wasShutdown;
    5851                 :             :     bool        didCrash;
    5852                 :             :     bool        haveTblspcMap;
    5853                 :             :     bool        haveBackupLabel;
    5854                 :             :     XLogRecPtr  EndOfLog;
    5855                 :             :     TimeLineID  EndOfLogTLI;
    5856                 :             :     TimeLineID  newTLI;
    5857                 :             :     bool        performedWalRecovery;
    5858                 :             :     EndOfWalRecoveryInfo *endOfRecoveryInfo;
    5859                 :             :     XLogRecPtr  abortedRecPtr;
    5860                 :             :     XLogRecPtr  missingContrecPtr;
    5861                 :             :     TransactionId oldestActiveXID;
    5862                 :        1088 :     bool        promoted = false;
    5863                 :             :     char        timebuf[128];
    5864                 :             : 
    5865                 :             :     /*
    5866                 :             :      * We should have an aux process resource owner to use, and we should not
    5867                 :             :      * be in a transaction that's installed some other resowner.
    5868                 :             :      */
    5869                 :             :     Assert(AuxProcessResourceOwner != NULL);
    5870                 :             :     Assert(CurrentResourceOwner == NULL ||
    5871                 :             :            CurrentResourceOwner == AuxProcessResourceOwner);
    5872                 :        1088 :     CurrentResourceOwner = AuxProcessResourceOwner;
    5873                 :             : 
    5874                 :             :     /*
    5875                 :             :      * Check that contents look valid.
    5876                 :             :      */
    5877         [ -  + ]:        1088 :     if (!XRecOffIsValid(ControlFile->checkPoint))
    5878         [ #  # ]:           0 :         ereport(FATAL,
    5879                 :             :                 (errcode(ERRCODE_DATA_CORRUPTED),
    5880                 :             :                  errmsg("control file contains invalid checkpoint location")));
    5881                 :             : 
    5882   [ +  +  -  -  :        1088 :     switch (ControlFile->state)
                +  +  - ]
    5883                 :             :     {
    5884                 :         860 :         case DB_SHUTDOWNED:
    5885                 :             : 
    5886                 :             :             /*
    5887                 :             :              * This is the expected case, so don't be chatty in standalone
    5888                 :             :              * mode
    5889                 :             :              */
    5890   [ +  +  +  + ]:         860 :             ereport(IsPostmasterEnvironment ? LOG : NOTICE,
    5891                 :             :                     (errmsg("database system was shut down at %s",
    5892                 :             :                             str_time(ControlFile->time,
    5893                 :             :                                      timebuf, sizeof(timebuf)))));
    5894                 :         860 :             break;
    5895                 :             : 
    5896                 :          34 :         case DB_SHUTDOWNED_IN_RECOVERY:
    5897         [ +  - ]:          34 :             ereport(LOG,
    5898                 :             :                     (errmsg("database system was shut down in recovery at %s",
    5899                 :             :                             str_time(ControlFile->time,
    5900                 :             :                                      timebuf, sizeof(timebuf)))));
    5901                 :          34 :             break;
    5902                 :             : 
    5903                 :           0 :         case DB_SHUTDOWNING:
    5904         [ #  # ]:           0 :             ereport(LOG,
    5905                 :             :                     (errmsg("database system shutdown was interrupted; last known up at %s",
    5906                 :             :                             str_time(ControlFile->time,
    5907                 :             :                                      timebuf, sizeof(timebuf)))));
    5908                 :           0 :             break;
    5909                 :             : 
    5910                 :           0 :         case DB_IN_CRASH_RECOVERY:
    5911         [ #  # ]:           0 :             ereport(LOG,
    5912                 :             :                     (errmsg("database system was interrupted while in recovery at %s",
    5913                 :             :                             str_time(ControlFile->time,
    5914                 :             :                                      timebuf, sizeof(timebuf))),
    5915                 :             :                      errhint("This probably means that some data is corrupted and"
    5916                 :             :                              " you will have to use the last backup for recovery.")));
    5917                 :           0 :             break;
    5918                 :             : 
    5919                 :           9 :         case DB_IN_ARCHIVE_RECOVERY:
    5920         [ +  - ]:           9 :             ereport(LOG,
    5921                 :             :                     (errmsg("database system was interrupted while in recovery at log time %s",
    5922                 :             :                             str_time(ControlFile->checkPointCopy.time,
    5923                 :             :                                      timebuf, sizeof(timebuf))),
    5924                 :             :                      errhint("If this has occurred more than once some data might be corrupted"
    5925                 :             :                              " and you might need to choose an earlier recovery target.")));
    5926                 :           9 :             break;
    5927                 :             : 
    5928                 :         185 :         case DB_IN_PRODUCTION:
    5929         [ +  - ]:         185 :             ereport(LOG,
    5930                 :             :                     (errmsg("database system was interrupted; last known up at %s",
    5931                 :             :                             str_time(ControlFile->time,
    5932                 :             :                                      timebuf, sizeof(timebuf)))));
    5933                 :         185 :             break;
    5934                 :             : 
    5935                 :           0 :         default:
    5936         [ #  # ]:           0 :             ereport(FATAL,
    5937                 :             :                     (errcode(ERRCODE_DATA_CORRUPTED),
    5938                 :             :                      errmsg("control file contains invalid database cluster state")));
    5939                 :             :     }
    5940                 :             : 
    5941                 :             :     /* This is just to allow attaching to startup process with a debugger */
    5942                 :             : #ifdef XLOG_REPLAY_DELAY
    5943                 :             :     if (ControlFile->state != DB_SHUTDOWNED)
    5944                 :             :         pg_usleep(60000000L);
    5945                 :             : #endif
    5946                 :             : 
    5947                 :             :     /*
    5948                 :             :      * Verify that pg_wal, pg_wal/archive_status, and pg_wal/summaries exist.
    5949                 :             :      * In cases where someone has performed a copy for PITR, these directories
    5950                 :             :      * may have been excluded and need to be re-created.
    5951                 :             :      */
    5952                 :        1088 :     ValidateXLOGDirectoryStructure();
    5953                 :             : 
    5954                 :             :     /* Set up timeout handler needed to report startup progress. */
    5955         [ +  + ]:        1088 :     if (!IsBootstrapProcessingMode())
    5956                 :        1031 :         RegisterTimeout(STARTUP_PROGRESS_TIMEOUT,
    5957                 :             :                         startup_progress_timeout_handler);
    5958                 :             : 
    5959                 :             :     /*----------
    5960                 :             :      * If we previously crashed, perform a couple of actions:
    5961                 :             :      *
    5962                 :             :      * - The pg_wal directory may still include some temporary WAL segments
    5963                 :             :      *   used when creating a new segment, so perform some clean up to not
    5964                 :             :      *   bloat this path.  This is done first as there is no point to sync
    5965                 :             :      *   this temporary data.
    5966                 :             :      *
    5967                 :             :      * - There might be data which we had written, intending to fsync it, but
    5968                 :             :      *   which we had not actually fsync'd yet.  Therefore, a power failure in
    5969                 :             :      *   the near future might cause earlier unflushed writes to be lost, even
    5970                 :             :      *   though more recent data written to disk from here on would be
    5971                 :             :      *   persisted.  To avoid that, fsync the entire data directory.
    5972                 :             :      */
    5973         [ +  + ]:        1088 :     if (ControlFile->state != DB_SHUTDOWNED &&
    5974         [ +  + ]:         228 :         ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
    5975                 :             :     {
    5976                 :         194 :         RemoveTempXlogFiles();
    5977                 :         194 :         SyncDataDirectory();
    5978                 :         194 :         didCrash = true;
    5979                 :             :     }
    5980                 :             :     else
    5981                 :         894 :         didCrash = false;
    5982                 :             : 
    5983                 :             :     /*
    5984                 :             :      * Prepare for WAL recovery if needed.
    5985                 :             :      *
    5986                 :             :      * InitWalRecovery analyzes the control file and the backup label file, if
    5987                 :             :      * any.  It updates the in-memory ControlFile buffer according to the
    5988                 :             :      * starting checkpoint, and sets InRecovery and ArchiveRecoveryRequested.
    5989                 :             :      * It also applies the tablespace map file, if any.
    5990                 :             :      */
    5991                 :        1088 :     InitWalRecovery(ControlFile, &wasShutdown,
    5992                 :             :                     &haveBackupLabel, &haveTblspcMap);
    5993                 :        1086 :     checkPoint = ControlFile->checkPointCopy;
    5994                 :             : 
    5995                 :             :     /* initialize shared memory variables from the checkpoint record */
    5996                 :        1086 :     TransamVariables->nextXid = checkPoint.nextXid;
    5997                 :        1086 :     TransamVariables->nextOid = checkPoint.nextOid;
    5998                 :        1086 :     TransamVariables->oidCount = 0;
    5999                 :        1086 :     MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
    6000                 :        1086 :     AdvanceOldestClogXid(checkPoint.oldestXid);
    6001                 :        1086 :     SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
    6002                 :        1086 :     SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
    6003                 :        1086 :     SetCommitTsLimit(checkPoint.oldestCommitTsXid,
    6004                 :             :                      checkPoint.newestCommitTsXid);
    6005                 :             : 
    6006                 :             :     /*
    6007                 :             :      * Clear out any old relcache cache files.  This is *necessary* if we do
    6008                 :             :      * any WAL replay, since that would probably result in the cache files
    6009                 :             :      * being out of sync with database reality.  In theory we could leave them
    6010                 :             :      * in place if the database had been cleanly shut down, but it seems
    6011                 :             :      * safest to just remove them always and let them be rebuilt during the
    6012                 :             :      * first backend startup.  These files need to be removed from all
    6013                 :             :      * directories including pg_tblspc; however, the symlinks are created only
    6014                 :             :      * after reading the tablespace_map file in case of archive recovery from
    6015                 :             :      * a backup, so we need to clear old relcache files here, after creating
    6016                 :             :      * the symlinks.
    6017                 :             :      */
    6018                 :        1086 :     RelationCacheInitFileRemove();
    6019                 :             : 
    6020                 :             :     /*
    6021                 :             :      * Initialize replication slots, before there's a chance to remove
    6022                 :             :      * required resources.
    6023                 :             :      */
    6024                 :        1086 :     StartupReplicationSlots();
    6025                 :             : 
    6026                 :             :     /*
    6027                 :             :      * Startup the logical decoding status with the last status stored in the
    6028                 :             :      * checkpoint record.
    6029                 :             :      */
    6030                 :        1084 :     StartupLogicalDecodingStatus(checkPoint.logicalDecodingEnabled);
    6031                 :             : 
    6032                 :             :     /*
    6033                 :             :      * Start up logical state; it needs to be set up now so we have proper data
    6034                 :             :      * during crash recovery.
    6035                 :             :      */
    6036                 :        1084 :     StartupReorderBuffer();
    6037                 :             : 
    6038                 :             :     /*
    6039                 :             :      * Startup CLOG. This must be done after TransamVariables->nextXid has
    6040                 :             :      * been initialized and before we accept connections or begin WAL replay.
    6041                 :             :      */
    6042                 :        1084 :     StartupCLOG();
    6043                 :             : 
    6044                 :             :     /*
    6045                 :             :      * Startup MultiXact. We need to do this early to be able to replay
    6046                 :             :      * truncations.
    6047                 :             :      */
    6048                 :        1084 :     StartupMultiXact();
    6049                 :             : 
    6050                 :             :     /*
    6051                 :             :      * Ditto for commit timestamps.  Activate the facility if the setting is
    6052                 :             :      * enabled in the control file, as there should be no tracking of commit
    6053                 :             :      * timestamps done when the setting was disabled.  This facility can be
    6054                 :             :      * started or stopped when replaying a XLOG_PARAMETER_CHANGE record.
    6055                 :             :      */
    6056         [ +  + ]:        1084 :     if (ControlFile->track_commit_timestamp)
    6057                 :          14 :         StartupCommitTs();
    6058                 :             : 
    6059                 :             :     /*
    6060                 :             :      * Recover knowledge about replay progress of known replication partners.
    6061                 :             :      */
    6062                 :        1084 :     StartupReplicationOrigin();
    6063                 :             : 
    6064                 :             :     /*
    6065                 :             :      * Initialize unlogged LSN. On a clean shutdown, it's restored from the
    6066                 :             :      * control file. On recovery, all unlogged relations are blown away, so
    6067                 :             :      * the unlogged LSN counter can be reset too.
    6068                 :             :      */
    6069         [ +  + ]:        1084 :     if (ControlFile->state == DB_SHUTDOWNED)
    6070                 :         851 :         pg_atomic_write_membarrier_u64(&XLogCtl->unloggedLSN,
    6071                 :         851 :                                        ControlFile->unloggedLSN);
    6072                 :             :     else
    6073                 :         233 :         pg_atomic_write_membarrier_u64(&XLogCtl->unloggedLSN,
    6074                 :             :                                        FirstNormalUnloggedLSN);
    6075                 :             : 
    6076                 :             :     /*
    6077                 :             :      * Copy any missing timeline history files between 'now' and the recovery
    6078                 :             :      * target timeline from archive to pg_wal. While we don't need those files
    6079                 :             :      * ourselves - the history file of the recovery target timeline covers all
    6080                 :             :      * the previous timelines in the history too - a cascading standby server
    6081                 :             :      * might be interested in them. Or, if you archive the WAL from this
    6082                 :             :      * server to a different archive than the primary, it'd be good for all
    6083                 :             :      * the history files to get archived there after failover, so that you can
    6084                 :             :      * use one of the old timelines as a PITR target. Timeline history files
    6085                 :             :      * are small, so it's better to copy them unnecessarily than to not copy
    6086                 :             :      * them and regret it later.
    6087                 :             :      */
    6088                 :        1084 :     restoreTimeLineHistoryFiles(checkPoint.ThisTimeLineID, recoveryTargetTLI);
    6089                 :             : 
    6090                 :             :     /*
    6091                 :             :      * Before running in recovery, scan pg_twophase and fill in its status to
    6092                 :             :      * be able to work on entries generated by redo.  Doing a scan before
    6093                 :             :      * taking any recovery action has the merit of discarding any 2PC files
    6094                 :             :      * that are newer than the first record to replay, avoiding any conflicts
    6095                 :             :      * at replay.  This also avoids any subsequent scans when doing recovery
    6096                 :             :      * of the on-disk two-phase data.
    6097                 :             :      */
    6098                 :        1084 :     restoreTwoPhaseData();
    6099                 :             : 
    6100                 :             :     /*
    6101                 :             :      * When starting with crash recovery, reset pgstat data - it might not be
    6102                 :             :      * valid. Otherwise restore pgstat data. It's safe to do this here,
    6103                 :             :      * because postmaster will not yet have started any other processes.
    6104                 :             :      *
    6105                 :             :      * NB: Restoring replication slot stats relies on slot state to have
    6106                 :             :      * already been restored from disk.
    6107                 :             :      *
    6108                 :             :      * TODO: With a bit of extra work we could just start with a pgstat file
    6109                 :             :      * associated with the checkpoint redo location we're starting from.
    6110                 :             :      */
    6111         [ +  + ]:        1084 :     if (didCrash)
    6112                 :         192 :         pgstat_discard_stats();
    6113                 :             :     else
    6114                 :         892 :         pgstat_restore_stats();
    6115                 :             : 
    6116                 :        1084 :     lastFullPageWrites = checkPoint.fullPageWrites;
    6117                 :             : 
    6118                 :        1084 :     RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
    6119                 :        1084 :     doPageWrites = lastFullPageWrites;
    6120                 :             : 
    6121                 :             :     /* REDO */
    6122         [ +  + ]:        1084 :     if (InRecovery)
    6123                 :             :     {
    6124                 :             :         /* Initialize state for RecoveryInProgress() */
    6125                 :         233 :         SpinLockAcquire(&XLogCtl->info_lck);
    6126         [ +  + ]:         233 :         if (InArchiveRecovery)
    6127                 :         131 :             XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
    6128                 :             :         else
    6129                 :         102 :             XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
    6130                 :         233 :         SpinLockRelease(&XLogCtl->info_lck);
    6131                 :             : 
    6132                 :             :         /*
    6133                 :             :          * Update pg_control to show that we are recovering and to show the
    6134                 :             :          * selected checkpoint as the place we are starting from. We also mark
    6135                 :             :          * pg_control with any minimum recovery stop point obtained from a
    6136                 :             :          * backup history file.
    6137                 :             :          *
    6138                 :             :          * No need to hold ControlFileLock yet, we aren't up far enough.
    6139                 :             :          */
    6140                 :         233 :         UpdateControlFile();
    6141                 :             : 
    6142                 :             :         /*
    6143                 :             :          * If there was a backup label file, it's done its job and the info
    6144                 :             :          * has now been propagated into pg_control.  We must get rid of the
    6145                 :             :          * label file so that if we crash during recovery, we'll pick up at
    6146                 :             :          * the latest recovery restartpoint instead of going all the way back
    6147                 :             :          * to the backup start point.  It seems prudent though to just rename
    6148                 :             :          * the file out of the way rather than delete it completely.
    6149                 :             :          */
    6150         [ +  + ]:         233 :         if (haveBackupLabel)
    6151                 :             :         {
    6152                 :          88 :             unlink(BACKUP_LABEL_OLD);
    6153                 :          88 :             durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
    6154                 :             :         }
    6155                 :             : 
    6156                 :             :         /*
    6157                 :             :          * If there was a tablespace_map file, it's done its job and the
    6158                 :             :          * symlinks have been created.  We must get rid of the map file so
    6159                 :             :          * that if we crash during recovery, we don't create symlinks again.
    6160                 :             :          * It seems prudent though to just rename the file out of the way
    6161                 :             :          * rather than delete it completely.
    6162                 :             :          */
    6163         [ +  + ]:         233 :         if (haveTblspcMap)
    6164                 :             :         {
    6165                 :           2 :             unlink(TABLESPACE_MAP_OLD);
    6166                 :           2 :             durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
    6167                 :             :         }
    6168                 :             : 
    6169                 :             :         /*
    6170                 :             :          * Initialize our local copy of minRecoveryPoint.  When doing crash
    6171                 :             :          * recovery we want to replay up to the end of WAL.  Particularly, in
    6172                 :             :          * the case of a promoted standby minRecoveryPoint value in the
    6173                 :             :          * control file is only updated after the first checkpoint.  However,
    6174                 :             :          * if the instance crashes before the first post-recovery checkpoint
    6175                 :             :          * is completed then recovery will use a stale location causing the
    6176                 :             :          * startup process to think that there are still invalid page
    6177                 :             :          * references when checking for data consistency.
    6178                 :             :          */
    6179         [ +  + ]:         233 :         if (InArchiveRecovery)
    6180                 :             :         {
    6181                 :         131 :             LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    6182                 :         131 :             LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    6183                 :             :         }
    6184                 :             :         else
    6185                 :             :         {
    6186                 :         102 :             LocalMinRecoveryPoint = InvalidXLogRecPtr;
    6187                 :         102 :             LocalMinRecoveryPointTLI = 0;
    6188                 :             :         }
    6189                 :             : 
    6190                 :             :         /* Check that the GUCs used to generate the WAL allow recovery */
    6191                 :         233 :         CheckRequiredParameterValues();
    6192                 :             : 
    6193                 :             :         /*
    6194                 :             :          * We're in recovery, so unlogged relations may be trashed and must be
    6195                 :             :          * reset.  This should be done BEFORE allowing Hot Standby
    6196                 :             :          * connections, so that read-only backends don't try to read whatever
    6197                 :             :          * garbage is left over from before.
    6198                 :             :          */
    6199                 :         233 :         ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
    6200                 :             : 
    6201                 :             :         /*
    6202                 :             :          * Likewise, delete any saved transaction snapshot files that got left
    6203                 :             :          * behind by crashed backends.
    6204                 :             :          */
    6205                 :         233 :         DeleteAllExportedSnapshotFiles();
    6206                 :             : 
    6207                 :             :         /*
    6208                 :             :          * Initialize for Hot Standby, if enabled. We won't let backends in
    6209                 :             :          * yet, not until we've reached the min recovery point specified in
    6210                 :             :          * control file and we've established a recovery snapshot from a
    6211                 :             :          * running-xacts WAL record.
    6212                 :             :          */
    6213   [ +  +  +  + ]:         233 :         if (ArchiveRecoveryRequested && EnableHotStandby)
    6214                 :             :         {
    6215                 :             :             TransactionId *xids;
    6216                 :             :             int         nxids;
    6217                 :             : 
    6218         [ +  + ]:         123 :             ereport(DEBUG1,
    6219                 :             :                     (errmsg_internal("initializing for hot standby")));
    6220                 :             : 
    6221                 :         123 :             InitRecoveryTransactionEnvironment();
    6222                 :             : 
    6223         [ +  + ]:         123 :             if (wasShutdown)
    6224                 :          26 :                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
    6225                 :             :             else
    6226                 :          97 :                 oldestActiveXID = checkPoint.oldestActiveXid;
    6227                 :             :             Assert(TransactionIdIsValid(oldestActiveXID));
    6228                 :             : 
    6229                 :             :             /* Tell procarray about the range of xids it has to deal with */
    6230                 :         123 :             ProcArrayInitRecovery(XidFromFullTransactionId(TransamVariables->nextXid));
    6231                 :             : 
    6232                 :             :             /*
    6233                 :             :              * Startup subtrans only.  CLOG, MultiXact and commit timestamp
    6234                 :             :              * have already been started up and other SLRUs are not maintained
    6235                 :             :              * during recovery and need not be started yet.
    6236                 :             :              */
    6237                 :         123 :             StartupSUBTRANS(oldestActiveXID);
    6238                 :             : 
    6239                 :             :             /*
    6240                 :             :              * If we're beginning at a shutdown checkpoint, we know that
    6241                 :             :              * nothing was running on the primary at this point. So fake-up an
    6242                 :             :              * empty running-xacts record and use that here and now. Recover
    6243                 :             :              * additional standby state for prepared transactions.
    6244                 :             :              */
    6245         [ +  + ]:         123 :             if (wasShutdown)
    6246                 :             :             {
    6247                 :             :                 RunningTransactionsData running;
    6248                 :             :                 TransactionId latestCompletedXid;
    6249                 :             : 
    6250                 :             :                 /* Update pg_subtrans entries for any prepared transactions */
    6251                 :          26 :                 StandbyRecoverPreparedTransactions();
    6252                 :             : 
    6253                 :             :                 /*
    6254                 :             :                  * Construct a RunningTransactions snapshot representing a
    6255                 :             :                  * shut down server, with only prepared transactions still
    6256                 :             :                  * alive. We're never overflowed at this point because all
    6257                 :             :                  * subxids are listed with their parent prepared transactions.
    6258                 :             :                  */
    6259                 :          26 :                 running.xcnt = nxids;
    6260                 :          26 :                 running.subxcnt = 0;
    6261                 :          26 :                 running.subxid_status = SUBXIDS_IN_SUBTRANS;
    6262                 :          26 :                 running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
    6263                 :          26 :                 running.oldestRunningXid = oldestActiveXID;
    6264                 :          26 :                 latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
    6265         [ -  + ]:          26 :                 TransactionIdRetreat(latestCompletedXid);
    6266                 :             :                 Assert(TransactionIdIsNormal(latestCompletedXid));
    6267                 :          26 :                 running.latestCompletedXid = latestCompletedXid;
    6268                 :          26 :                 running.xids = xids;
    6269                 :             : 
    6270                 :          26 :                 ProcArrayApplyRecoveryInfo(&running);
    6271                 :             :             }
    6272                 :             :         }
    6273                 :             : 
    6274                 :             :         /*
    6275                 :             :          * We're all set for replaying the WAL now. Do it.
    6276                 :             :          */
    6277                 :         233 :         PerformWalRecovery();
    6278                 :         167 :         performedWalRecovery = true;
    6279                 :             :     }
    6280                 :             :     else
    6281                 :         851 :         performedWalRecovery = false;
    6282                 :             : 
    6283                 :             :     /*
    6284                 :             :      * Finish WAL recovery.
    6285                 :             :      */
    6286                 :        1018 :     endOfRecoveryInfo = FinishWalRecovery();
    6287                 :        1018 :     EndOfLog = endOfRecoveryInfo->endOfLog;
    6288                 :        1018 :     EndOfLogTLI = endOfRecoveryInfo->endOfLogTLI;
    6289                 :        1018 :     abortedRecPtr = endOfRecoveryInfo->abortedRecPtr;
    6290                 :        1018 :     missingContrecPtr = endOfRecoveryInfo->missingContrecPtr;
    6291                 :             : 
    6292                 :             :     /*
    6293                 :             :      * Reset ps status display, so that no information related to recovery
    6294                 :             :      * shows up.
    6295                 :             :      */
    6296                 :        1018 :     set_ps_display("");
    6297                 :             : 
    6298                 :             :     /*
    6299                 :             :      * When recovering from a backup (we are in recovery, and archive recovery
    6300                 :             :      * was requested), complain if we did not roll forward far enough to reach
    6301                 :             :      * the point where the database is consistent.  For regular online
    6302                 :             :      * backup-from-primary, that means reaching the end-of-backup WAL record
    6303                 :             :      * (at which point we reset backupStartPoint to be Invalid), for
    6304                 :             :      * backup-from-replica (which can't inject records into the WAL stream),
    6305                 :             :      * that point is when we reach the minRecoveryPoint in pg_control (which
    6306                 :             :      * we purposefully copy last when backing up from a replica).  For
    6307                 :             :      * pg_rewind (which creates a backup_label with a method of "pg_rewind")
    6308                 :             :      * or snapshot-style backups (which don't), backupEndRequired will be set
    6309                 :             :      * to false.
    6310                 :             :      *
    6311                 :             :      * Note: it is indeed okay to look at the local variable
    6312                 :             :      * LocalMinRecoveryPoint here, even though ControlFile->minRecoveryPoint
    6313                 :             :      * might be further ahead --- ControlFile->minRecoveryPoint cannot have
    6314                 :             :      * been advanced beyond the WAL we processed.
    6315                 :             :      */
    6316         [ +  + ]:        1018 :     if (InRecovery &&
    6317         [ +  - ]:         167 :         (EndOfLog < LocalMinRecoveryPoint ||
    6318         [ -  + ]:         167 :          XLogRecPtrIsValid(ControlFile->backupStartPoint)))
    6319                 :             :     {
    6320                 :             :         /*
    6321                 :             :          * Ran off end of WAL before reaching end-of-backup WAL record, or
    6322                 :             :          * minRecoveryPoint. That's a bad sign, indicating that you tried to
    6323                 :             :          * recover from an online backup but never called pg_backup_stop(), or
    6324                 :             :          * you didn't archive all the WAL needed.
    6325                 :             :          */
    6326   [ #  #  #  # ]:           0 :         if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
    6327                 :             :         {
    6328   [ #  #  #  # ]:           0 :             if (XLogRecPtrIsValid(ControlFile->backupStartPoint) || ControlFile->backupEndRequired)
    6329         [ #  # ]:           0 :                 ereport(FATAL,
    6330                 :             :                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    6331                 :             :                          errmsg("WAL ends before end of online backup"),
    6332                 :             :                          errhint("All WAL generated while online backup was taken must be available at recovery.")));
    6333                 :             :             else
    6334         [ #  # ]:           0 :                 ereport(FATAL,
    6335                 :             :                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    6336                 :             :                          errmsg("WAL ends before consistent recovery point")));
    6337                 :             :         }
    6338                 :             :     }
    6339                 :             : 
    6340                 :             :     /*
    6341                 :             :      * Reset unlogged relations to the contents of their INIT fork. This is
    6342                 :             :      * done AFTER recovery is complete so as to include any unlogged relations
    6343                 :             :      * created during recovery, but BEFORE recovery is marked as having
    6344                 :             :      * completed successfully. Otherwise we'd not retry if any of the post
    6345                 :             :      * end-of-recovery steps fail.
    6346                 :             :      */
    6347         [ +  + ]:        1018 :     if (InRecovery)
    6348                 :         167 :         ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
    6349                 :             : 
    6350                 :             :     /*
    6351                 :             :      * Pre-scan prepared transactions to find out the range of XIDs present.
    6352                 :             :      * This information is not quite needed yet, but it is positioned here so
    6353                 :             :      * that potential problems are detected before any on-disk change is done.
    6354                 :             :      */
    6355                 :        1018 :     oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
    6356                 :             : 
    6357                 :             :     /*
    6358                 :             :      * Allow ordinary WAL segment creation before possibly switching to a new
    6359                 :             :      * timeline, which creates a new segment, and after the last ReadRecord().
    6360                 :             :      */
    6361                 :        1018 :     SetInstallXLogFileSegmentActive();
    6362                 :             : 
    6363                 :             :     /*
    6364                 :             :      * Consider whether we need to assign a new timeline ID.
    6365                 :             :      *
    6366                 :             :      * If we did archive recovery, we always assign a new ID.  This handles a
    6367                 :             :      * couple of issues.  If we stopped short of the end of WAL during
    6368                 :             :      * recovery, then we are clearly generating a new timeline and must assign
    6369                 :             :      * it a unique new ID.  Even if we ran to the end, modifying the current
    6370                 :             :      * last segment is problematic because it may result in trying to
    6371                 :             :      * overwrite an already-archived copy of that segment, and we encourage
    6372                 :             :      * DBAs to make their archive_commands reject that.  We can dodge the
    6373                 :             :      * problem by making the new active segment have a new timeline ID.
    6374                 :             :      *
    6375                 :             :      * In a normal crash recovery, we can just extend the timeline we were in.
    6376                 :             :      */
    6377                 :        1018 :     newTLI = endOfRecoveryInfo->lastRecTLI;
    6378         [ +  + ]:        1018 :     if (ArchiveRecoveryRequested)
    6379                 :             :     {
    6380                 :          59 :         newTLI = findNewestTimeLine(recoveryTargetTLI) + 1;
    6381         [ +  - ]:          59 :         ereport(LOG,
    6382                 :             :                 (errmsg("selected new timeline ID: %u", newTLI)));
    6383                 :             : 
    6384                 :             :         /*
    6385                 :             :          * Make a writable copy of the last WAL segment.  (Note that we also
    6386                 :             :          * have a copy of the last block of the old WAL in
    6387                 :             :          * endOfRecoveryInfo->lastPage; we will use that below.)
    6388                 :             :          */
    6389                 :          59 :         XLogInitNewTimeline(EndOfLogTLI, EndOfLog, newTLI);
    6390                 :             : 
    6391                 :             :         /*
    6392                 :             :          * Remove the signal files out of the way, so that we don't
    6393                 :             :          * accidentally re-enter archive recovery mode in a subsequent crash.
    6394                 :             :          */
    6395         [ +  + ]:          59 :         if (endOfRecoveryInfo->standby_signal_file_found)
    6396                 :          56 :             durable_unlink(STANDBY_SIGNAL_FILE, FATAL);
    6397                 :             : 
    6398         [ +  + ]:          59 :         if (endOfRecoveryInfo->recovery_signal_file_found)
    6399                 :           4 :             durable_unlink(RECOVERY_SIGNAL_FILE, FATAL);
    6400                 :             : 
    6401                 :             :         /*
    6402                 :             :          * Write the timeline history file, and have it archived. After this
    6403                 :             :          * point (or rather, as soon as the file is archived), the timeline
    6404                 :             :          * will appear as "taken" in the WAL archive and to any standby
    6405                 :             :          * servers.  If we crash before actually switching to the new
    6406                 :             :          * timeline, standby servers will nevertheless think that we switched
    6407                 :             :          * to the new timeline, and will try to connect to the new timeline.
    6408                 :             :          * To minimize the window for that, try to do as little as possible
    6409                 :             :          * between here and writing the end-of-recovery record.
    6410                 :             :          */
    6411                 :          59 :         writeTimeLineHistory(newTLI, recoveryTargetTLI,
    6412                 :             :                              EndOfLog, endOfRecoveryInfo->recoveryStopReason);
    6413                 :             : 
    6414         [ +  - ]:          59 :         ereport(LOG,
    6415                 :             :                 (errmsg("archive recovery complete")));
    6416                 :             :     }
    6417                 :             : 
    6418                 :             :     /* Save the selected TimeLineID in shared memory, too */
    6419                 :        1018 :     SpinLockAcquire(&XLogCtl->info_lck);
    6420                 :        1018 :     XLogCtl->InsertTimeLineID = newTLI;
    6421                 :        1018 :     XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI;
    6422                 :        1018 :     SpinLockRelease(&XLogCtl->info_lck);
    6423                 :             : 
    6424                 :             :     /*
    6425                 :             :      * Actually, if WAL ended in an incomplete record, skip the parts that
    6426                 :             :      * made it through and start writing after the portion that persisted.
    6427                 :             :      * (It's critical to first write an OVERWRITE_CONTRECORD message, which
    6428                 :             :      * we'll do as soon as we're open for writing new WAL.)
    6429                 :             :      */
    6430         [ +  + ]:        1018 :     if (XLogRecPtrIsValid(missingContrecPtr))
    6431                 :             :     {
    6432                 :             :         /*
    6433                 :             :          * We should only have a missingContrecPtr if we're not switching to a
    6434                 :             :          * new timeline. When a timeline switch occurs, WAL is copied from the
    6435                 :             :          * old timeline to the new only up to the end of the last complete
    6436                 :             :          * record, so there can't be an incomplete WAL record that we need to
    6437                 :             :          * disregard.
    6438                 :             :          */
    6439                 :             :         Assert(newTLI == endOfRecoveryInfo->lastRecTLI);
    6440                 :             :         Assert(XLogRecPtrIsValid(abortedRecPtr));
    6441                 :          11 :         EndOfLog = missingContrecPtr;
    6442                 :             :     }
    6443                 :             : 
    6444                 :             :     /*
    6445                 :             :      * Prepare to write WAL starting at EndOfLog location, and init xlog
    6446                 :             :      * buffer cache using the block containing the last record from the
    6447                 :             :      * previous incarnation.
    6448                 :             :      */
    6449                 :        1018 :     Insert = &XLogCtl->Insert;
    6450                 :        1018 :     Insert->PrevBytePos = XLogRecPtrToBytePos(endOfRecoveryInfo->lastRec);
    6451                 :        1018 :     Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
    6452                 :             : 
    6453                 :             :     /*
    6454                 :             :      * Tricky point here: lastPage contains the *last* block that the LastRec
    6455                 :             :      * record spans, not the one it starts in.  The last block is indeed the
    6456                 :             :      * one we want to use.
    6457                 :             :      */
    6458         [ +  + ]:        1018 :     if (EndOfLog % XLOG_BLCKSZ != 0)
    6459                 :             :     {
    6460                 :             :         char       *page;
    6461                 :             :         int         len;
    6462                 :             :         int         firstIdx;
    6463                 :             : 
    6464                 :         985 :         firstIdx = XLogRecPtrToBufIdx(EndOfLog);
    6465                 :         985 :         len = EndOfLog - endOfRecoveryInfo->lastPageBeginPtr;
    6466                 :             :         Assert(len < XLOG_BLCKSZ);
    6467                 :             : 
    6468                 :             :         /* Copy the valid part of the last block, and zero the rest */
    6469                 :         985 :         page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
    6470                 :         985 :         memcpy(page, endOfRecoveryInfo->lastPage, len);
    6471                 :         985 :         memset(page + len, 0, XLOG_BLCKSZ - len);
    6472                 :             : 
    6473                 :         985 :         pg_atomic_write_u64(&XLogCtl->xlblocks[firstIdx], endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ);
    6474                 :         985 :         XLogCtl->InitializedUpTo = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ;
    6475                 :             :     }
    6476                 :             :     else
    6477                 :             :     {
    6478                 :             :         /*
    6479                 :             :          * There is no partial block to copy. Just set InitializedUpTo, and
    6480                 :             :          * let the first attempt to insert a log record initialize the next
    6481                 :             :          * buffer.
    6482                 :             :          */
    6483                 :          33 :         XLogCtl->InitializedUpTo = EndOfLog;
    6484                 :             :     }
    6485                 :             : 
    6486                 :             :     /*
    6487                 :             :      * Update local and shared status.  This is OK to do without any locks
    6488                 :             :      * because no other process can be reading or writing WAL yet.
    6489                 :             :      */
    6490                 :        1018 :     LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
    6491                 :        1018 :     pg_atomic_write_u64(&XLogCtl->logInsertResult, EndOfLog);
    6492                 :        1018 :     pg_atomic_write_u64(&XLogCtl->logWriteResult, EndOfLog);
    6493                 :        1018 :     pg_atomic_write_u64(&XLogCtl->logFlushResult, EndOfLog);
    6494                 :        1018 :     XLogCtl->LogwrtRqst.Write = EndOfLog;
    6495                 :        1018 :     XLogCtl->LogwrtRqst.Flush = EndOfLog;
    6496                 :             : 
    6497                 :             :     /*
    6498                 :             :      * Preallocate additional log files, if wanted.
    6499                 :             :      */
    6500                 :        1018 :     PreallocXlogFiles(EndOfLog, newTLI);
    6501                 :             : 
    6502                 :             :     /*
    6503                 :             :      * Okay, we're officially UP.
    6504                 :             :      */
    6505                 :        1018 :     InRecovery = false;
    6506                 :             : 
    6507                 :             :     /* start the archive_timeout timer and LSN running */
    6508                 :        1018 :     XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
    6509                 :        1018 :     XLogCtl->lastSegSwitchLSN = EndOfLog;
    6510                 :             : 
    6511                 :             :     /* also initialize latestCompletedXid, to nextXid - 1 */
    6512                 :        1018 :     LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
    6513                 :        1018 :     TransamVariables->latestCompletedXid = TransamVariables->nextXid;
    6514                 :        1018 :     FullTransactionIdRetreat(&TransamVariables->latestCompletedXid);
    6515                 :        1018 :     LWLockRelease(ProcArrayLock);
    6516                 :             : 
    6517                 :             :     /*
    6518                 :             :      * Start up subtrans, if not already done for hot standby.  (commit
    6519                 :             :      * timestamps are started below, if necessary.)
    6520                 :             :      */
    6521         [ +  + ]:        1018 :     if (standbyState == STANDBY_DISABLED)
    6522                 :         959 :         StartupSUBTRANS(oldestActiveXID);
    6523                 :             : 
    6524                 :             :     /*
    6525                 :             :      * Perform end of recovery actions for any SLRUs that need it.
    6526                 :             :      */
    6527                 :        1018 :     TrimCLOG();
    6528                 :        1018 :     TrimMultiXact();
    6529                 :             : 
    6530                 :             :     /*
    6531                 :             :      * Reload shared-memory state for prepared transactions.  This needs to
    6532                 :             :      * happen before renaming the last partial segment of the old timeline as
    6533                 :             :      * it may be possible that we have to recover some transactions from it.
    6534                 :             :      */
    6535                 :        1018 :     RecoverPreparedTransactions();
    6536                 :             : 
    6537                 :             :     /* Shut down xlogreader */
    6538                 :        1018 :     ShutdownWalRecovery();
    6539                 :             : 
    6540                 :             :     /* Enable WAL writes for this backend only. */
    6541                 :        1018 :     LocalSetXLogInsertAllowed();
    6542                 :             : 
    6543                 :             :     /* If necessary, write overwrite-contrecord before doing anything else */
    6544         [ +  + ]:        1018 :     if (XLogRecPtrIsValid(abortedRecPtr))
    6545                 :             :     {
    6546                 :             :         Assert(XLogRecPtrIsValid(missingContrecPtr));
    6547                 :          11 :         CreateOverwriteContrecordRecord(abortedRecPtr, missingContrecPtr, newTLI);
    6548                 :             :     }
    6549                 :             : 
    6550                 :             :     /*
    6551                 :             :      * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
    6552                 :             :      * record before resource manager writes cleanup WAL records or checkpoint
    6553                 :             :      * record is written.
    6554                 :             :      */
    6555                 :        1018 :     Insert->fullPageWrites = lastFullPageWrites;
    6556                 :        1018 :     UpdateFullPageWrites();
    6557                 :             : 
    6558                 :             :     /*
    6559                 :             :      * Emit checkpoint or end-of-recovery record in XLOG, if required.
    6560                 :             :      */
    6561         [ +  + ]:        1018 :     if (performedWalRecovery)
    6562                 :         167 :         promoted = PerformRecoveryXLogAction();
    6563                 :             : 
    6564                 :             :     /*
    6565                 :             :      * If any of the critical GUCs have changed, log them before we allow
    6566                 :             :      * backends to write WAL.
    6567                 :             :      */
    6568                 :        1018 :     XLogReportParameters();
    6569                 :             : 
    6570                 :             :     /* If this is archive recovery, perform post-recovery cleanup actions. */
    6571         [ +  + ]:        1018 :     if (ArchiveRecoveryRequested)
    6572                 :          59 :         CleanupAfterArchiveRecovery(EndOfLogTLI, EndOfLog, newTLI);
    6573                 :             : 
    6574                 :        1018 :     INJECTION_POINT("promotion-after-wal-segment-cleanup", NULL);
    6575                 :             : 
    6576                 :             :     /*
    6577                 :             :      * Local WAL inserts enabled, so it's time to finish initialization of
    6578                 :             :      * commit timestamp.
    6579                 :             :      */
    6580                 :        1018 :     CompleteCommitTsInitialization();
    6581                 :             : 
    6582                 :             :     /*
    6583                 :             :      * Update logical decoding status in shared memory and write an
    6584                 :             :      * XLOG_LOGICAL_DECODING_STATUS_CHANGE, if necessary.
    6585                 :             :      */
    6586                 :        1018 :     UpdateLogicalDecodingStatusEndOfRecovery();
    6587                 :             : 
    6588                 :             :     /* Clean up EndOfWalRecoveryInfo data to appease Valgrind leak checking */
    6589         [ +  + ]:        1018 :     if (endOfRecoveryInfo->lastPage)
    6590                 :         996 :         pfree(endOfRecoveryInfo->lastPage);
    6591                 :        1018 :     pfree(endOfRecoveryInfo->recoveryStopReason);
    6592                 :        1018 :     pfree(endOfRecoveryInfo);
    6593                 :             : 
    6594                 :             :     /*
    6595                 :             :      * If we reach this point with checksums in the state inprogress-on, it
    6596                 :             :      * means that data checksums were in the process of being enabled when the
    6597                 :             :      * cluster shut down. Since processing didn't finish, the operation will
    6598                 :             :      * have to be restarted from scratch since there is no capability to
    6599                 :             :      * continue where it was when the cluster shut down. Thus, revert the
    6600                 :             :      * state back to off, and inform the user with a warning message. Being
    6601                 :             :      * able to restart processing is a TODO, but it wouldn't be possible to
    6602                 :             :      * restart here since we cannot launch a dynamic background worker
    6603                 :             :      * directly from here (it has to be from a regular backend).
    6604                 :             :      */
    6605         [ +  + ]:        1018 :     if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_ON)
    6606                 :             :     {
    6607                 :           1 :         XLogChecksums(PG_DATA_CHECKSUM_OFF);
    6608                 :             : 
    6609                 :           1 :         SpinLockAcquire(&XLogCtl->info_lck);
    6610                 :           1 :         XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_OFF;
    6611                 :           1 :         SetLocalDataChecksumState(XLogCtl->data_checksum_version);
    6612                 :           1 :         SpinLockRelease(&XLogCtl->info_lck);
    6613                 :             : 
    6614                 :           1 :         EmitAndWaitDataChecksumsBarrier(PG_DATA_CHECKSUM_OFF);
    6615         [ +  - ]:           1 :         ereport(WARNING,
    6616                 :             :                 errmsg("enabling data checksums was interrupted"),
    6617                 :             :                 errhint("Data checksum processing must be manually restarted for checksums to be enabled."));
    6618                 :             :     }
    6619                 :             : 
    6620                 :             :     /*
    6621                 :             :      * If data checksums were being disabled when the cluster was shut down,
    6622                 :             :      * we know that we have a state where all backends have stopped validating
    6623                 :             :      * checksums and we can move to off instead of prompting the user to
    6624                 :             :      * perform any action.
    6625                 :             :      */
    6626         [ -  + ]:        1017 :     else if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_OFF)
    6627                 :             :     {
    6628                 :           0 :         XLogChecksums(PG_DATA_CHECKSUM_OFF);
    6629                 :             : 
    6630                 :           0 :         SpinLockAcquire(&XLogCtl->info_lck);
    6631                 :           0 :         XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_OFF;
    6632                 :           0 :         SetLocalDataChecksumState(XLogCtl->data_checksum_version);
    6633                 :           0 :         SpinLockRelease(&XLogCtl->info_lck);
    6634                 :             : 
    6635                 :           0 :         EmitAndWaitDataChecksumsBarrier(PG_DATA_CHECKSUM_OFF);
    6636                 :             :     }
    6637                 :             : 
    6638                 :             :     /*
    6639                 :             :      * All done with end-of-recovery actions.
    6640                 :             :      *
    6641                 :             :      * Now allow backends to write WAL and update the control file status in
    6642                 :             :      * consequence.  SharedRecoveryState, that controls if backends can write
    6643                 :             :      * WAL, is updated while holding ControlFileLock to prevent other backends
    6644                 :             :      * from seeing an inconsistent state of the control file in shared memory.
    6645                 :             :      * There is still a small window during which backends can write WAL and
    6646                 :             :      * the control file is still referring to a system not in DB_IN_PRODUCTION
    6647                 :             :      * state while looking at the on-disk control file.
    6648                 :             :      *
    6649                 :             :      * Also, we use info_lck to update SharedRecoveryState to ensure that
    6650                 :             :      * there are no race conditions concerning visibility of other recent
    6651                 :             :      * updates to shared memory.
    6652                 :             :      */
    6653                 :        1018 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    6654                 :        1018 :     ControlFile->state = DB_IN_PRODUCTION;
    6655                 :             : 
    6656                 :        1018 :     SpinLockAcquire(&XLogCtl->info_lck);
    6657                 :        1018 :     ControlFile->data_checksum_version = XLogCtl->data_checksum_version;
    6658                 :        1018 :     XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
    6659                 :        1018 :     SpinLockRelease(&XLogCtl->info_lck);
    6660                 :             : 
    6661                 :        1018 :     UpdateControlFile();
    6662                 :        1018 :     LWLockRelease(ControlFileLock);
    6663                 :             : 
    6664                 :             :     /*
    6665                 :             :      * Wake up the checkpointer process as there might be a request to disable
    6666                 :             :      * logical decoding by concurrent slot drop.
    6667                 :             :      */
    6668                 :        1018 :     WakeupCheckpointer();
    6669                 :             : 
    6670                 :             :     /*
    6671                 :             :      * Wake up all waiters.  They need to report an error that recovery was
    6672                 :             :      * ended before reaching the target LSN.
    6673                 :             :      */
    6674                 :        1018 :     WaitLSNWakeup(WAIT_LSN_TYPE_STANDBY_REPLAY, InvalidXLogRecPtr);
    6675                 :        1018 :     WaitLSNWakeup(WAIT_LSN_TYPE_STANDBY_WRITE, InvalidXLogRecPtr);
    6676                 :        1018 :     WaitLSNWakeup(WAIT_LSN_TYPE_STANDBY_FLUSH, InvalidXLogRecPtr);
    6677                 :             : 
    6678                 :             :     /*
    6679                 :             :      * Shutdown the recovery environment.  This must occur after
    6680                 :             :      * RecoverPreparedTransactions() (see notes in lock_twophase_recover())
    6681                 :             :      * and after switching SharedRecoveryState to RECOVERY_STATE_DONE so that
    6682                 :             :      * any session building a snapshot will not rely on KnownAssignedXids as
    6683                 :             :      * RecoveryInProgress() would return false at this stage.  This is
    6684                 :             :      * particularly critical for prepared 2PC transactions, that would still
    6685                 :             :      * need to be included in snapshots once recovery has ended.
    6686                 :             :      */
    6687         [ +  + ]:        1018 :     if (standbyState != STANDBY_DISABLED)
    6688                 :          59 :         ShutdownRecoveryTransactionEnvironment();
    6689                 :             : 
    6690                 :             :     /*
    6691                 :             :      * If there were cascading standby servers connected to us, nudge any wal
    6692                 :             :      * sender processes to notice that we've been promoted.
    6693                 :             :      */
    6694                 :        1018 :     WalSndWakeup(true, true);
    6695                 :             : 
    6696                 :             :     /*
    6697                 :             :      * If this was a promotion, request an (online) checkpoint now. This isn't
    6698                 :             :      * required for consistency, but the last restartpoint might be far back,
    6699                 :             :      * and in case of a crash, recovering from it might take longer than is
    6700                 :             :      * appropriate now that we're not in standby mode anymore.
    6701                 :             :      */
    6702         [ +  + ]:        1018 :     if (promoted)
    6703                 :          52 :         RequestCheckpoint(CHECKPOINT_FORCE);
    6704                 :        1018 : }
    6705                 :             : 
    6706                 :             : /*
    6707                 :             :  * Callback from PerformWalRecovery(), called when we switch from crash
    6708                 :             :  * recovery to archive recovery mode.  Updates the control file accordingly.
    6709                 :             :  */
    6710                 :             : void
    6711                 :           1 : SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI)
    6712                 :             : {
    6713                 :             :     /* initialize minRecoveryPoint to this record */
    6714                 :           1 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    6715                 :           1 :     ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
    6716         [ +  - ]:           1 :     if (ControlFile->minRecoveryPoint < EndRecPtr)
    6717                 :             :     {
    6718                 :           1 :         ControlFile->minRecoveryPoint = EndRecPtr;
    6719                 :           1 :         ControlFile->minRecoveryPointTLI = replayTLI;
    6720                 :             :     }
    6721                 :             :     /* update local copy */
    6722                 :           1 :     LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    6723                 :           1 :     LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    6724                 :             : 
    6725                 :             :     /*
    6726                 :             :      * The startup process can update its local copy of minRecoveryPoint from
    6727                 :             :      * this point.
    6728                 :             :      */
    6729                 :           1 :     updateMinRecoveryPoint = true;
    6730                 :             : 
    6731                 :           1 :     UpdateControlFile();
    6732                 :             : 
    6733                 :             :     /*
    6734                 :             :      * We update SharedRecoveryState while holding the lock on ControlFileLock
    6735                 :             :      * so both states are consistent in shared memory.
    6736                 :             :      */
    6737                 :           1 :     SpinLockAcquire(&XLogCtl->info_lck);
    6738                 :           1 :     XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
    6739                 :           1 :     SpinLockRelease(&XLogCtl->info_lck);
    6740                 :             : 
    6741                 :           1 :     LWLockRelease(ControlFileLock);
    6742                 :           1 : }
    6743                 :             : 
    6744                 :             : /*
    6745                 :             :  * Callback from PerformWalRecovery(), called when we reach the end of backup.
    6746                 :             :  * Updates the control file accordingly.
    6747                 :             :  */
    6748                 :             : void
    6749                 :          88 : ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli)
    6750                 :             : {
    6751                 :             :     /*
    6752                 :             :      * We have reached the end of base backup, as indicated by pg_control. The
    6753                 :             :      * data on disk is now consistent (unless minRecoveryPoint is further
    6754                 :             :      * ahead, which can happen if we crashed during previous recovery).  Reset
    6755                 :             :      * backupStartPoint and backupEndPoint, and update minRecoveryPoint to
    6756                 :             :      * make sure we don't allow starting up at an earlier point even if
    6757                 :             :      * recovery is stopped and restarted soon after this.
    6758                 :             :      */
    6759                 :          88 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    6760                 :             : 
    6761         [ +  + ]:          88 :     if (ControlFile->minRecoveryPoint < EndRecPtr)
    6762                 :             :     {
    6763                 :          81 :         ControlFile->minRecoveryPoint = EndRecPtr;
    6764                 :          81 :         ControlFile->minRecoveryPointTLI = tli;
    6765                 :             :     }
    6766                 :             : 
    6767                 :          88 :     ControlFile->backupStartPoint = InvalidXLogRecPtr;
    6768                 :          88 :     ControlFile->backupEndPoint = InvalidXLogRecPtr;
    6769                 :          88 :     ControlFile->backupEndRequired = false;
    6770                 :          88 :     UpdateControlFile();
    6771                 :             : 
    6772                 :          88 :     LWLockRelease(ControlFileLock);
    6773                 :          88 : }
    6774                 :             : 
    6775                 :             : /*
    6776                 :             :  * Perform whatever XLOG actions are necessary at end of REDO.
    6777                 :             :  *
    6778                 :             :  * The goal here is to make sure that we'll be able to recover properly if
    6779                 :             :  * we crash again. If we choose to write a checkpoint, we'll write a shutdown
    6780                 :             :  * checkpoint rather than an on-line one. This is not particularly critical,
    6781                 :             :  * but since we may be assigning a new TLI, using a shutdown checkpoint allows
    6782                 :             :  * us to have the rule that TLI only changes in shutdown checkpoints, which
    6783                 :             :  * allows some extra error checking in xlog_redo.
    6784                 :             :  */
    6785                 :             : static bool
    6786                 :         167 : PerformRecoveryXLogAction(void)
    6787                 :             : {
    6788                 :         167 :     bool        promoted = false;
    6789                 :             : 
    6790                 :             :     /*
    6791                 :             :      * Perform a checkpoint to update all our recovery activity to disk.
    6792                 :             :      *
    6793                 :             :      * Note that we write a shutdown checkpoint rather than an on-line one.
    6794                 :             :      * This is not particularly critical, but since we may be assigning a new
    6795                 :             :      * TLI, using a shutdown checkpoint allows us to have the rule that TLI
    6796                 :             :      * only changes in shutdown checkpoints, which allows some extra error
    6797                 :             :      * checking in xlog_redo.
    6798                 :             :      *
    6799                 :             :      * In promotion, only create a lightweight end-of-recovery record instead
    6800                 :             :      * of a full checkpoint. A checkpoint is requested later, after we're
    6801                 :             :      * fully out of recovery mode and already accepting queries.
    6802                 :             :      */
    6803   [ +  +  +  -  :         226 :     if (ArchiveRecoveryRequested && IsUnderPostmaster &&
                   +  + ]
    6804                 :          59 :         PromoteIsTriggered())
    6805                 :             :     {
    6806                 :          52 :         promoted = true;
    6807                 :             : 
    6808                 :             :         /*
    6809                 :             :          * Insert a special WAL record to mark the end of recovery, since we
    6810                 :             :          * aren't doing a checkpoint. That means that the checkpointer process
    6811                 :             :          * may likely be in the middle of a time-smoothed restartpoint and
    6812                 :             :          * could continue to be for minutes after this.  That sounds strange,
    6813                 :             :          * but the effect is roughly the same and it would be stranger to try
    6814                 :             :          * to come out of the restartpoint and then checkpoint. We request a
    6815                 :             :          * checkpoint later anyway, just for safety.
    6816                 :             :          */
    6817                 :          52 :         CreateEndOfRecoveryRecord();
    6818                 :             :     }
    6819                 :             :     else
    6820                 :             :     {
    6821                 :         115 :         RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
    6822                 :             :                           CHECKPOINT_FAST |
    6823                 :             :                           CHECKPOINT_WAIT);
    6824                 :             :     }
    6825                 :             : 
    6826                 :         167 :     return promoted;
    6827                 :             : }
    6828                 :             : 
    6829                 :             : /*
    6830                 :             :  * Is the system still in recovery?
    6831                 :             :  *
    6832                 :             :  * Unlike testing InRecovery, this works in any process that's connected to
    6833                 :             :  * shared memory.
    6834                 :             :  */
    6835                 :             : bool
    6836                 :    98017585 : RecoveryInProgress(void)
    6837                 :             : {
    6838                 :             :     /*
    6839                 :             :      * We check shared state each time only until we leave recovery mode. We
    6840                 :             :      * can't re-enter recovery, so there's no need to keep checking after the
    6841                 :             :      * shared variable has once been seen false.
    6842                 :             :      */
    6843         [ +  + ]:    98017585 :     if (!LocalRecoveryInProgress)
    6844                 :    96166979 :         return false;
    6845                 :             :     else
    6846                 :             :     {
    6847                 :             :         /*
    6848                 :             :          * use volatile pointer to make sure we make a fresh read of the
    6849                 :             :          * shared variable.
    6850                 :             :          */
    6851                 :     1850606 :         volatile XLogCtlData *xlogctl = XLogCtl;
    6852                 :             : 
    6853                 :     1850606 :         LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);
    6854                 :             : 
    6855                 :             :         /*
    6856                 :             :          * Note: We don't need a memory barrier when we're still in recovery.
    6857                 :             :          * We might exit recovery immediately after return, so the caller
    6858                 :             :          * can't rely on 'true' meaning that we're still in recovery anyway.
    6859                 :             :          */
    6860                 :             : 
    6861                 :     1850606 :         return LocalRecoveryInProgress;
    6862                 :             :     }
    6863                 :             : }
    6864                 :             : 
    6865                 :             : /*
    6866                 :             :  * Returns current recovery state from shared memory.
    6867                 :             :  *
    6868                 :             :  * This returned state is kept consistent with the contents of the control
    6869                 :             :  * file.  See details about the possible values of RecoveryState in xlog.h.
    6870                 :             :  */
    6871                 :             : RecoveryState
    6872                 :       27091 : GetRecoveryState(void)
    6873                 :             : {
    6874                 :             :     RecoveryState retval;
    6875                 :             : 
    6876                 :       27091 :     SpinLockAcquire(&XLogCtl->info_lck);
    6877                 :       27091 :     retval = XLogCtl->SharedRecoveryState;
    6878                 :       27091 :     SpinLockRelease(&XLogCtl->info_lck);
    6879                 :             : 
    6880                 :       27091 :     return retval;
    6881                 :             : }
    6882                 :             : 
    6883                 :             : /*
    6884                 :             :  * Is this process allowed to insert new WAL records?
    6885                 :             :  *
    6886                 :             :  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
    6887                 :             :  * But we also have provisions for forcing the result "true" or "false"
    6888                 :             :  * within specific processes regardless of the global state.
    6889                 :             :  */
    6890                 :             : bool
    6891                 :    67498962 : XLogInsertAllowed(void)
    6892                 :             : {
    6893                 :             :     /*
    6894                 :             :      * If value is "unconditionally true" or "unconditionally false", just
    6895                 :             :      * return it.  This provides the normal fast path once recovery is known
    6896                 :             :      * done.
    6897                 :             :      */
    6898         [ +  + ]:    67498962 :     if (LocalXLogInsertAllowed >= 0)
    6899                 :    66803462 :         return (bool) LocalXLogInsertAllowed;
    6900                 :             : 
    6901                 :             :     /*
    6902                 :             :      * Else, must check to see if we're still in recovery.
    6903                 :             :      */
    6904         [ +  + ]:      695500 :     if (RecoveryInProgress())
    6905                 :      684874 :         return false;
    6906                 :             : 
    6907                 :             :     /*
    6908                 :             :      * On exit from recovery, reset to "unconditionally true", since there is
    6909                 :             :      * no need to keep checking.
    6910                 :             :      */
    6911                 :       10626 :     LocalXLogInsertAllowed = 1;
    6912                 :       10626 :     return true;
    6913                 :             : }
    6914                 :             : 
    6915                 :             : /*
    6916                 :             :  * Make XLogInsertAllowed() return true in the current process only.
    6917                 :             :  *
    6918                 :             :  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
    6919                 :             :  * and even call LocalSetXLogInsertAllowed() again after that.
    6920                 :             :  *
    6921                 :             :  * Returns the previous value of LocalXLogInsertAllowed.
    6922                 :             :  */
    6923                 :             : static int
    6924                 :        1048 : LocalSetXLogInsertAllowed(void)
    6925                 :             : {
    6926                 :        1048 :     int         oldXLogAllowed = LocalXLogInsertAllowed;
    6927                 :             : 
    6928                 :        1048 :     LocalXLogInsertAllowed = 1;
    6929                 :             : 
    6930                 :        1048 :     return oldXLogAllowed;
    6931                 :             : }
    6932                 :             : 
    6933                 :             : /*
    6934                 :             :  * Return the current Redo pointer from shared memory.
    6935                 :             :  *
    6936                 :             :  * As a side-effect, the local RedoRecPtr copy is updated.
    6937                 :             :  */
    6938                 :             : XLogRecPtr
    6939                 :      367757 : GetRedoRecPtr(void)
    6940                 :             : {
    6941                 :             :     XLogRecPtr  ptr;
    6942                 :             : 
    6943                 :             :     /*
    6944                 :             :      * The possibly not up-to-date copy in XLogCtl is enough. Even if we
    6945                 :             :      * grabbed a WAL insertion lock to read the authoritative value in
    6946                 :             :      * Insert->RedoRecPtr, someone might update it just after we've released
    6947                 :             :      * the lock.
    6948                 :             :      */
    6949                 :      367757 :     SpinLockAcquire(&XLogCtl->info_lck);
    6950                 :      367757 :     ptr = XLogCtl->RedoRecPtr;
    6951                 :      367757 :     SpinLockRelease(&XLogCtl->info_lck);
    6952                 :             : 
    6953         [ +  + ]:      367757 :     if (RedoRecPtr < ptr)
    6954                 :        1703 :         RedoRecPtr = ptr;
    6955                 :             : 
    6956                 :      367757 :     return RedoRecPtr;
    6957                 :             : }
    6958                 :             : 
    6959                 :             : /*
    6960                 :             :  * Return information needed to decide whether a modified block needs a
    6961                 :             :  * full-page image to be included in the WAL record.
    6962                 :             :  *
    6963                 :             :  * The returned values are cached copies from backend-private memory, and
    6964                 :             :  * possibly out-of-date or, indeed, uninitialized, in which case they will
    6965                 :             :  * be InvalidXLogRecPtr and false, respectively.  XLogInsertRecord will
    6966                 :             :  * re-check them against up-to-date values, while holding the WAL insert lock.
    6967                 :             :  */
    6968                 :             : void
    6969                 :    25005323 : GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
    6970                 :             : {
    6971                 :    25005323 :     *RedoRecPtr_p = RedoRecPtr;
    6972                 :    25005323 :     *doPageWrites_p = doPageWrites;
    6973                 :    25005323 : }
    6974                 :             : 
    6975                 :             : /*
    6976                 :             :  * GetInsertRecPtr -- Returns the current insert position.
    6977                 :             :  *
    6978                 :             :  * NOTE: The value *actually* returned is the position of the last full
    6979                 :             :  * xlog page. It lags behind the real insert position by at most 1 page.
    6980                 :             :  * For that, we don't need to scan through WAL insertion locks, and an
    6981                 :             :  * approximation is enough for the current usage of this function.
    6982                 :             :  */
    6983                 :             : XLogRecPtr
    6984                 :        7734 : GetInsertRecPtr(void)
    6985                 :             : {
    6986                 :             :     XLogRecPtr  recptr;
    6987                 :             : 
    6988                 :        7734 :     SpinLockAcquire(&XLogCtl->info_lck);
    6989                 :        7734 :     recptr = XLogCtl->LogwrtRqst.Write;
    6990                 :        7734 :     SpinLockRelease(&XLogCtl->info_lck);
    6991                 :             : 
    6992                 :        7734 :     return recptr;
    6993                 :             : }
    6994                 :             : 
    6995                 :             : /*
    6996                 :             :  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
    6997                 :             :  * position known to be fsync'd to disk. This should only be used on a
    6998                 :             :  * system that is known not to be in recovery.
    6999                 :             :  */
    7000                 :             : XLogRecPtr
    7001                 :      224215 : GetFlushRecPtr(TimeLineID *insertTLI)
    7002                 :             : {
    7003                 :             :     Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE);
    7004                 :             : 
    7005                 :      224215 :     RefreshXLogWriteResult(LogwrtResult);
    7006                 :             : 
    7007                 :             :     /*
    7008                 :             :      * If we're writing and flushing WAL, the time line can't be changing, so
    7009                 :             :      * no lock is required.
    7010                 :             :      */
    7011         [ +  + ]:      224215 :     if (insertTLI)
    7012                 :       50103 :         *insertTLI = XLogCtl->InsertTimeLineID;
    7013                 :             : 
    7014                 :      224215 :     return LogwrtResult.Flush;
    7015                 :             : }
    7016                 :             : 
    7017                 :             : /*
    7018                 :             :  * GetWALInsertionTimeLine -- Returns the current timeline of a system that
    7019                 :             :  * is not in recovery.
    7020                 :             :  */
    7021                 :             : TimeLineID
    7022                 :      120192 : GetWALInsertionTimeLine(void)
    7023                 :             : {
    7024                 :             :     Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE);
    7025                 :             : 
    7026                 :             :     /* Since the value can't be changing, no lock is required. */
    7027                 :      120192 :     return XLogCtl->InsertTimeLineID;
    7028                 :             : }
    7029                 :             : 
    7030                 :             : /*
    7031                 :             :  * GetWALInsertionTimeLineIfSet -- If the system is not in recovery, returns
    7032                 :             :  * the WAL insertion timeline; else, returns 0. Wherever possible, use
    7033                 :             :  * GetWALInsertionTimeLine() instead, since it's cheaper. Note that this
    7034                 :             :  * function decides recovery has ended as soon as the insert TLI is set, which
    7035                 :             :  * happens before we set XLogCtl->SharedRecoveryState to RECOVERY_STATE_DONE.
                         :             :  *
                         :             :  * The function itself performs no recovery-state test: it copies
                         :             :  * XLogCtl->InsertTimeLineID while holding the info_lck spinlock and returns
                         :             :  * that copy unchanged.  The "returns 0" case therefore relies on the field
                         :             :  * still being 0 until the insert TLI is set.
                         :             :  * NOTE(review): where the field is initialized and set is not visible
                         :             :  * here -- confirm.
    7036                 :             :  */
    7037                 :             : TimeLineID
    7038                 :         939 : GetWALInsertionTimeLineIfSet(void)
    7039                 :             : {
    7040                 :             :     TimeLineID  insertTLI;
    7041                 :             : 
    7042                 :         939 :     SpinLockAcquire(&XLogCtl->info_lck);
    7043                 :         939 :     insertTLI = XLogCtl->InsertTimeLineID;
    7044                 :         939 :     SpinLockRelease(&XLogCtl->info_lck);
    7045                 :             : 
    7046                 :         939 :     return insertTLI;
    7047                 :             : }
    7048                 :             : 
    7049                 :             : /*
    7050                 :             :  * GetLastImportantRecPtr -- Returns the LSN of the last important record
    7051                 :             :  * inserted. All records not explicitly marked as unimportant are considered
    7052                 :             :  * important.
    7053                 :             :  *
    7054                 :             :  * The LSN is determined by computing the maximum of
    7055                 :             :  * WALInsertLocks[i].lastImportantAt.
                         :             :  *
                         :             :  * The running maximum starts at InvalidXLogRecPtr, so that value is
                         :             :  * returned when no slot holds anything greater.  Each insertion lock is
                         :             :  * acquired and released in turn, one slot at a time, so the slots are not
                         :             :  * all read at the same instant.
    7056                 :             :  */
    7057                 :             : XLogRecPtr
    7058                 :        1790 : GetLastImportantRecPtr(void)
    7059                 :             : {
    7060                 :        1790 :     XLogRecPtr  res = InvalidXLogRecPtr;
    7061                 :             :     int         i;
    7062                 :             : 
    7063         [ +  + ]:       16110 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    7064                 :             :     {
    7065                 :             :         XLogRecPtr  last_important;
    7066                 :             : 
    7067                 :             :         /*
    7068                 :             :          * Need to take a lock to prevent torn reads of the LSN, which are
    7069                 :             :          * possible on some of the supported platforms. WAL insert locks only
    7070                 :             :          * support exclusive mode, so we have to use that.
    7071                 :             :          */
    7072                 :       14320 :         LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
    7073                 :       14320 :         last_important = WALInsertLocks[i].l.lastImportantAt;
    7074                 :       14320 :         LWLockRelease(&WALInsertLocks[i].l.lock);
    7075                 :             : 
    7076         [ +  + ]:       14320 :         if (res < last_important)   /* keep the largest LSN seen so far */
    7077                 :        3020 :             res = last_important;
    7078                 :             :     }
    7079                 :             : 
    7080                 :        1790 :     return res;
    7081                 :             : }
    7082                 :             : 
    7083                 :             : /*
    7084                 :             :  * Get the time and LSN of the last xlog segment switch
                         :             :  *
                         :             :  * Returns XLogCtl->lastSegSwitchTime and stores XLogCtl->lastSegSwitchLSN
                         :             :  * into *lastSwitchLSN.  The pointer is written unconditionally, so it must
                         :             :  * not be NULL.  Both fields are read under one shared acquisition of
                         :             :  * WALWriteLock.
    7085                 :             :  */
    7086                 :             : pg_time_t
    7087                 :           0 : GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
    7088                 :             : {
    7089                 :             :     pg_time_t   result;
    7090                 :             : 
    7091                 :             :     /* Need WALWriteLock, but shared lock is sufficient */
    7092                 :           0 :     LWLockAcquire(WALWriteLock, LW_SHARED);
    7093                 :           0 :     result = XLogCtl->lastSegSwitchTime;
    7094                 :           0 :     *lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
    7095                 :           0 :     LWLockRelease(WALWriteLock);
    7096                 :             : 
    7097                 :           0 :     return result;
    7098                 :             : }
    7099                 :             : 
    7100                 :             : /*
    7101                 :             :  * This must be called ONCE during postmaster or standalone-backend shutdown
                         :             :  *
                         :             :  * Neither "code" nor "arg" is used by the body.
                         :             :  * NOTE(review): the signature presumably matches a shutdown-callback
                         :             :  * prototype -- confirm against the registration site.
                         :             :  *
                         :             :  * Sequence: install AuxProcessResourceOwner as the current resource owner,
                         :             :  * log "shutting down", move walsenders to the stopping state and wait for
                         :             :  * them, then perform either a shutdown restartpoint (if recovery is still
                         :             :  * in progress) or a shutdown checkpoint.  Both are requested with
                         :             :  * CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FAST.
    7102                 :             :  */
    7103                 :             : void
    7104                 :         769 : ShutdownXLOG(int code, Datum arg)
    7105                 :             : {
    7106                 :             :     /*
    7107                 :             :      * We should have an aux process resource owner to use, and we should not
    7108                 :             :      * be in a transaction that's installed some other resowner.
    7109                 :             :      */
    7110                 :             :     Assert(AuxProcessResourceOwner != NULL);
    7111                 :             :     Assert(CurrentResourceOwner == NULL ||
    7112                 :             :            CurrentResourceOwner == AuxProcessResourceOwner);
    7113                 :         769 :     CurrentResourceOwner = AuxProcessResourceOwner;
    7114                 :             : 
    7115                 :             :     /* Don't be chatty in standalone mode */
    7116   [ +  +  +  + ]:         769 :     ereport(IsPostmasterEnvironment ? LOG : NOTICE,
    7117                 :             :             (errmsg("shutting down")));
    7118                 :             : 
    7119                 :             :     /*
    7120                 :             :      * Signal walsenders to move to stopping state.
    7121                 :             :      */
    7122                 :         769 :     WalSndInitStopping();
    7123                 :             : 
    7124                 :             :     /*
    7125                 :             :      * Wait for WAL senders to be in stopping state.  This prevents commands
    7126                 :             :      * from writing new WAL.
    7127                 :             :      */
    7128                 :         769 :     WalSndWaitStopping();
    7129                 :             : 
    7130         [ +  + ]:         769 :     if (RecoveryInProgress())
    7131                 :          63 :         CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FAST);
    7132                 :             :     else
    7133                 :             :     {
    7134                 :             :         /*
    7135                 :             :          * If archiving is enabled, rotate the last XLOG file so that all the
    7136                 :             :          * remaining records are archived (postmaster wakes up the archiver
    7137                 :             :          * process one more time at the end of shutdown). The checkpoint
    7138                 :             :          * record will go to the next XLOG file and won't be archived (yet).
    7139                 :             :          */
    7140         [ +  + ]:         706 :         if (XLogArchivingActive())
    7141                 :          18 :             RequestXLogSwitch(false);
    7142                 :             : 
    7143                 :         706 :         CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FAST);
    7144                 :             :     }
    7145                 :         769 : }
    7146                 :             : 
    7147                 :             : /*
    7148                 :             :  * Format checkpoint request flags as a space-separated string for
    7149                 :             :  * log messages.
                         :             :  *
                         :             :  * Every flag bit that is set contributes one word preceded by a single
                         :             :  * space, so the result is either empty or starts with a space; the callers'
                         :             :  * format strings ("checkpoint starting:%s") rely on that.
                         :             :  *
                         :             :  * The returned pointer refers to a function-local static buffer that is
                         :             :  * overwritten by the next call, so the string must be consumed before this
                         :             :  * function is called again.
    7150                 :             :  */
    7151                 :             : static const char *
    7152                 :        3220 : CheckpointFlagsString(int flags)
    7153                 :             : {
    7154                 :             :     static char buf[128];
    7155                 :             : 
    7156                 :       25760 :     snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s",
    7157         [ +  + ]:        3220 :              (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
    7158         [ +  + ]:        3220 :              (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
    7159         [ +  + ]:        3220 :              (flags & CHECKPOINT_FAST) ? " fast" : "",
    7160         [ +  + ]:        3220 :              (flags & CHECKPOINT_FORCE) ? " force" : "",
    7161         [ +  + ]:        3220 :              (flags & CHECKPOINT_WAIT) ? " wait" : "",
    7162         [ +  + ]:        3220 :              (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
    7163         [ +  + ]:        3220 :              (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
    7164         [ +  + ]:        3220 :              (flags & CHECKPOINT_FLUSH_UNLOGGED) ? " flush-unlogged" : "");
    7165                 :             : 
    7166                 :        3220 :     return buf;
    7167                 :             : }
    7168                 :             : 
    7169                 :             : /*
    7170                 :             :  * Log start of a checkpoint.
                         :             :  *
                         :             :  * flags: checkpoint request flags, rendered with CheckpointFlagsString().
                         :             :  * restartpoint: selects the "restartpoint starting" wording instead of
                         :             :  * "checkpoint starting".
                         :             :  *
                         :             :  * The message is emitted at LOG level unconditionally; this function does
                         :             :  * not test log_checkpoints itself.
    7171                 :             :  */
    7172                 :             : static void
    7173                 :        1610 : LogCheckpointStart(int flags, bool restartpoint)
    7174                 :             : {
    7175         [ +  + ]:        1610 :     if (restartpoint)
    7176         [ +  - ]:         209 :         ereport(LOG,
    7177                 :             :         /* translator: the placeholder shows checkpoint options */
    7178                 :             :                 (errmsg("restartpoint starting:%s",
    7179                 :             :                         CheckpointFlagsString(flags))));
    7180                 :             :     else
    7181         [ +  - ]:        1401 :         ereport(LOG,
    7182                 :             :         /* translator: the placeholder shows checkpoint options */
    7183                 :             :                 (errmsg("checkpoint starting:%s",
    7184                 :             :                         CheckpointFlagsString(flags))));
    7185                 :        1610 : }
    7186                 :             : 
    7187                 :             : /*
    7188                 :             :  * Log end of a checkpoint.
                         :             :  *
                         :             :  * restartpoint: selects the "restartpoint complete" wording instead of
                         :             :  * "checkpoint complete".
                         :             :  * flags: checkpoint request flags, rendered with CheckpointFlagsString().
                         :             :  * Note that the argument order is the reverse of LogCheckpointStart().
                         :             :  *
                         :             :  * Regardless of log_checkpoints, this stamps CheckpointStats.ckpt_end_t and
                         :             :  * adds the write and sync durations (in milliseconds) to
                         :             :  * PendingCheckpointerStats.  The LOG message is produced only when
                         :             :  * log_checkpoints is set.
    7189                 :             :  */
    7190                 :             : static void
    7191                 :        1944 : LogCheckpointEnd(bool restartpoint, int flags)
    7192                 :             : {
    7193                 :             :     long        write_msecs,
    7194                 :             :                 sync_msecs,
    7195                 :             :                 total_msecs,
    7196                 :             :                 longest_msecs,
    7197                 :             :                 average_msecs;
    7198                 :             :     uint64      average_sync_time;
    7199                 :             : 
    7200                 :        1944 :     CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
    7201                 :             : 
    7202                 :        1944 :     write_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_write_t,
    7203                 :             :                                                   CheckpointStats.ckpt_sync_t);
    7204                 :             : 
    7205                 :        1944 :     sync_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_sync_t,
    7206                 :             :                                                  CheckpointStats.ckpt_sync_end_t);
    7207                 :             : 
    7208                 :             :     /* Accumulate checkpoint timing summary data, in milliseconds. */
    7209                 :        1944 :     PendingCheckpointerStats.write_time += write_msecs;
    7210                 :        1944 :     PendingCheckpointerStats.sync_time += sync_msecs;
    7211                 :             : 
    7212                 :             :     /*
    7213                 :             :      * All of the published timing statistics are accounted for.  Only
    7214                 :             :      * continue if a log message is to be written.
    7215                 :             :      */
    7216         [ +  + ]:        1944 :     if (!log_checkpoints)
    7217                 :         334 :         return;
    7218                 :             : 
    7219                 :        1610 :     total_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_start_t,
    7220                 :             :                                                   CheckpointStats.ckpt_end_t);
    7221                 :             : 
    7222                 :             :     /*
    7223                 :             :      * Timing values returned from CheckpointStats are in microseconds.
    7224                 :             :      * Convert to milliseconds for consistent printing.  Adding 999 before
                         :             :      * dividing rounds any partial millisecond up.
    7225                 :             :      */
    7226                 :        1610 :     longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000);
    7227                 :             : 
    7228                 :        1610 :     average_sync_time = 0;
    7229         [ -  + ]:        1610 :     if (CheckpointStats.ckpt_sync_rels > 0) /* avoid dividing by zero */
    7230                 :           0 :         average_sync_time = CheckpointStats.ckpt_agg_sync_time /
    7231                 :           0 :             CheckpointStats.ckpt_sync_rels;
    7232                 :        1610 :     average_msecs = (long) ((average_sync_time + 999) / 1000);
    7233                 :             : 
    7234                 :             :     /*
    7235                 :             :      * ControlFileLock is not required to see ControlFile->checkPoint and
    7236                 :             :      * ->checkPointCopy here as we are the only updater of those variables at
    7237                 :             :      * this moment.
    7238                 :             :      */
    7239         [ +  + ]:        1610 :     if (restartpoint)
    7240         [ +  - ]:         209 :         ereport(LOG,
    7241                 :             :                 (errmsg("restartpoint complete:%s: wrote %d buffers (%.1f%%), "
    7242                 :             :                         "wrote %d SLRU buffers; %d WAL file(s) added, "
    7243                 :             :                         "%d removed, %d recycled; write=%ld.%03d s, "
    7244                 :             :                         "sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, "
    7245                 :             :                         "longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, "
    7246                 :             :                         "estimate=%d kB; lsn=%X/%08X, redo lsn=%X/%08X",
    7247                 :             :                         CheckpointFlagsString(flags),
    7248                 :             :                         CheckpointStats.ckpt_bufs_written,
    7249                 :             :                         (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
    7250                 :             :                         CheckpointStats.ckpt_slru_written,
    7251                 :             :                         CheckpointStats.ckpt_segs_added,
    7252                 :             :                         CheckpointStats.ckpt_segs_removed,
    7253                 :             :                         CheckpointStats.ckpt_segs_recycled,
    7254                 :             :                         write_msecs / 1000, (int) (write_msecs % 1000),
    7255                 :             :                         sync_msecs / 1000, (int) (sync_msecs % 1000),
    7256                 :             :                         total_msecs / 1000, (int) (total_msecs % 1000),
    7257                 :             :                         CheckpointStats.ckpt_sync_rels,
    7258                 :             :                         longest_msecs / 1000, (int) (longest_msecs % 1000),
    7259                 :             :                         average_msecs / 1000, (int) (average_msecs % 1000),
    7260                 :             :                         (int) (PrevCheckPointDistance / 1024.0),
    7261                 :             :                         (int) (CheckPointDistanceEstimate / 1024.0),
    7262                 :             :                         LSN_FORMAT_ARGS(ControlFile->checkPoint),
    7263                 :             :                         LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo))));
    7264                 :             :     else
    7265         [ +  - ]:        1401 :         ereport(LOG,
    7266                 :             :                 (errmsg("checkpoint complete:%s: wrote %d buffers (%.1f%%), "
    7267                 :             :                         "wrote %d SLRU buffers; %d WAL file(s) added, "
    7268                 :             :                         "%d removed, %d recycled; write=%ld.%03d s, "
    7269                 :             :                         "sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, "
    7270                 :             :                         "longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, "
    7271                 :             :                         "estimate=%d kB; lsn=%X/%08X, redo lsn=%X/%08X",
    7272                 :             :                         CheckpointFlagsString(flags),
    7273                 :             :                         CheckpointStats.ckpt_bufs_written,
    7274                 :             :                         (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
    7275                 :             :                         CheckpointStats.ckpt_slru_written,
    7276                 :             :                         CheckpointStats.ckpt_segs_added,
    7277                 :             :                         CheckpointStats.ckpt_segs_removed,
    7278                 :             :                         CheckpointStats.ckpt_segs_recycled,
    7279                 :             :                         write_msecs / 1000, (int) (write_msecs % 1000),
    7280                 :             :                         sync_msecs / 1000, (int) (sync_msecs % 1000),
    7281                 :             :                         total_msecs / 1000, (int) (total_msecs % 1000),
    7282                 :             :                         CheckpointStats.ckpt_sync_rels,
    7283                 :             :                         longest_msecs / 1000, (int) (longest_msecs % 1000),
    7284                 :             :                         average_msecs / 1000, (int) (average_msecs % 1000),
    7285                 :             :                         (int) (PrevCheckPointDistance / 1024.0),
    7286                 :             :                         (int) (CheckPointDistanceEstimate / 1024.0),
    7287                 :             :                         LSN_FORMAT_ARGS(ControlFile->checkPoint),
    7288                 :             :                         LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo))));
    7289                 :             : }
    7290                 :             : 
    7291                 :             : /*
    7292                 :             :  * Update the estimate of distance between checkpoints.
    7293                 :             :  *
    7294                 :             :  * The estimate is used to calculate the number of WAL segments to keep
    7295                 :             :  * preallocated, see XLOGfileslop().
                         :             :  *
                         :             :  * nbytes is the amount of WAL attributed to the checkpoint cycle just
                         :             :  * finished.  It is always stored in PrevCheckPointDistance.
                         :             :  * CheckPointDistanceEstimate is raised to nbytes at once if nbytes is
                         :             :  * larger; otherwise it becomes 90% of the old estimate plus 10% of nbytes.
    7296                 :             :  */
    7297                 :             : static void
    7298                 :        1944 : UpdateCheckPointDistanceEstimate(uint64 nbytes)
    7299                 :             : {
    7300                 :             :     /*
    7301                 :             :      * To estimate the number of segments consumed between checkpoints, keep a
    7302                 :             :      * moving average of the amount of WAL generated in previous checkpoint
    7303                 :             :      * cycles. However, if the load is bursty, with quiet periods and busy
    7304                 :             :      * periods, we want to cater for the peak load. So instead of a plain
    7305                 :             :      * moving average, let the average decline slowly if the previous cycle
    7306                 :             :      * used less WAL than estimated, but bump it up immediately if it used
    7307                 :             :      * more.
    7308                 :             :      *
    7309                 :             :      * When checkpoints are triggered by max_wal_size, this should converge to
    7310                 :             :      * CheckpointSegments * wal_segment_size.
    7311                 :             :      *
    7312                 :             :      * Note: This doesn't pay any attention to what caused the checkpoint.
    7313                 :             :      * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
    7314                 :             :      * starting a base backup, are counted the same as those created
    7315                 :             :      * automatically. The slow-decline will largely mask them out, if they are
    7316                 :             :      * not frequent. If they are frequent, it seems reasonable to count them
    7317                 :             :      * in as any others; if you issue a manual checkpoint every 5 minutes and
    7318                 :             :      * never let a timed checkpoint happen, it makes sense to base the
    7319                 :             :      * preallocation on that 5 minute interval rather than whatever
    7320                 :             :      * checkpoint_timeout is set to.
    7321                 :             :      */
    7322                 :        1944 :     PrevCheckPointDistance = nbytes;
    7323         [ +  + ]:        1944 :     if (CheckPointDistanceEstimate < nbytes)
    7324                 :         866 :         CheckPointDistanceEstimate = nbytes;    /* bump up immediately */
    7325                 :             :     else
    7326                 :        1078 :         CheckPointDistanceEstimate =
    7327                 :        1078 :             (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);   /* decline slowly */
    7328                 :        1944 : }
    7329                 :             : 
    7330                 :             : /*
    7331                 :             :  * Update the ps display for a process running a checkpoint.  Note that
    7332                 :             :  * this routine should not do any allocations so that it can be called
    7333                 :             :  * from a critical section.
                         :             :  *
                         :             :  * flags: checkpoint request flags; nothing is done unless
                         :             :  * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_IS_SHUTDOWN is set.
                         :             :  * restartpoint: selects the word "restartpoint" instead of "checkpoint".
                         :             :  * reset: if true, the display is set to an empty string; otherwise it is
                         :             :  * set to "performing [end-of-recovery ][shutdown ]<checkpoint|restartpoint>".
                         :             :  *
                         :             :  * The message is built in a fixed-size buffer on the stack.
    7334                 :             :  */
    7335                 :             : static void
    7336                 :        3888 : update_checkpoint_display(int flags, bool restartpoint, bool reset)
    7337                 :             : {
    7338                 :             :     /*
    7339                 :             :      * The status is reported only for end-of-recovery and shutdown
    7340                 :             :      * checkpoints or shutdown restartpoints.  Updating the ps display is
    7341                 :             :      * useful in those situations as it may not be possible to rely on
    7342                 :             :      * pg_stat_activity to see the status of the checkpointer or the startup
    7343                 :             :      * process.
    7344                 :             :      */
    7345         [ +  + ]:        3888 :     if ((flags & (CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IS_SHUTDOWN)) == 0)
    7346                 :        2368 :         return;
    7347                 :             : 
    7348         [ +  + ]:        1520 :     if (reset)
    7349                 :         760 :         set_ps_display("");
    7350                 :             :     else
    7351                 :             :     {
    7352                 :             :         char        activitymsg[128];
    7353                 :             : 
    7354         [ +  + ]:        2280 :         snprintf(activitymsg, sizeof(activitymsg), "performing %s%s%s",
    7355         [ +  + ]:         760 :                  (flags & CHECKPOINT_END_OF_RECOVERY) ? "end-of-recovery " : "",
    7356         [ +  + ]:         760 :                  (flags & CHECKPOINT_IS_SHUTDOWN) ? "shutdown " : "",
    7357                 :             :                  restartpoint ? "restartpoint" : "checkpoint");
    7358                 :         760 :         set_ps_display(activitymsg);
    7359                 :             :     }
    7360                 :             : }
    7361                 :             : 
    7362                 :             : 
    7363                 :             : /*
    7364                 :             :  * Perform a checkpoint --- either during shutdown, or on-the-fly
    7365                 :             :  *
    7366                 :             :  * flags is a bitwise OR of the following:
    7367                 :             :  *  CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
    7368                 :             :  *  CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
    7369                 :             :  *  CHECKPOINT_FAST: finish the checkpoint ASAP, ignoring
    7370                 :             :  *      checkpoint_completion_target parameter.
    7371                 :             :  *  CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
    7372                 :             :  *      since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
    7373                 :             :  *      CHECKPOINT_END_OF_RECOVERY).
    7374                 :             :  *  CHECKPOINT_FLUSH_UNLOGGED: also flush buffers of unlogged tables.
    7375                 :             :  *
    7376                 :             :  * Note: flags contains other bits, of interest here only for logging purposes.
    7377                 :             :  * In particular note that this routine is synchronous and does not pay
    7378                 :             :  * attention to CHECKPOINT_WAIT.
    7379                 :             :  *
    7380                 :             :  * If !shutdown then we are writing an online checkpoint. An XLOG_CHECKPOINT_REDO
    7381                 :             :  * record is inserted into WAL at the logical location of the checkpoint, before
    7382                 :             :  * flushing anything to disk; when the checkpoint is eventually completed,
    7383                 :             :  * it is from this point that WAL replay will begin in the case of a recovery
    7384                 :             :  * from this checkpoint. Once everything is written to disk, an
    7385                 :             :  * XLOG_CHECKPOINT_ONLINE record is written to complete the checkpoint, and
    7386                 :             :  * points back to the earlier XLOG_CHECKPOINT_REDO record. This mechanism allows
    7387                 :             :  * other write-ahead log records to be written while the checkpoint is in
    7388                 :             :  * progress, but we must be very careful about order of operations. This function
    7389                 :             :  * may take many minutes to execute on a busy system.
    7390                 :             :  *
    7391                 :             :  * On the other hand, when shutdown is true, concurrent insertion into the
    7392                 :             :  * write-ahead log is impossible, so there is no need for two separate records.
    7393                 :             :  * In this case, we only insert an XLOG_CHECKPOINT_SHUTDOWN record, and it's
    7394                 :             :  * both the record marking the completion of the checkpoint and the location
    7395                 :             :  * from which WAL replay would begin if needed.
    7396                 :             :  *
    7397                 :             :  * Returns true if a new checkpoint was performed, or false if it was skipped
    7398                 :             :  * because the system was idle.
    7399                 :             :  */
    7400                 :             : bool
    7401                 :        1735 : CreateCheckPoint(int flags)
    7402                 :             : {
    7403                 :             :     bool        shutdown;
    7404                 :             :     CheckPoint  checkPoint;
    7405                 :             :     XLogRecPtr  recptr;
    7406                 :             :     XLogSegNo   _logSegNo;
    7407                 :        1735 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    7408                 :             :     uint32      freespace;
    7409                 :             :     XLogRecPtr  PriorRedoPtr;
    7410                 :             :     XLogRecPtr  last_important_lsn;
    7411                 :             :     VirtualTransactionId *vxids;
    7412                 :             :     int         nvxids;
    7413                 :        1735 :     int         oldXLogAllowed = 0;
    7414                 :             : 
    7415                 :             :     /*
    7416                 :             :      * An end-of-recovery checkpoint is really a shutdown checkpoint, just
    7417                 :             :      * issued at a different time.
    7418                 :             :      */
    7419         [ +  + ]:        1735 :     if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
    7420                 :         736 :         shutdown = true;
    7421                 :             :     else
    7422                 :         999 :         shutdown = false;
    7423                 :             : 
    7424                 :             :     /* sanity check */
    7425   [ +  +  -  + ]:        1735 :     if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
    7426         [ #  # ]:           0 :         elog(ERROR, "can't create a checkpoint during recovery");
    7427                 :             : 
    7428                 :             :     /*
    7429                 :             :      * Prepare to accumulate statistics.
    7430                 :             :      *
    7431                 :             :      * Note: because it is possible for log_checkpoints to change while a
    7432                 :             :      * checkpoint proceeds, we always accumulate stats, even if
    7433                 :             :      * log_checkpoints is currently off.
    7434                 :             :      */
    7435   [ +  -  +  -  :       19085 :     MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
          +  -  +  -  +  
                      + ]
    7436                 :        1735 :     CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
    7437                 :             : 
    7438                 :             :     /*
    7439                 :             :      * Let smgr prepare for checkpoint; this has to happen outside the
    7440                 :             :      * critical section and before we determine the REDO pointer.  Note that
    7441                 :             :      * smgr must not do anything that'd have to be undone if we decide no
    7442                 :             :      * checkpoint is needed.
    7443                 :             :      */
    7444                 :        1735 :     SyncPreCheckpoint();
    7445                 :             : 
    7446                 :             :     /* Run these points outside the critical section. */
    7447                 :        1735 :     INJECTION_POINT("create-checkpoint-initial", NULL);
    7448                 :        1735 :     INJECTION_POINT_LOAD("create-checkpoint-run");
    7449                 :             : 
    7450                 :             :     /*
    7451                 :             :      * Use a critical section to force system panic if we have trouble.
    7452                 :             :      */
    7453                 :        1735 :     START_CRIT_SECTION();
    7454                 :             : 
    7455         [ +  + ]:        1735 :     if (shutdown)
    7456                 :             :     {
    7457                 :         736 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    7458                 :         736 :         ControlFile->state = DB_SHUTDOWNING;
    7459                 :         736 :         UpdateControlFile();
    7460                 :         736 :         LWLockRelease(ControlFileLock);
    7461                 :             :     }
    7462                 :             : 
    7463                 :             :     /* Begin filling in the checkpoint WAL record */
    7464   [ +  -  +  -  :       22555 :     MemSet(&checkPoint, 0, sizeof(checkPoint));
          +  -  +  -  +  
                      + ]
    7465                 :        1735 :     checkPoint.time = (pg_time_t) time(NULL);
    7466                 :             : 
    7467                 :             :     /*
    7468                 :             :      * For Hot Standby, derive the oldestActiveXid before we fix the redo
    7469                 :             :      * pointer. This allows us to begin accumulating changes to assemble our
    7470                 :             :      * starting snapshot of locks and transactions.
    7471                 :             :      */
    7472   [ +  +  +  + ]:        1735 :     if (!shutdown && XLogStandbyInfoActive())
    7473                 :         957 :         checkPoint.oldestActiveXid = GetOldestActiveTransactionId(false, true);
    7474                 :             :     else
    7475                 :         778 :         checkPoint.oldestActiveXid = InvalidTransactionId;
    7476                 :             : 
    7477                 :             :     /*
    7478                 :             :      * Get location of last important record before acquiring insert locks (as
    7479                 :             :      * GetLastImportantRecPtr() also locks WAL locks).
    7480                 :             :      */
    7481                 :        1735 :     last_important_lsn = GetLastImportantRecPtr();
    7482                 :             : 
    7483                 :             :     /*
    7484                 :             :      * If this isn't a shutdown or forced checkpoint, and if there has been no
    7485                 :             :      * WAL activity requiring a checkpoint, skip it.  The idea here is to
    7486                 :             :      * avoid inserting duplicate checkpoints when the system is idle.
    7487                 :             :      */
    7488         [ +  + ]:        1735 :     if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
    7489                 :             :                   CHECKPOINT_FORCE)) == 0)
    7490                 :             :     {
    7491         [ -  + ]:         208 :         if (last_important_lsn == ControlFile->checkPoint)
    7492                 :             :         {
    7493                 :           0 :             END_CRIT_SECTION();
    7494         [ #  # ]:           0 :             ereport(DEBUG1,
    7495                 :             :                     (errmsg_internal("checkpoint skipped because system is idle")));
    7496                 :           0 :             return false;
    7497                 :             :         }
    7498                 :             :     }
    7499                 :             : 
    7500                 :             :     /*
    7501                 :             :      * An end-of-recovery checkpoint is created before anyone is allowed to
    7502                 :             :      * write WAL. To allow us to write the checkpoint record, temporarily
    7503                 :             :      * enable XLogInsertAllowed.
    7504                 :             :      */
    7505         [ +  + ]:        1735 :     if (flags & CHECKPOINT_END_OF_RECOVERY)
    7506                 :          30 :         oldXLogAllowed = LocalSetXLogInsertAllowed();
    7507                 :             : 
    7508                 :        1735 :     checkPoint.ThisTimeLineID = XLogCtl->InsertTimeLineID;
    7509         [ +  + ]:        1735 :     if (flags & CHECKPOINT_END_OF_RECOVERY)
    7510                 :          30 :         checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
    7511                 :             :     else
    7512                 :        1705 :         checkPoint.PrevTimeLineID = checkPoint.ThisTimeLineID;
    7513                 :             : 
    7514                 :             :     /*
    7515                 :             :      * We must block concurrent insertions while examining insert state.
    7516                 :             :      */
    7517                 :        1735 :     WALInsertLockAcquireExclusive();
    7518                 :             : 
    7519                 :        1735 :     checkPoint.fullPageWrites = Insert->fullPageWrites;
    7520                 :        1735 :     checkPoint.wal_level = wal_level;
    7521                 :             : 
    7522                 :             :     /*
    7523                 :             :      * Get the current data_checksum_version value from xlogctl, valid at the
    7524                 :             :      * time of the checkpoint.
    7525                 :             :      */
    7526                 :        1735 :     SpinLockAcquire(&XLogCtl->info_lck);
    7527                 :        1735 :     checkPoint.dataChecksumState = XLogCtl->data_checksum_version;
    7528                 :        1735 :     SpinLockRelease(&XLogCtl->info_lck);
    7529                 :             : 
    7530         [ +  + ]:        1735 :     if (shutdown)
    7531                 :             :     {
    7532                 :         736 :         XLogRecPtr  curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
    7533                 :             : 
    7534                 :             :         /*
    7535                 :             :          * Compute new REDO record ptr = location of next XLOG record.
    7536                 :             :          *
    7537                 :             :          * Since this is a shutdown checkpoint, there can't be any concurrent
    7538                 :             :          * WAL insertion.
    7539                 :             :          */
    7540         [ +  - ]:         736 :         freespace = INSERT_FREESPACE(curInsert);
    7541         [ -  + ]:         736 :         if (freespace == 0)
    7542                 :             :         {
    7543         [ #  # ]:           0 :             if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
    7544                 :           0 :                 curInsert += SizeOfXLogLongPHD;
    7545                 :             :             else
    7546                 :           0 :                 curInsert += SizeOfXLogShortPHD;
    7547                 :             :         }
    7548                 :         736 :         checkPoint.redo = curInsert;
    7549                 :             : 
    7550                 :             :         /*
    7551                 :             :          * Here we update the shared RedoRecPtr for future XLogInsert calls;
    7552                 :             :          * this must be done while holding all the insertion locks.
    7553                 :             :          *
    7554                 :             :          * Note: if we fail to complete the checkpoint, RedoRecPtr will be
    7555                 :             :          * left pointing past where it really needs to point.  This is okay;
    7556                 :             :          * the only consequence is that XLogInsert might back up whole buffers
    7557                 :             :          * that it didn't really need to.  We can't postpone advancing
    7558                 :             :          * RedoRecPtr because XLogInserts that happen while we are dumping
    7559                 :             :          * buffers must assume that their buffer changes are not included in
    7560                 :             :          * the checkpoint.
    7561                 :             :          */
    7562                 :         736 :         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
    7563                 :             :     }
    7564                 :             : 
    7565                 :             :     /*
    7566                 :             :      * Now we can release the WAL insertion locks, allowing other xacts to
    7567                 :             :      * proceed while we are flushing disk buffers.
    7568                 :             :      */
    7569                 :        1735 :     WALInsertLockRelease();
    7570                 :             : 
    7571                 :             :     /*
    7572                 :             :      * If this is an online checkpoint, we have not yet determined the redo
    7573                 :             :      * point. We do so now by inserting the special XLOG_CHECKPOINT_REDO
    7574                 :             :      * record; the LSN at which it starts becomes the new redo pointer. We
    7575                 :             :      * don't do this for a shutdown checkpoint, because in that case no WAL
    7576                 :             :      * can be written between the redo point and the insertion of the
    7577                 :             :      * checkpoint record itself, so the checkpoint record itself serves to
    7578                 :             :      * mark the redo point.
    7579                 :             :      */
    7580         [ +  + ]:        1735 :     if (!shutdown)
    7581                 :             :     {
    7582                 :             :         xl_checkpoint_redo redo_rec;
    7583                 :             : 
    7584                 :         999 :         WALInsertLockAcquire();
    7585                 :         999 :         redo_rec.wal_level = wal_level;
    7586                 :         999 :         SpinLockAcquire(&XLogCtl->info_lck);
    7587                 :         999 :         redo_rec.data_checksum_version = XLogCtl->data_checksum_version;
    7588                 :         999 :         SpinLockRelease(&XLogCtl->info_lck);
    7589                 :         999 :         WALInsertLockRelease();
    7590                 :             : 
    7591                 :             :         /* Include WAL level in record for WAL summarizer's benefit. */
    7592                 :         999 :         XLogBeginInsert();
    7593                 :         999 :         XLogRegisterData(&redo_rec, sizeof(xl_checkpoint_redo));
    7594                 :         999 :         (void) XLogInsert(RM_XLOG_ID, XLOG_CHECKPOINT_REDO);
    7595                 :             : 
    7596                 :             :         /*
    7597                 :             :          * XLogInsertRecord will have updated XLogCtl->Insert.RedoRecPtr in
    7598                 :             :          * shared memory and RedoRecPtr in backend-local memory, but we need
    7599                 :             :          * to copy that into the record that will be inserted when the
    7600                 :             :          * checkpoint is complete.
    7601                 :             :          */
    7602                 :         999 :         checkPoint.redo = RedoRecPtr;
    7603                 :             :     }
    7604                 :             : 
    7605                 :             :     /* Update the info_lck-protected copy of RedoRecPtr as well */
    7606                 :        1735 :     SpinLockAcquire(&XLogCtl->info_lck);
    7607                 :        1735 :     XLogCtl->RedoRecPtr = checkPoint.redo;
    7608                 :        1735 :     SpinLockRelease(&XLogCtl->info_lck);
    7609                 :             : 
    7610                 :             :     /*
    7611                 :             :      * If enabled, log checkpoint start.  We postpone this until now so as not
    7612                 :             :      * to log anything if we decided to skip the checkpoint.
    7613                 :             :      */
    7614         [ +  + ]:        1735 :     if (log_checkpoints)
    7615                 :        1401 :         LogCheckpointStart(flags, false);
    7616                 :             : 
    7617                 :        1735 :     INJECTION_POINT_CACHED("create-checkpoint-run", NULL);
    7618                 :             : 
    7619                 :             :     /* Update the process title */
    7620                 :        1735 :     update_checkpoint_display(flags, false, false);
    7621                 :             : 
    7622                 :             :     TRACE_POSTGRESQL_CHECKPOINT_START(flags);
    7623                 :             : 
    7624                 :             :     /*
    7625                 :             :      * Get the other info we need for the checkpoint record.
    7626                 :             :      *
    7627                 :             :      * We don't need to save oldestClogXid in the checkpoint, it only matters
    7628                 :             :      * for the short period in which clog is being truncated, and if we crash
    7629                 :             :      * during that we'll redo the clog truncation and fix up oldestClogXid
    7630                 :             :      * there.
    7631                 :             :      */
    7632                 :        1735 :     LWLockAcquire(XidGenLock, LW_SHARED);
    7633                 :        1735 :     checkPoint.nextXid = TransamVariables->nextXid;
    7634                 :        1735 :     checkPoint.oldestXid = TransamVariables->oldestXid;
    7635                 :        1735 :     checkPoint.oldestXidDB = TransamVariables->oldestXidDB;
    7636                 :        1735 :     LWLockRelease(XidGenLock);
    7637                 :             : 
    7638                 :        1735 :     LWLockAcquire(CommitTsLock, LW_SHARED);
    7639                 :        1735 :     checkPoint.oldestCommitTsXid = TransamVariables->oldestCommitTsXid;
    7640                 :        1735 :     checkPoint.newestCommitTsXid = TransamVariables->newestCommitTsXid;
    7641                 :        1735 :     LWLockRelease(CommitTsLock);
    7642                 :             : 
    7643                 :        1735 :     LWLockAcquire(OidGenLock, LW_SHARED);
    7644                 :        1735 :     checkPoint.nextOid = TransamVariables->nextOid;
    7645         [ +  + ]:        1735 :     if (!shutdown)
    7646                 :         999 :         checkPoint.nextOid += TransamVariables->oidCount;
    7647                 :        1735 :     LWLockRelease(OidGenLock);
    7648                 :             : 
    7649                 :        1735 :     checkPoint.logicalDecodingEnabled = IsLogicalDecodingEnabled();
    7650                 :             : 
    7651                 :        1735 :     MultiXactGetCheckptMulti(shutdown,
    7652                 :             :                              &checkPoint.nextMulti,
    7653                 :             :                              &checkPoint.nextMultiOffset,
    7654                 :             :                              &checkPoint.oldestMulti,
    7655                 :             :                              &checkPoint.oldestMultiDB);
    7656                 :             : 
    7657                 :             :     /*
    7658                 :             :      * Having constructed the checkpoint record, ensure all shmem disk buffers
    7659                 :             :      * and commit-log buffers are flushed to disk.
    7660                 :             :      *
    7661                 :             :      * This I/O could fail for various reasons.  If so, we will fail to
    7662                 :             :      * complete the checkpoint, but there is no reason to force a system
    7663                 :             :      * panic. Accordingly, exit critical section while doing it.
    7664                 :             :      */
    7665                 :        1735 :     END_CRIT_SECTION();
    7666                 :             : 
    7667                 :             :     /*
    7668                 :             :      * In some cases there are groups of actions that must all occur on one
    7669                 :             :      * side or the other of a checkpoint record. Before flushing the
    7670                 :             :      * checkpoint record we must explicitly wait for any backend currently
    7671                 :             :      * performing those groups of actions.
    7672                 :             :      *
    7673                 :             :      * One example is end of transaction, so we must wait for any transactions
    7674                 :             :      * that are currently in commit critical sections.  If an xact inserted
    7675                 :             :      * its commit record into XLOG just before the REDO point, then a crash
    7676                 :             :      * restart from the REDO point would not replay that record, which means
    7677                 :             :      * that our flushing had better include the xact's update of pg_xact.  So
    7678                 :             :      * we wait till he's out of his commit critical section before proceeding.
    7679                 :             :      * See notes in RecordTransactionCommit().
    7680                 :             :      *
    7681                 :             :      * Because we've already released the insertion locks, this test is a bit
    7682                 :             :      * fuzzy: it is possible that we will wait for xacts we didn't really need
    7683                 :             :      * to wait for.  But the delay should be short and it seems better to make
    7684                 :             :      * checkpoint take a bit longer than to hold off insertions longer than
    7685                 :             :      * necessary. (In fact, the whole reason we have this issue is that xact.c
    7686                 :             :      * does commit record XLOG insertion and clog update as two separate steps
    7687                 :             :      * protected by different locks, but again that seems best on grounds of
    7688                 :             :      * minimizing lock contention.)
    7689                 :             :      *
    7690                 :             :      * A transaction that has not yet set delayChkptFlags when we look cannot
    7691                 :             :      * be at risk, since it has not inserted its commit record yet; and one
    7692                 :             :      * that's already cleared it is not at risk either, since it's done fixing
    7693                 :             :      * clog and we will correctly flush the update below.  So we cannot miss
    7694                 :             :      * any xacts we need to wait for.
    7695                 :             :      */
    7696                 :        1735 :     vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);
    7697         [ +  + ]:        1735 :     if (nvxids > 0)
    7698                 :             :     {
    7699                 :             :         do
    7700                 :             :         {
    7701                 :             :             /*
    7702                 :             :              * Keep absorbing fsync requests while we wait. There could even
    7703                 :             :              * be a deadlock if we don't, if the process that prevents the
    7704                 :             :              * checkpoint is trying to add a request to the queue.
    7705                 :             :              */
    7706                 :          41 :             AbsorbSyncRequests();
    7707                 :             : 
    7708                 :          41 :             pgstat_report_wait_start(WAIT_EVENT_CHECKPOINT_DELAY_START);
    7709                 :          41 :             pg_usleep(10000L);  /* wait for 10 msec */
    7710                 :          41 :             pgstat_report_wait_end();
    7711         [ +  + ]:          41 :         } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
    7712                 :             :                                               DELAY_CHKPT_START));
    7713                 :             :     }
    7714                 :        1735 :     pfree(vxids);
    7715                 :             : 
    7716                 :        1735 :     CheckPointGuts(checkPoint.redo, flags);
    7717                 :             : 
    7718                 :        1735 :     vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE);
    7719         [ -  + ]:        1735 :     if (nvxids > 0)
    7720                 :             :     {
    7721                 :             :         do
    7722                 :             :         {
    7723                 :           0 :             AbsorbSyncRequests();
    7724                 :             : 
    7725                 :           0 :             pgstat_report_wait_start(WAIT_EVENT_CHECKPOINT_DELAY_COMPLETE);
    7726                 :           0 :             pg_usleep(10000L);  /* wait for 10 msec */
    7727                 :           0 :             pgstat_report_wait_end();
    7728         [ #  # ]:           0 :         } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
    7729                 :             :                                               DELAY_CHKPT_COMPLETE));
    7730                 :             :     }
    7731                 :        1735 :     pfree(vxids);
    7732                 :             : 
    7733                 :             :     /*
    7734                 :             :      * Take a snapshot of running transactions and write this to WAL. This
    7735                 :             :      * allows us to reconstruct the state of running transactions during
    7736                 :             :      * archive recovery, if required. Skip this if that info is disabled.
    7737                 :             :      *
    7738                 :             :      * If we are shutting down, or Startup process is completing crash
    7739                 :             :      * recovery we don't need to write running xact data.
    7740                 :             :      */
    7741   [ +  +  +  + ]:        1735 :     if (!shutdown && XLogStandbyInfoActive())
    7742                 :         957 :         LogStandbySnapshot();
    7743                 :             : 
    7744                 :        1735 :     START_CRIT_SECTION();
    7745                 :             : 
    7746                 :             :     /*
    7747                 :             :      * Now insert the checkpoint record into XLOG.
    7748                 :             :      */
    7749                 :        1735 :     XLogBeginInsert();
    7750                 :        1735 :     XLogRegisterData(&checkPoint, sizeof(checkPoint));
    7751         [ +  + ]:        1735 :     recptr = XLogInsert(RM_XLOG_ID,
    7752                 :             :                         shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
    7753                 :             :                         XLOG_CHECKPOINT_ONLINE);
    7754                 :             : 
    7755                 :        1735 :     XLogFlush(recptr);
    7756                 :             : 
    7757                 :             :     /*
    7758                 :             :      * We mustn't write any new WAL after a shutdown checkpoint, or it will be
    7759                 :             :      * overwritten at next startup.  No-one should even try, this just allows
    7760                 :             :      * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
    7761                 :             :      * to just temporarily disable writing until the system has exited
    7762                 :             :      * recovery.
    7763                 :             :      */
    7764         [ +  + ]:        1735 :     if (shutdown)
    7765                 :             :     {
    7766         [ +  + ]:         736 :         if (flags & CHECKPOINT_END_OF_RECOVERY)
    7767                 :          30 :             LocalXLogInsertAllowed = oldXLogAllowed;
    7768                 :             :         else
    7769                 :         706 :             LocalXLogInsertAllowed = 0; /* never again write WAL */
    7770                 :             :     }
    7771                 :             : 
    7772                 :             :     /*
    7773                 :             :      * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
    7774                 :             :      * = end of actual checkpoint record.
    7775                 :             :      */
    7776   [ +  +  -  + ]:        1735 :     if (shutdown && checkPoint.redo != ProcLastRecPtr)
    7777         [ #  # ]:           0 :         ereport(PANIC,
    7778                 :             :                 (errmsg("concurrent write-ahead log activity while database system is shutting down")));
    7779                 :             : 
    7780                 :             :     /*
    7781                 :             :      * Remember the prior checkpoint's redo ptr for
    7782                 :             :      * UpdateCheckPointDistanceEstimate()
    7783                 :             :      */
    7784                 :        1735 :     PriorRedoPtr = ControlFile->checkPointCopy.redo;
    7785                 :             : 
    7786                 :             :     /*
    7787                 :             :      * Update the control file.
    7788                 :             :      */
    7789                 :        1735 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    7790         [ +  + ]:        1735 :     if (shutdown)
    7791                 :         736 :         ControlFile->state = DB_SHUTDOWNED;
    7792                 :        1735 :     ControlFile->checkPoint = ProcLastRecPtr;
    7793                 :        1735 :     ControlFile->checkPointCopy = checkPoint;
    7794                 :             :     /* crash recovery should always recover to the end of WAL */
    7795                 :        1735 :     ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
    7796                 :        1735 :     ControlFile->minRecoveryPointTLI = 0;
    7797                 :             : 
    7798                 :             :     /*
    7799                 :             :      * Persist unloggedLSN value. It's reset on crash recovery, so this goes
    7800                 :             :      * unused on non-shutdown checkpoints, but seems useful to store it always
    7801                 :             :      * for debugging purposes.
    7802                 :             :      */
    7803                 :        1735 :     ControlFile->unloggedLSN = pg_atomic_read_membarrier_u64(&XLogCtl->unloggedLSN);
    7804                 :             : 
    7805                 :        1735 :     UpdateControlFile();
    7806                 :        1735 :     LWLockRelease(ControlFileLock);
    7807                 :             : 
    7808                 :             :     /*
    7809                 :             :      * We are now done with critical updates; no need for system panic if we
    7810                 :             :      * have trouble while fooling with old log segments.
    7811                 :             :      */
    7812                 :        1735 :     END_CRIT_SECTION();
    7813                 :             : 
    7814                 :             :     /*
    7815                 :             :      * WAL summaries end when the next XLOG_CHECKPOINT_REDO or
    7816                 :             :      * XLOG_CHECKPOINT_SHUTDOWN record is reached. This is the first point
    7817                 :             :      * where (a) we're not inside of a critical section and (b) we can be
    7818                 :             :      * certain that the relevant record has been flushed to disk, which must
    7819                 :             :      * happen before it can be summarized.
    7820                 :             :      *
    7821                 :             :      * If this is a shutdown checkpoint, then this happens reasonably
    7822                 :             :      * promptly: we've only just inserted and flushed the
    7823                 :             :      * XLOG_CHECKPOINT_SHUTDOWN record. If this is not a shutdown checkpoint,
    7824                 :             :      * then this might not be very prompt at all: the XLOG_CHECKPOINT_REDO
    7825                 :             :      * record was written before we began flushing data to disk, and that
    7826                 :             :      * could be many minutes ago at this point. However, we don't XLogFlush()
    7827                 :             :      * after inserting that record, so we're not guaranteed that it's on disk
    7828                 :             :      * until after the above call that flushes the XLOG_CHECKPOINT_ONLINE
    7829                 :             :      * record.
    7830                 :             :      */
    7831                 :        1735 :     WakeupWalSummarizer();
    7832                 :             : 
    7833                 :             :     /*
    7834                 :             :      * Let smgr do post-checkpoint cleanup (eg, deleting old files).
    7835                 :             :      */
    7836                 :        1735 :     SyncPostCheckpoint();
    7837                 :             : 
    7838                 :             :     /*
    7839                 :             :      * Update the average distance between checkpoints if the prior checkpoint
    7840                 :             :      * exists.
    7841                 :             :      */
    7842         [ +  - ]:        1735 :     if (XLogRecPtrIsValid(PriorRedoPtr))
    7843                 :        1735 :         UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
    7844                 :             : 
    7845                 :        1735 :     INJECTION_POINT("checkpoint-before-old-wal-removal", NULL);
    7846                 :             : 
    7847                 :             :     /*
    7848                 :             :      * Delete old log files, those no longer needed for last checkpoint to
    7849                 :             :      * prevent the disk holding the xlog from growing full.
    7850                 :             :      */
    7851                 :        1735 :     XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
    7852                 :        1735 :     KeepLogSeg(recptr, &_logSegNo);
    7853         [ +  + ]:        1735 :     if (InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_REMOVED | RS_INVAL_IDLE_TIMEOUT,
    7854                 :             :                                            _logSegNo, InvalidOid,
    7855                 :             :                                            InvalidTransactionId))
    7856                 :             :     {
    7857                 :             :         /*
    7858                 :             :          * Some slots have been invalidated; recalculate the old-segment
    7859                 :             :          * horizon, starting again from RedoRecPtr.
    7860                 :             :          */
    7861                 :           4 :         XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
    7862                 :           4 :         KeepLogSeg(recptr, &_logSegNo);
    7863                 :             :     }
    7864                 :        1735 :     _logSegNo--;
    7865                 :        1735 :     RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr,
    7866                 :             :                        checkPoint.ThisTimeLineID);
    7867                 :             : 
    7868                 :             :     /*
    7869                 :             :      * Make more log segments if needed.  (Do this after recycling old log
    7870                 :             :      * segments, since that may supply some of the needed files.)
    7871                 :             :      */
    7872         [ +  + ]:        1735 :     if (!shutdown)
    7873                 :         999 :         PreallocXlogFiles(recptr, checkPoint.ThisTimeLineID);
    7874                 :             : 
    7875                 :             :     /*
    7876                 :             :      * Truncate pg_subtrans if possible.  We can throw away all data before
    7877                 :             :      * the oldest XMIN of any running transaction.  No future transaction will
    7878                 :             :      * attempt to reference any pg_subtrans entry older than that (see Asserts
    7879                 :             :      * in subtrans.c).  During recovery, though, we mustn't do this because
    7880                 :             :      * StartupSUBTRANS hasn't been called yet.
    7881                 :             :      */
    7882         [ +  + ]:        1735 :     if (!RecoveryInProgress())
    7883                 :        1705 :         TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
    7884                 :             : 
    7885                 :             :     /* Real work is done; log and update stats. */
    7886                 :        1735 :     LogCheckpointEnd(false, flags);
    7887                 :             : 
    7888                 :             :     /* Reset the process title */
    7889                 :        1735 :     update_checkpoint_display(flags, false, true);
    7890                 :             : 
    7891                 :             :     TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
    7892                 :             :                                      NBuffers,
    7893                 :             :                                      CheckpointStats.ckpt_segs_added,
    7894                 :             :                                      CheckpointStats.ckpt_segs_removed,
    7895                 :             :                                      CheckpointStats.ckpt_segs_recycled);
    7896                 :             : 
    7897                 :        1735 :     return true;
    7898                 :             : }
    7899                 :             : 
    7900                 :             : /*
    7901                 :             :  * Mark the end of recovery in WAL though without running a full checkpoint.
    7902                 :             :  * We can expect that a restartpoint is likely to be in progress as we
    7903                 :             :  * do this, though we are unwilling to wait for it to complete.
    7904                 :             :  *
    7905                 :             :  * CreateRestartPoint() allows for the case where recovery may end before
    7906                 :             :  * the restartpoint completes so there is no concern of concurrent behaviour.
                         :             :  *
                         :             :  * The XLOG_END_OF_RECOVERY record carries the current timestamp, the
                         :             :  * current wal_level, and the insert and previous timeline IDs read from
                         :             :  * XLogCtl.  The record is inserted and flushed, and pg_control is then
                         :             :  * updated (minRecoveryPoint, minRecoveryPointTLI, data_checksum_version),
                         :             :  * all within one critical section.
                         :             :  *
                         :             :  * Raises elog(ERROR) if recovery is not in progress.  Returns nothing.
    7907                 :             :  */
    7908                 :             : static void
    7909                 :          52 : CreateEndOfRecoveryRecord(void)
    7910                 :             : {
    7911                 :             :     xl_end_of_recovery xlrec;
    7912                 :             :     XLogRecPtr  recptr;
    7913                 :             : 
    7914                 :             :     /* sanity check */
    7915         [ -  + ]:          52 :     if (!RecoveryInProgress())
    7916         [ #  # ]:           0 :         elog(ERROR, "can only be used to end recovery");
    7917                 :             : 
    7918                 :          52 :     xlrec.end_time = GetCurrentTimestamp();
    7919                 :          52 :     xlrec.wal_level = wal_level;
    7920                 :             : 
    7921                 :          52 :     WALInsertLockAcquireExclusive();    /* both timeline IDs are read while the insert locks are held */
    7922                 :          52 :     xlrec.ThisTimeLineID = XLogCtl->InsertTimeLineID;
    7923                 :          52 :     xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
    7924                 :          52 :     WALInsertLockRelease();
    7925                 :             : 
    7926                 :          52 :     START_CRIT_SECTION();
    7927                 :             : 
    7928                 :          52 :     XLogBeginInsert();
    7929                 :          52 :     XLogRegisterData(&xlrec, sizeof(xl_end_of_recovery));
    7930                 :          52 :     recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);
    7931                 :             : 
    7932                 :          52 :     XLogFlush(recptr);  /* flush the record before pg_control is changed below */
    7933                 :             : 
    7934                 :             :     /*
    7935                 :             :      * Update the control file so that crash recovery can follow the timeline
    7936                 :             :      * changes to this point.
    7937                 :             :      */
    7938                 :          52 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    7939                 :          52 :     ControlFile->minRecoveryPoint = recptr;
    7940                 :          52 :     ControlFile->minRecoveryPointTLI = xlrec.ThisTimeLineID;
    7941                 :             : 
    7942                 :             :     /* start with the latest checksum version (as of the end of recovery) */
    7943                 :          52 :     SpinLockAcquire(&XLogCtl->info_lck);    /* the shared value is read under info_lck */
    7944                 :          52 :     ControlFile->data_checksum_version = XLogCtl->data_checksum_version;
    7945                 :          52 :     SpinLockRelease(&XLogCtl->info_lck);
    7946                 :             : 
    7947                 :          52 :     UpdateControlFile();    /* called while ControlFileLock is still held */
    7948                 :          52 :     LWLockRelease(ControlFileLock);
    7949                 :             : 
    7950                 :          52 :     END_CRIT_SECTION();
    7951                 :          52 : }
    7952                 :             : 
    7953                 :             : /*
    7954                 :             :  * Write an OVERWRITE_CONTRECORD message.
    7955                 :             :  *
    7956                 :             :  * When on WAL replay we expect a continuation record at the start of a page
    7957                 :             :  * that is not there, recovery ends and WAL writing resumes at that point.
    7958                 :             :  * But it's wrong to resume writing new WAL back at the start of the record
    7959                 :             :  * that was broken, because downstream consumers of that WAL (physical
    7960                 :             :  * replicas) are not prepared to "rewind".  So the first action after
    7961                 :             :  * finishing replay of all valid WAL must be to write a record of this type
    7962                 :             :  * at the point where the contrecord was missing; to support xlogreader
    7963                 :             :  * detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added
    7964                 :             :  * to the page header where the record occurs.  xlogreader has an ad-hoc
    7965                 :             :  * mechanism to report metadata about the broken record, which is what we
    7966                 :             :  * use here.
    7967                 :             :  *
    7968                 :             :  * At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to
    7969                 :             :  * skip the record it was reading, and pass back the LSN of the skipped
    7970                 :             :  * record, so that its caller can verify (on "replay" of that record) that the
    7971                 :             :  * XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten.
    7972                 :             :  *
    7973                 :             :  * 'aborted_lsn' is the beginning position of the record that was incomplete.
    7974                 :             :  * It is included in the WAL record.  'pagePtr' and 'newTLI' point to the
    7975                 :             :  * beginning of the XLOG page where the record is to be inserted.  They must
    7976                 :             :  * match the current WAL insert position, they're passed here just so that we
    7977                 :             :  * can verify that.
                         :             :  *
                         :             :  * Raises elog(ERROR) if recovery is not in progress, if 'pagePtr' is not a
                         :             :  * multiple of XLOG_BLCKSZ, or if the current insert position is not
                         :             :  * immediately after the page header at 'pagePtr'.  Returns the pointer
                         :             :  * handed back by XLogInsert() for the new record, after WAL has been
                         :             :  * flushed up to it.
                         :             :  *
                         :             :  * NOTE(review): the final position check (ProcLastRecPtr vs. the expected
                         :             :  * start) runs between START_CRIT_SECTION() and END_CRIT_SECTION(); confirm
                         :             :  * how elog(ERROR) is treated inside a critical section.
    7978                 :             :  */
    7979                 :             : static XLogRecPtr
    7980                 :          11 : CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr,
    7981                 :             :                                 TimeLineID newTLI)
    7982                 :             : {
    7983                 :             :     xl_overwrite_contrecord xlrec;
    7984                 :             :     XLogRecPtr  recptr;
    7985                 :             :     XLogPageHeader pagehdr;
    7986                 :             :     XLogRecPtr  startPos;
    7987                 :             : 
    7988                 :             :     /* sanity checks */
    7989         [ -  + ]:          11 :     if (!RecoveryInProgress())
    7990         [ #  # ]:           0 :         elog(ERROR, "can only be used at end of recovery");
    7991         [ -  + ]:          11 :     if (pagePtr % XLOG_BLCKSZ != 0)
    7992         [ #  # ]:           0 :         elog(ERROR, "invalid position for missing continuation record %X/%08X",
    7993                 :             :              LSN_FORMAT_ARGS(pagePtr));
    7994                 :             : 
    7995                 :             :     /* The current WAL insert position should be right after the page header */
    7996                 :          11 :     startPos = pagePtr;
    7997         [ +  + ]:          11 :     if (XLogSegmentOffset(startPos, wal_segment_size) == 0)
    7998                 :           1 :         startPos += SizeOfXLogLongPHD;  /* page sits at segment offset 0: skip the long header size */
    7999                 :             :     else
    8000                 :          10 :         startPos += SizeOfXLogShortPHD; /* any other page: skip the short header size */
    8001                 :          11 :     recptr = GetXLogInsertRecPtr();
    8002         [ -  + ]:          11 :     if (recptr != startPos)
    8003         [ #  # ]:           0 :         elog(ERROR, "invalid WAL insert position %X/%08X for OVERWRITE_CONTRECORD",
    8004                 :             :              LSN_FORMAT_ARGS(recptr));
    8005                 :             : 
    8006                 :          11 :     START_CRIT_SECTION();
    8007                 :             : 
    8008                 :             :     /*
    8009                 :             :      * Initialize the XLOG page header (by GetXLogBuffer), and set the
    8010                 :             :      * XLP_FIRST_IS_OVERWRITE_CONTRECORD flag.
    8011                 :             :      *
    8012                 :             :      * No other backend is allowed to write WAL yet, so acquiring the WAL
    8013                 :             :      * insertion lock is just pro forma.
    8014                 :             :      */
    8015                 :          11 :     WALInsertLockAcquire();
    8016                 :          11 :     pagehdr = (XLogPageHeader) GetXLogBuffer(pagePtr, newTLI);
    8017                 :          11 :     pagehdr->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD;
    8018                 :          11 :     WALInsertLockRelease();
    8019                 :             : 
    8020                 :             :     /*
    8021                 :             :      * Insert the XLOG_OVERWRITE_CONTRECORD record as the first record on the
    8022                 :             :      * page.  We know it becomes the first record, because no other backend is
    8023                 :             :      * allowed to write WAL yet.
    8024                 :             :      */
    8025                 :          11 :     XLogBeginInsert();
    8026                 :          11 :     xlrec.overwritten_lsn = aborted_lsn;
    8027                 :          11 :     xlrec.overwrite_time = GetCurrentTimestamp();
    8028                 :          11 :     XLogRegisterData(&xlrec, sizeof(xl_overwrite_contrecord));
    8029                 :          11 :     recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD);    /* recptr is reused; it now holds XLogInsert()'s result */
    8030                 :             : 
    8031                 :             :     /* check that the record was inserted to the right place */
    8032         [ -  + ]:          11 :     if (ProcLastRecPtr != startPos)
    8033         [ #  # ]:           0 :         elog(ERROR, "OVERWRITE_CONTRECORD was inserted to unexpected position %X/%08X",
    8034                 :             :              LSN_FORMAT_ARGS(ProcLastRecPtr));
    8035                 :             : 
    8036                 :          11 :     XLogFlush(recptr);
    8037                 :             : 
    8038                 :          11 :     END_CRIT_SECTION();
    8039                 :             : 
    8040                 :          11 :     return recptr;
    8041                 :             : }
    8042                 :             : 
    8043                 :             : /*
    8044                 :             :  * Flush all data in shared memory to disk, and fsync
    8045                 :             :  *
    8046                 :             :  * This is the common code shared between regular checkpoints and
    8047                 :             :  * recovery restartpoints.
                         :             :  *
                         :             :  * 'checkPointRedo' is passed through only to CheckPointTwoPhase(), which
                         :             :  * runs last.  'flags' is handed to CheckPointBuffers() and to the
                         :             :  * buffer-checkpoint start trace probe; its CHECKPOINT_IS_SHUTDOWN bit is
                         :             :  * what CheckPointReplicationSlots() receives.
                         :             :  *
                         :             :  * Along the way the write-start, sync-start and sync-end timestamps are
                         :             :  * stored in CheckpointStats (ckpt_write_t, ckpt_sync_t, ckpt_sync_end_t).
    8048                 :             :  */
    8049                 :             : static void
    8050                 :        1944 : CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
    8051                 :             : {
    8052                 :        1944 :     CheckPointRelationMap();
    8053                 :        1944 :     CheckPointReplicationSlots(flags & CHECKPOINT_IS_SHUTDOWN);    /* argument is nonzero only when the shutdown bit is set */
    8054                 :        1944 :     CheckPointSnapBuild();
    8055                 :        1944 :     CheckPointLogicalRewriteHeap();
    8056                 :        1944 :     CheckPointReplicationOrigin();
    8057                 :             : 
    8058                 :             :     /* Write out all dirty data in SLRUs and the main buffer pool */
    8059                 :             :     TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
    8060                 :        1944 :     CheckpointStats.ckpt_write_t = GetCurrentTimestamp();  /* start of the write phase */
    8061                 :        1944 :     CheckPointCLOG();
    8062                 :        1944 :     CheckPointCommitTs();
    8063                 :        1944 :     CheckPointSUBTRANS();
    8064                 :        1944 :     CheckPointMultiXact();
    8065                 :        1944 :     CheckPointPredicate();
    8066                 :        1944 :     CheckPointBuffers(flags);
    8067                 :             : 
    8068                 :             :     /* Perform all queued up fsyncs */
    8069                 :             :     TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
    8070                 :        1944 :     CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();   /* taken just before ProcessSyncRequests() */
    8071                 :        1944 :     ProcessSyncRequests();
    8072                 :        1944 :     CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();   /* taken right after it returns */
    8073                 :             :     TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
    8074                 :             : 
    8075                 :             :     /* We deliberately delay 2PC checkpointing as long as possible */
    8076                 :        1944 :     CheckPointTwoPhase(checkPointRedo);
    8077                 :        1944 : }
    8078                 :             : 
    8079                 :             : /*
    8080                 :             :  * Save a checkpoint for recovery restart if appropriate
    8081                 :             :  *
    8082                 :             :  * This function is called each time a checkpoint record is read from XLOG.
    8083                 :             :  * It must determine whether the checkpoint represents a safe restartpoint or
    8084                 :             :  * not.  If so, the checkpoint record is stashed in shared memory so that
    8085                 :             :  * CreateRestartPoint can consult it.  (Note that the latter function is
    8086                 :             :  * executed by the checkpointer, while this one will be executed by the
    8087                 :             :  * startup process.)
                         :             :  *
                         :             :  * 'checkPoint' is copied by value into XLogCtl->lastCheckPoint; 'record'
                         :             :  * supplies ReadRecPtr and EndRecPtr, which are saved as
                         :             :  * lastCheckPointRecPtr and lastCheckPointEndPtr.  When
                         :             :  * XLogHaveInvalidPages() reports true, nothing is stored and only a DEBUG2
                         :             :  * message is emitted.
    8088                 :             :  */
    8089                 :             : static void
    8090                 :         755 : RecoveryRestartPoint(const CheckPoint *checkPoint, XLogReaderState *record)
    8091                 :             : {
    8092                 :             :     /*
    8093                 :             :      * Refrain from creating a restartpoint if we have seen any
    8094                 :             :      * references to non-existent pages. Restarting recovery from the
    8095                 :             :      * restartpoint would not see the references, so we would lose the
    8096                 :             :      * cross-check that the pages belonged to a relation that was dropped
    8097                 :             :      * later.
    8098                 :             :      */
    8099         [ -  + ]:         755 :     if (XLogHaveInvalidPages())
    8100                 :             :     {
    8101         [ #  # ]:           0 :         elog(DEBUG2,
    8102                 :             :              "could not record restart point at %X/%08X because there are unresolved references to invalid pages",
    8103                 :             :              LSN_FORMAT_ARGS(checkPoint->redo));
    8104                 :           0 :         return;
    8105                 :             :     }
    8106                 :             : 
    8107                 :             :     /*
    8108                 :             :      * Copy the checkpoint record to shared memory, so that checkpointer can
    8109                 :             :      * work out the next time it wants to perform a restartpoint.
    8110                 :             :      */
    8111                 :         755 :     SpinLockAcquire(&XLogCtl->info_lck);    /* all three fields are written under info_lck */
    8112                 :         755 :     XLogCtl->lastCheckPointRecPtr = record->ReadRecPtr;
    8113                 :         755 :     XLogCtl->lastCheckPointEndPtr = record->EndRecPtr;
    8114                 :         755 :     XLogCtl->lastCheckPoint = *checkPoint;
    8115                 :         755 :     SpinLockRelease(&XLogCtl->info_lck);
    8116                 :             : }
    8117                 :             : 
    8118                 :             : /*
    8119                 :             :  * Establish a restartpoint if possible.
    8120                 :             :  *
    8121                 :             :  * This is similar to CreateCheckPoint, but is used during WAL recovery
    8122                 :             :  * to establish a point from which recovery can roll forward without
    8123                 :             :  * replaying the entire recovery log.
    8124                 :             :  *
    8125                 :             :  * Returns true if a new restartpoint was established. We can only establish
    8126                 :             :  * a restartpoint if we have replayed a safe checkpoint record since last
    8127                 :             :  * restartpoint.
    8128                 :             :  */
    8129                 :             : bool
    8130                 :         626 : CreateRestartPoint(int flags)
    8131                 :             : {
    8132                 :             :     XLogRecPtr  lastCheckPointRecPtr;
    8133                 :             :     XLogRecPtr  lastCheckPointEndPtr;
    8134                 :             :     CheckPoint  lastCheckPoint;
    8135                 :             :     XLogRecPtr  PriorRedoPtr;
    8136                 :             :     XLogRecPtr  receivePtr;
    8137                 :             :     XLogRecPtr  replayPtr;
    8138                 :             :     TimeLineID  replayTLI;
    8139                 :             :     XLogRecPtr  endptr;
    8140                 :             :     XLogSegNo   _logSegNo;
    8141                 :             :     TimestampTz xtime;
    8142                 :             : 
    8143                 :             :     /* Concurrent checkpoint/restartpoint cannot happen */
    8144                 :             :     Assert(!IsUnderPostmaster || MyBackendType == B_CHECKPOINTER);
    8145                 :             : 
    8146                 :             :     /* Get a local copy of the last safe checkpoint record. */
    8147                 :         626 :     SpinLockAcquire(&XLogCtl->info_lck);
    8148                 :         626 :     lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
    8149                 :         626 :     lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
    8150                 :         626 :     lastCheckPoint = XLogCtl->lastCheckPoint;
    8151                 :         626 :     SpinLockRelease(&XLogCtl->info_lck);
    8152                 :             : 
    8153                 :             :     /*
    8154                 :             :      * Check that we're still in recovery mode. It's ok if we exit recovery
    8155                 :             :      * mode after this check, the restart point is valid anyway.
    8156                 :             :      */
    8157         [ -  + ]:         626 :     if (!RecoveryInProgress())
    8158                 :             :     {
    8159         [ #  # ]:           0 :         ereport(DEBUG2,
    8160                 :             :                 (errmsg_internal("skipping restartpoint, recovery has already ended")));
    8161                 :           0 :         return false;
    8162                 :             :     }
    8163                 :             : 
    8164                 :             :     /*
    8165                 :             :      * If the last checkpoint record we've replayed is already our last
    8166                 :             :      * restartpoint, we can't perform a new restart point. We still update
    8167                 :             :      * minRecoveryPoint in that case, so that if this is a shutdown restart
    8168                 :             :      * point, we won't start up earlier than before. That's not strictly
    8169                 :             :      * necessary, but when hot standby is enabled, it would be rather weird if
    8170                 :             :      * the database opened up for read-only connections at a point-in-time
    8171                 :             :      * before the last shutdown. Such time travel is still possible in case of
    8172                 :             :      * immediate shutdown, though.
    8173                 :             :      *
    8174                 :             :      * We don't explicitly advance minRecoveryPoint when we do create a
    8175                 :             :      * restartpoint. It's assumed that flushing the buffers will do that as a
    8176                 :             :      * side-effect.
    8177                 :             :      */
    8178         [ +  + ]:         626 :     if (!XLogRecPtrIsValid(lastCheckPointRecPtr) ||
    8179         [ +  + ]:         295 :         lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
    8180                 :             :     {
    8181         [ -  + ]:         417 :         ereport(DEBUG2,
    8182                 :             :                 errmsg_internal("skipping restartpoint, already performed at %X/%08X",
    8183                 :             :                                 LSN_FORMAT_ARGS(lastCheckPoint.redo)));
    8184                 :             : 
    8185                 :         417 :         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
    8186         [ +  + ]:         417 :         if (flags & CHECKPOINT_IS_SHUTDOWN)
    8187                 :             :         {
    8188                 :          39 :             LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    8189                 :          39 :             ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
    8190                 :          39 :             UpdateControlFile();
    8191                 :          39 :             LWLockRelease(ControlFileLock);
    8192                 :             :         }
    8193                 :         417 :         return false;
    8194                 :             :     }
    8195                 :             : 
    8196                 :             :     /*
    8197                 :             :      * Update the shared RedoRecPtr so that the startup process can calculate
    8198                 :             :      * the number of segments replayed since last restartpoint, and request a
    8199                 :             :      * restartpoint if it exceeds CheckPointSegments.
    8200                 :             :      *
    8201                 :             :      * Like in CreateCheckPoint(), hold off insertions to update it, although
    8202                 :             :      * during recovery this is just pro forma, because no WAL insertions are
    8203                 :             :      * happening.
    8204                 :             :      */
    8205                 :         209 :     WALInsertLockAcquireExclusive();
    8206                 :         209 :     RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
    8207                 :         209 :     WALInsertLockRelease();
    8208                 :             : 
    8209                 :             :     /* Also update the info_lck-protected copy */
    8210                 :         209 :     SpinLockAcquire(&XLogCtl->info_lck);
    8211                 :         209 :     XLogCtl->RedoRecPtr = lastCheckPoint.redo;
    8212                 :         209 :     SpinLockRelease(&XLogCtl->info_lck);
    8213                 :             : 
    8214                 :             :     /*
    8215                 :             :      * Prepare to accumulate statistics.
    8216                 :             :      *
    8217                 :             :      * Note: because it is possible for log_checkpoints to change while a
    8218                 :             :      * checkpoint proceeds, we always accumulate stats, even if
    8219                 :             :      * log_checkpoints is currently off.
    8220                 :             :      */
    8221   [ +  -  +  -  :        2299 :     MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
          +  -  +  -  +  
                      + ]
    8222                 :         209 :     CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
    8223                 :             : 
    8224         [ +  - ]:         209 :     if (log_checkpoints)
    8225                 :         209 :         LogCheckpointStart(flags, true);
    8226                 :             : 
    8227                 :             :     /* Update the process title */
    8228                 :         209 :     update_checkpoint_display(flags, true, false);
    8229                 :             : 
    8230                 :         209 :     CheckPointGuts(lastCheckPoint.redo, flags);
    8231                 :             : 
    8232                 :             :     /*
    8233                 :             :      * This location needs to be after CheckPointGuts() to ensure that some
    8234                 :             :      * work has already happened during this checkpoint.
    8235                 :             :      */
    8236                 :         209 :     INJECTION_POINT("create-restart-point", NULL);
    8237                 :             : 
    8238                 :             :     /*
    8239                 :             :      * Remember the prior checkpoint's redo ptr for
    8240                 :             :      * UpdateCheckPointDistanceEstimate()
    8241                 :             :      */
    8242                 :         209 :     PriorRedoPtr = ControlFile->checkPointCopy.redo;
    8243                 :             : 
    8244                 :             :     /*
    8245                 :             :      * Update pg_control, using current time.  Check that it still shows an
    8246                 :             :      * older checkpoint, else do nothing; this is a quick hack to make sure
    8247                 :             :      * nothing really bad happens if somehow we get here after the
    8248                 :             :      * end-of-recovery checkpoint.
    8249                 :             :      */
    8250                 :         209 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    8251         [ +  - ]:         209 :     if (ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
    8252                 :             :     {
    8253                 :             :         /*
    8254                 :             :          * Update the checkpoint information.  We do this even if the cluster
    8255                 :             :          * does not show DB_IN_ARCHIVE_RECOVERY to match with the set of WAL
    8256                 :             :          * segments recycled below.
    8257                 :             :          */
    8258                 :         209 :         ControlFile->checkPoint = lastCheckPointRecPtr;
    8259                 :         209 :         ControlFile->checkPointCopy = lastCheckPoint;
    8260                 :             : 
    8261                 :             :         /*
    8262                 :             :          * Ensure minRecoveryPoint is past the checkpoint record and update it
    8263                 :             :          * if the control file still shows DB_IN_ARCHIVE_RECOVERY.  Normally,
    8264                 :             :          * this will have happened already while writing out dirty buffers,
    8265                 :             :          * but not necessarily - e.g. because no buffers were dirtied.  We do
    8266                 :             :          * this because a backup performed in recovery uses minRecoveryPoint
    8267                 :             :          * to determine which WAL files must be included in the backup, and
    8268                 :             :          * the file (or files) containing the checkpoint record must be
    8269                 :             :          * included, at a minimum.  Note that for an ordinary restart of
    8270                 :             :          * recovery there's no value in having the minimum recovery point any
    8271                 :             :          * earlier than this anyway, because redo will begin just after the
    8272                 :             :          * checkpoint record.
    8273                 :             :          */
    8274         [ +  + ]:         209 :         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
    8275                 :             :         {
    8276         [ +  + ]:         208 :             if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
    8277                 :             :             {
    8278                 :          19 :                 ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
    8279                 :          19 :                 ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
    8280                 :             : 
    8281                 :             :                 /* update local copy */
    8282                 :          19 :                 LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    8283                 :          19 :                 LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    8284                 :             :             }
    8285         [ +  + ]:         208 :             if (flags & CHECKPOINT_IS_SHUTDOWN)
    8286                 :          24 :                 ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
    8287                 :             :         }
    8288                 :             : 
    8289                 :             :         /* we shall start with the latest checksum version */
    8290                 :         209 :         ControlFile->data_checksum_version = lastCheckPoint.dataChecksumState;
    8291                 :             : 
    8292                 :         209 :         UpdateControlFile();
    8293                 :             :     }
    8294                 :         209 :     LWLockRelease(ControlFileLock);
    8295                 :             : 
    8296                 :             :     /*
    8297                 :             :      * Update the average distance between checkpoints/restartpoints if the
    8298                 :             :      * prior checkpoint exists.
    8299                 :             :      */
    8300         [ +  - ]:         209 :     if (XLogRecPtrIsValid(PriorRedoPtr))
    8301                 :         209 :         UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
    8302                 :             : 
    8303                 :             :     /*
    8304                 :             :      * Delete old log files, those no longer needed for last restartpoint to
    8305                 :             :      * prevent the disk holding the xlog from growing full.
    8306                 :             :      */
    8307                 :         209 :     XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
    8308                 :             : 
    8309                 :             :     /*
    8310                 :             :      * Retreat _logSegNo using the current end of xlog replayed or received,
    8311                 :             :      * whichever is later.
    8312                 :             :      */
    8313                 :         209 :     receivePtr = GetWalRcvFlushRecPtr(NULL, NULL);
    8314                 :         209 :     replayPtr = GetXLogReplayRecPtr(&replayTLI);
    8315                 :         209 :     endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
    8316                 :         209 :     KeepLogSeg(endptr, &_logSegNo);
    8317                 :             : 
    8318                 :         209 :     INJECTION_POINT("restartpoint-before-slot-invalidation", NULL);
    8319                 :             : 
    8320         [ +  + ]:         209 :     if (InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_REMOVED | RS_INVAL_IDLE_TIMEOUT,
    8321                 :             :                                            _logSegNo, InvalidOid,
    8322                 :             :                                            InvalidTransactionId))
    8323                 :             :     {
    8324                 :             :         /*
    8325                 :             :          * Some slots have been invalidated; recalculate the old-segment
    8326                 :             :          * horizon, starting again from RedoRecPtr.
    8327                 :             :          */
    8328                 :           1 :         XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
    8329                 :           1 :         KeepLogSeg(endptr, &_logSegNo);
    8330                 :             :     }
    8331                 :         209 :     _logSegNo--;
    8332                 :             : 
    8333                 :             :     /*
    8334                 :             :      * Try to recycle segments on a useful timeline. If we've been promoted
    8335                 :             :      * since the beginning of this restartpoint, use the new timeline chosen
    8336                 :             :      * at end of recovery.  If we're still in recovery, use the timeline we're
    8337                 :             :      * currently replaying.
    8338                 :             :      *
    8339                 :             :      * There is no guarantee that the WAL segments will be useful on the
    8340                 :             :      * current timeline; if recovery proceeds to a new timeline right after
    8341                 :             :      * this, the pre-allocated WAL segments on this timeline will not be used,
    8342                 :             :      * and will go wasted until recycled on the next restartpoint. We'll live
    8343                 :             :      * with that.
    8344                 :             :      */
    8345         [ +  + ]:         209 :     if (!RecoveryInProgress())
    8346                 :           1 :         replayTLI = XLogCtl->InsertTimeLineID;
    8347                 :             : 
    8348                 :         209 :     RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr, replayTLI);
    8349                 :             : 
    8350                 :             :     /*
    8351                 :             :      * Make more log segments if needed.  (Do this after recycling old log
    8352                 :             :      * segments, since that may supply some of the needed files.)
    8353                 :             :      */
    8354                 :         209 :     PreallocXlogFiles(endptr, replayTLI);
    8355                 :             : 
    8356                 :             :     /*
    8357                 :             :      * Truncate pg_subtrans if possible.  We can throw away all data before
    8358                 :             :      * the oldest XMIN of any running transaction.  No future transaction will
    8359                 :             :      * attempt to reference any pg_subtrans entry older than that (see Asserts
    8360                 :             :      * in subtrans.c).  When hot standby is disabled, though, we mustn't do
    8361                 :             :      * this because StartupSUBTRANS hasn't been called yet.
    8362                 :             :      */
    8363         [ +  - ]:         209 :     if (EnableHotStandby)
    8364                 :         209 :         TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
    8365                 :             : 
    8366                 :             :     /* Real work is done; log and update stats. */
    8367                 :         209 :     LogCheckpointEnd(true, flags);
    8368                 :             : 
    8369                 :             :     /* Reset the process title */
    8370                 :         209 :     update_checkpoint_display(flags, true, true);
    8371                 :             : 
    8372                 :         209 :     xtime = GetLatestXTime();
    8373   [ +  -  +  -  :         209 :     ereport((log_checkpoints ? LOG : DEBUG2),
                   +  + ]
    8374                 :             :             errmsg("recovery restart point at %X/%08X",
    8375                 :             :                    LSN_FORMAT_ARGS(lastCheckPoint.redo)),
    8376                 :             :             xtime ? errdetail("Last completed transaction was at log time %s.",
    8377                 :             :                               timestamptz_to_str(xtime)) : 0);
    8378                 :             : 
    8379                 :             :     /*
    8380                 :             :      * Finally, execute archive_cleanup_command, if any.
    8381                 :             :      */
    8382   [ +  -  -  + ]:         209 :     if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != 0)
    8383                 :           0 :         ExecuteRecoveryCommand(archiveCleanupCommand,
    8384                 :             :                                "archive_cleanup_command",
    8385                 :             :                                false,
    8386                 :             :                                WAIT_EVENT_ARCHIVE_CLEANUP_COMMAND);
    8387                 :             : 
    8388                 :         209 :     return true;
    8389                 :             : }
    8390                 :             : 
    8391                 :             : /*
    8392                 :             :  * Report availability of WAL for the given target LSN
    8393                 :             :  *      (typically a slot's restart_lsn)
    8394                 :             :  *
    8395                 :             :  * Returns one of the following enum values:
    8396                 :             :  *
    8397                 :             :  * * WALAVAIL_RESERVED means targetLSN is available and it is in the range of
    8398                 :             :  *   max_wal_size.
    8399                 :             :  *
    8400                 :             :  * * WALAVAIL_EXTENDED means it is still available by preserving extra
    8401                 :             :  *   segments beyond max_wal_size. If max_slot_wal_keep_size is smaller
    8402                 :             :  *   than max_wal_size, this state is not returned.
    8403                 :             :  *
    8404                 :             :  * * WALAVAIL_UNRESERVED means it is being lost and the next checkpoint will
    8405                 :             :  *   remove reserved segments. The walsender using this slot may return to the
    8406                 :             :  *   above.
    8407                 :             :  *
    8408                 :             :  * * WALAVAIL_REMOVED means it has been removed. A replication stream on
    8409                 :             :  *   a slot with this LSN cannot continue.  (Any associated walsender
    8410                 :             :  *   processes should have been terminated already.)
    8411                 :             :  *
    8412                 :             :  * * WALAVAIL_INVALID_LSN means the slot hasn't been set to reserve WAL.
    8413                 :             :  */
    8414                 :             : WALAvailability
    8415                 :         635 : GetWALAvailability(XLogRecPtr targetLSN)
    8416                 :             : {
    8417                 :             :     XLogRecPtr  currpos;        /* current write LSN */
    8418                 :             :     XLogSegNo   currSeg;        /* segid of currpos */
    8419                 :             :     XLogSegNo   targetSeg;      /* segid of targetLSN */
    8420                 :             :     XLogSegNo   oldestSeg;      /* actual oldest segid */
    8421                 :             :     XLogSegNo   oldestSegMaxWalSize;    /* oldest segid kept by max_wal_size */
    8422                 :             :     XLogSegNo   oldestSlotSeg;  /* oldest segid kept by slot */
    8423                 :             :     uint64      keepSegs;
    8424                 :             : 
    8425                 :             :     /*
    8426                 :             :      * slot does not reserve WAL. Either deactivated, or has never been active
    8427                 :             :      */
    8428         [ +  + ]:         635 :     if (!XLogRecPtrIsValid(targetLSN))
    8429                 :          29 :         return WALAVAIL_INVALID_LSN;
    8430                 :             : 
    8431                 :             :     /*
    8432                 :             :      * Calculate the oldest segment currently reserved by all slots,
    8433                 :             :      * considering wal_keep_size and max_slot_wal_keep_size.  Initialize
    8434                 :             :      * oldestSlotSeg to the current segment.
    8435                 :             :      */
    8436                 :         606 :     currpos = GetXLogWriteRecPtr();
    8437                 :         606 :     XLByteToSeg(currpos, oldestSlotSeg, wal_segment_size);
    8438                 :         606 :     KeepLogSeg(currpos, &oldestSlotSeg);
    8439                 :             : 
    8440                 :             :     /*
    8441                 :             :      * Find the oldest extant segment file. We get 1 until checkpoint removes
    8442                 :             :      * the first WAL segment file since startup, which causes the status being
    8443                 :             :      * wrong under certain abnormal conditions but that doesn't actually harm.
    8444                 :             :      */
    8445                 :         606 :     oldestSeg = XLogGetLastRemovedSegno() + 1;
    8446                 :             : 
    8447                 :             :     /* calculate oldest segment by max_wal_size */
    8448                 :         606 :     XLByteToSeg(currpos, currSeg, wal_segment_size);
    8449                 :         606 :     keepSegs = ConvertToXSegs(max_wal_size_mb, wal_segment_size) + 1;
    8450                 :             : 
    8451         [ +  + ]:         606 :     if (currSeg > keepSegs)
    8452                 :          13 :         oldestSegMaxWalSize = currSeg - keepSegs;
    8453                 :             :     else
    8454                 :         593 :         oldestSegMaxWalSize = 1;
    8455                 :             : 
    8456                 :             :     /* the segment we care about */
    8457                 :         606 :     XLByteToSeg(targetLSN, targetSeg, wal_segment_size);
    8458                 :             : 
    8459                 :             :     /*
    8460                 :             :      * No point in returning reserved or extended status values if the
    8461                 :             :      * targetSeg is known to be lost.
    8462                 :             :      */
    8463         [ +  + ]:         606 :     if (targetSeg >= oldestSlotSeg)
    8464                 :             :     {
    8465                 :             :         /* show "reserved" when targetSeg is within max_wal_size */
    8466         [ +  + ]:         605 :         if (targetSeg >= oldestSegMaxWalSize)
    8467                 :         603 :             return WALAVAIL_RESERVED;
    8468                 :             : 
    8469                 :             :         /* being retained by slots exceeding max_wal_size */
    8470                 :           2 :         return WALAVAIL_EXTENDED;
    8471                 :             :     }
    8472                 :             : 
    8473                 :             :     /* WAL segments are no longer retained but haven't been removed yet */
    8474         [ +  - ]:           1 :     if (targetSeg >= oldestSeg)
    8475                 :           1 :         return WALAVAIL_UNRESERVED;
    8476                 :             : 
    8477                 :             :     /* Definitely lost */
    8478                 :           0 :     return WALAVAIL_REMOVED;
    8479                 :             : }
    8480                 :             : 
    8481                 :             : 
    8482                 :             : /*
    8483                 :             :  * Retreat *logSegNo to the last segment that we need to retain because of
    8484                 :             :  * either wal_keep_size or replication slots.
    8485                 :             :  *
    8486                 :             :  * This is calculated by subtracting wal_keep_size from the given xlog
    8487                 :             :  * location, recptr and by making sure that that result is below the
    8488                 :             :  * requirement of replication slots.  For the latter criterion we do consider
    8489                 :             :  * the effects of max_slot_wal_keep_size: reserve at most that much space back
    8490                 :             :  * from recptr.
    8491                 :             :  *
    8492                 :             :  * Note about replication slots: if this function calculates a value
    8493                 :             :  * that's further ahead than what slots need reserved, then affected
    8494                 :             :  * slots need to be invalidated and this function invoked again.
    8495                 :             :  * XXX it might be a good idea to rewrite this function so that
    8496                 :             :  * invalidation is optionally done here, instead.
    8497                 :             :  */
    8498                 :             : static void
    8499                 :        2555 : KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
    8500                 :             : {
    8501                 :             :     XLogSegNo   currSegNo;
    8502                 :             :     XLogSegNo   segno;
    8503                 :             :     XLogRecPtr  keep;
    8504                 :             : 
    8505                 :        2555 :     XLByteToSeg(recptr, currSegNo, wal_segment_size);
    8506                 :        2555 :     segno = currSegNo;
    8507                 :             : 
    8508                 :             :     /* Calculate how many segments are kept by slots. */
    8509                 :        2555 :     keep = XLogGetReplicationSlotMinimumLSN();
    8510   [ +  +  +  + ]:        2555 :     if (XLogRecPtrIsValid(keep) && keep < recptr)
    8511                 :             :     {
    8512                 :         768 :         XLByteToSeg(keep, segno, wal_segment_size);
    8513                 :             : 
    8514                 :             :         /*
    8515                 :             :          * Account for max_slot_wal_keep_size to avoid keeping more than
    8516                 :             :          * configured.  However, don't do that during a binary upgrade: if
    8517                 :             :          * slots were to be invalidated because of this, it would not be
    8518                 :             :          * possible to preserve logical ones during the upgrade.
    8519                 :             :          */
    8520   [ +  +  +  - ]:         768 :         if (max_slot_wal_keep_size_mb >= 0 && !IsBinaryUpgrade)
    8521                 :             :         {
    8522                 :             :             uint64      slot_keep_segs;
    8523                 :             : 
    8524                 :          26 :             slot_keep_segs =
    8525                 :          26 :                 ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size);
    8526                 :             : 
    8527         [ +  + ]:          26 :             if (currSegNo - segno > slot_keep_segs)
    8528                 :           7 :                 segno = currSegNo - slot_keep_segs;
    8529                 :             :         }
    8530                 :             :     }
    8531                 :             : 
    8532                 :             :     /*
    8533                 :             :      * If WAL summarization is in use, don't remove WAL that has yet to be
    8534                 :             :      * summarized.
    8535                 :             :      */
    8536                 :        2555 :     keep = GetOldestUnsummarizedLSN(NULL, NULL);
    8537         [ +  + ]:        2555 :     if (XLogRecPtrIsValid(keep))
    8538                 :             :     {
    8539                 :             :         XLogSegNo   unsummarized_segno;
    8540                 :             : 
    8541                 :           7 :         XLByteToSeg(keep, unsummarized_segno, wal_segment_size);
    8542         [ +  - ]:           7 :         if (unsummarized_segno < segno)
    8543                 :           7 :             segno = unsummarized_segno;
    8544                 :             :     }
    8545                 :             : 
    8546                 :             :     /* but, keep at least wal_keep_size if that's set */
    8547         [ +  + ]:        2555 :     if (wal_keep_size_mb > 0)
    8548                 :             :     {
    8549                 :             :         uint64      keep_segs;
    8550                 :             : 
    8551                 :          74 :         keep_segs = ConvertToXSegs(wal_keep_size_mb, wal_segment_size);
    8552         [ +  - ]:          74 :         if (currSegNo - segno < keep_segs)
    8553                 :             :         {
    8554                 :             :             /* avoid underflow, don't go below 1 */
    8555         [ +  + ]:          74 :             if (currSegNo <= keep_segs)
    8556                 :          70 :                 segno = 1;
    8557                 :             :             else
    8558                 :           4 :                 segno = currSegNo - keep_segs;
    8559                 :             :         }
    8560                 :             :     }
    8561                 :             : 
    8562                 :             :     /* don't delete WAL segments newer than the calculated segment */
    8563         [ +  + ]:        2555 :     if (segno < *logSegNo)
    8564                 :         362 :         *logSegNo = segno;
    8565                 :        2555 : }
    8566                 :             : 
    8567                 :             : /*
    8568                 :             :  * Write a NEXTOID log record
    8569                 :             :  */
    8570                 :             : void
    8571                 :         692 : XLogPutNextOid(Oid nextOid)
    8572                 :             : {
    8573                 :         692 :     XLogBeginInsert();
    8574                 :         692 :     XLogRegisterData(&nextOid, sizeof(Oid));
    8575                 :         692 :     (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
    8576                 :             : 
    8577                 :             :     /*
    8578                 :             :      * We need not flush the NEXTOID record immediately, because any of the
    8579                 :             :      * just-allocated OIDs could only reach disk as part of a tuple insert or
    8580                 :             :      * update that would have its own XLOG record that must follow the NEXTOID
    8581                 :             :      * record.  Therefore, the standard buffer LSN interlock applied to those
    8582                 :             :      * records will ensure no such OID reaches disk before the NEXTOID record
    8583                 :             :      * does.
    8584                 :             :      *
    8585                 :             :      * Note, however, that the above statement only covers state "within" the
    8586                 :             :      * database.  When we use a generated OID as a file or directory name, we
    8587                 :             :      * are in a sense violating the basic WAL rule, because that filesystem
    8588                 :             :      * change may reach disk before the NEXTOID WAL record does.  The impact
    8589                 :             :      * of this is that if a database crash occurs immediately afterward, we
    8590                 :             :      * might after restart re-generate the same OID and find that it conflicts
    8591                 :             :      * with the leftover file or directory.  But since for safety's sake we
    8592                 :             :      * always loop until finding a nonconflicting filename, this poses no real
    8593                 :             :      * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
    8594                 :             :      */
    8595                 :         692 : }
    8596                 :             : 
    8597                 :             : /*
    8598                 :             :  * Write an XLOG SWITCH record.
    8599                 :             :  *
    8600                 :             :  * Here we just blindly issue an XLogInsert request for the record.
    8601                 :             :  * All the magic happens inside XLogInsert.
    8602                 :             :  *
    8603                 :             :  * The return value is either the end+1 address of the switch record,
    8604                 :             :  * or the end+1 address of the prior segment if we did not need to
    8605                 :             :  * write a switch record because we are already at segment start.
    8606                 :             :  */
    8607                 :             : XLogRecPtr
    8608                 :         818 : RequestXLogSwitch(bool mark_unimportant)
    8609                 :             : {
    8610                 :             :     XLogRecPtr  RecPtr;
    8611                 :             : 
    8612                 :             :     /* XLOG SWITCH has no data */
    8613                 :         818 :     XLogBeginInsert();
    8614                 :             : 
    8615         [ -  + ]:         818 :     if (mark_unimportant)
    8616                 :           0 :         XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
    8617                 :         818 :     RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
    8618                 :             : 
    8619                 :         818 :     return RecPtr;
    8620                 :             : }
    8621                 :             : 
    8622                 :             : /*
    8623                 :             :  * Write a RESTORE POINT record
    8624                 :             :  */
    8625                 :             : XLogRecPtr
    8626                 :           3 : XLogRestorePoint(const char *rpName)
    8627                 :             : {
    8628                 :             :     XLogRecPtr  RecPtr;
    8629                 :             :     xl_restore_point xlrec;
    8630                 :             : 
    8631                 :           3 :     xlrec.rp_time = GetCurrentTimestamp();
    8632                 :           3 :     strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
    8633                 :             : 
    8634                 :           3 :     XLogBeginInsert();
    8635                 :           3 :     XLogRegisterData(&xlrec, sizeof(xl_restore_point));
    8636                 :             : 
    8637                 :           3 :     RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
    8638                 :             : 
    8639         [ +  - ]:           3 :     ereport(LOG,
    8640                 :             :             errmsg("restore point \"%s\" created at %X/%08X",
    8641                 :             :                    rpName, LSN_FORMAT_ARGS(RecPtr)));
    8642                 :             : 
    8643                 :           3 :     return RecPtr;
    8644                 :             : }
    8645                 :             : 
    8646                 :             : /*
    8647                 :             :  * Write a dummy XLOG_ASSIGN_LSN record to assign a distinct LSN, and return it.
    8648                 :             :  *
    8649                 :             :  * Used by some index AMs when building indexes on permanent relations with
    8650                 :             :  * wal_level=minimal.  There, WAL-logging only starts after commit, yet the AM
    8651                 :             :  * needs distinct LSNs to detect concurrent page modifications.  A dummy record
    8652                 :             :  * is emitted when the WAL insert position hasn't advanced since the last call;
    8653                 :             :  * that check is not made here -- this function always inserts a record.
    8654                 :             :  */
    8655                 :             : XLogRecPtr
    8656                 :         442 : XLogAssignLSN(void)
    8657                 :             : {
    8658                 :         442 :     int         dummy = 0;  /* placeholder payload; xlog_redo() does nothing for XLOG_ASSIGN_LSN */
    8659                 :             : 
    8660                 :             :     /*
    8661                 :             :      * Records other than XLOG_SWITCH must have content.  We register the
    8662                 :             :      * integer 0 above as the record data to satisfy this restriction.
    8663                 :             :      */
    8664                 :         442 :     XLogBeginInsert();
    8665                 :         442 :     XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
    8666                 :         442 :     XLogRegisterData(&dummy, sizeof(dummy));
    8667                 :         442 :     return XLogInsert(RM_XLOG_ID, XLOG_ASSIGN_LSN);  /* value from XLogInsert() is the LSN given to the caller */
    8668                 :             : }
    8669                 :             : 
    8670                 :             : /*
    8671                 :             :  * If any GUC parameter that is critical for hot standby differs from its copy
    8672                 :             :  * in ControlFile, WAL-log the new values if needed, then update pg_control.
    8673                 :             :  */
    8674                 :             : static void
    8675                 :        1018 : XLogReportParameters(void)
    8676                 :             : {
    8677         [ +  + ]:        1018 :     if (wal_level != ControlFile->wal_level ||
    8678         [ +  + ]:         750 :         wal_log_hints != ControlFile->wal_log_hints ||
    8679         [ +  + ]:         656 :         MaxConnections != ControlFile->MaxConnections ||
    8680         [ +  + ]:         655 :         max_worker_processes != ControlFile->max_worker_processes ||
    8681         [ +  + ]:         652 :         max_wal_senders != ControlFile->max_wal_senders ||
    8682         [ +  + ]:         624 :         max_prepared_xacts != ControlFile->max_prepared_xacts ||
    8683         [ +  - ]:         520 :         max_locks_per_xact != ControlFile->max_locks_per_xact ||
    8684         [ +  + ]:         520 :         track_commit_timestamp != ControlFile->track_commit_timestamp)
    8685                 :             :     {
    8686                 :             :         /*
    8687                 :             :          * The change in number of backend slots doesn't need to be WAL-logged
    8688                 :             :          * if archiving is not enabled, as you can't start archive recovery
    8689                 :             :          * with wal_level=minimal anyway. We don't really care about the
    8690                 :             :          * values in pg_control either if wal_level=minimal, but seems better
    8691                 :             :          * to keep them up-to-date to avoid confusion.
    8692                 :             :          */
    8693   [ +  +  +  + ]:         510 :         if (wal_level != ControlFile->wal_level || XLogIsNeeded())
    8694                 :             :         {
    8695                 :             :             xl_parameter_change xlrec;
    8696                 :             :             XLogRecPtr  recptr;
    8697                 :             : 
    8698                 :         483 :             xlrec.MaxConnections = MaxConnections;
    8699                 :         483 :             xlrec.max_worker_processes = max_worker_processes;
    8700                 :         483 :             xlrec.max_wal_senders = max_wal_senders;
    8701                 :         483 :             xlrec.max_prepared_xacts = max_prepared_xacts;
    8702                 :         483 :             xlrec.max_locks_per_xact = max_locks_per_xact;
    8703                 :         483 :             xlrec.wal_level = wal_level;
    8704                 :         483 :             xlrec.wal_log_hints = wal_log_hints;
    8705                 :         483 :             xlrec.track_commit_timestamp = track_commit_timestamp;
    8706                 :             : 
    8707                 :         483 :             XLogBeginInsert();
    8708                 :         483 :             XLogRegisterData(&xlrec, sizeof(xlrec));
    8709                 :             : 
    8710                 :         483 :             recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
    8711                 :         483 :             XLogFlush(recptr);  /* the record is flushed before pg_control is updated below */
    8712                 :             :         }
    8713                 :             : 
    8714                 :         510 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);  /* held while the ControlFile fields are changed and written out */
    8715                 :             : 
    8716                 :         510 :         ControlFile->MaxConnections = MaxConnections;
    8717                 :         510 :         ControlFile->max_worker_processes = max_worker_processes;
    8718                 :         510 :         ControlFile->max_wal_senders = max_wal_senders;
    8719                 :         510 :         ControlFile->max_prepared_xacts = max_prepared_xacts;
    8720                 :         510 :         ControlFile->max_locks_per_xact = max_locks_per_xact;
    8721                 :         510 :         ControlFile->wal_level = wal_level;
    8722                 :         510 :         ControlFile->wal_log_hints = wal_log_hints;
    8723                 :         510 :         ControlFile->track_commit_timestamp = track_commit_timestamp;
    8724                 :         510 :         UpdateControlFile();
    8725                 :             : 
    8726                 :         510 :         LWLockRelease(ControlFileLock);
    8727                 :             :     }
    8728                 :        1018 : }
    8729                 :             : 
    8730                 :             : /*
    8731                 :             :  * Log the new state of checksums (new_type) in an XLOG2_CHECKSUMS record and XLogFlush() it.
    8732                 :             :  */
    8733                 :             : static void
    8734                 :          31 : XLogChecksums(uint32 new_type)
    8735                 :             : {
    8736                 :             :     xl_checksum_state xlrec;
    8737                 :             :     XLogRecPtr  recptr;
    8738                 :             : 
    8739                 :          31 :     xlrec.new_checksum_state = new_type;  /* the only field of the record that is set here */
    8740                 :             : 
    8741                 :          31 :     XLogBeginInsert();
    8742                 :          31 :     XLogRegisterData((char *) &xlrec, sizeof(xl_checksum_state));  /* NOTE(review): the (char *) cast is absent from the other XLogRegisterData() calls nearby */
    8743                 :             : 
    8744                 :          31 :     recptr = XLogInsert(RM_XLOG2_ID, XLOG2_CHECKSUMS);  /* RM_XLOG2_ID here, unlike the RM_XLOG_ID records inserted by nearby functions */
    8745                 :          31 :     XLogFlush(recptr);
    8746                 :          31 : }
    8747                 :             : 
    8748                 :             : /*
    8749                 :             :  * Sync the shared Insert->fullPageWrites flag with the fullPageWrites
    8750                 :             :  * variable, and write an XLOG_FPW_CHANGE record if necessary.
    8751                 :             :  *
    8752                 :             :  * Note: this function assumes there is no other process running
    8753                 :             :  * concurrently that could update the shared flag.
    8754                 :             :  */
    8755                 :             : void
    8756                 :        1739 : UpdateFullPageWrites(void)
    8757                 :             : {
    8758                 :        1739 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    8759                 :             :     bool        recoveryInProgress;
    8760                 :             : 
    8761                 :             :     /*
    8762                 :             :      * Do nothing if full_page_writes has not been changed.
    8763                 :             :      *
    8764                 :             :      * It's safe to check the shared full_page_writes without the lock,
    8765                 :             :      * because we assume that there is no concurrently running process which
    8766                 :             :      * can update it.
    8767                 :             :      */
    8768         [ +  + ]:        1739 :     if (fullPageWrites == Insert->fullPageWrites)
    8769                 :        1236 :         return;
    8770                 :             : 
    8771                 :             :     /*
    8772                 :             :      * Perform this outside critical section so that the WAL insert
    8773                 :             :      * initialization done by RecoveryInProgress() doesn't trigger an
    8774                 :             :      * assertion failure.
    8775                 :             :      */
    8776                 :         503 :     recoveryInProgress = RecoveryInProgress();
    8777                 :             : 
    8778                 :         503 :     START_CRIT_SECTION();
    8779                 :             : 
    8780                 :             :     /*
    8781                 :             :      * It's always safe to take full page images, even when not strictly
    8782                 :             :      * required, but not the other way round. So if we're setting full_page_writes
    8783                 :             :      * to true, first set it true and then write the WAL record. If we're
    8784                 :             :      * setting it to false, first write the WAL record and then set the global
    8785                 :             :      * flag.
    8786                 :             :      */
    8787         [ +  + ]:         503 :     if (fullPageWrites)
    8788                 :             :     {
    8789                 :         490 :         WALInsertLockAcquireExclusive();
    8790                 :         490 :         Insert->fullPageWrites = true;  /* turned on before the record below is written */
    8791                 :         490 :         WALInsertLockRelease();
    8792                 :             :     }
    8793                 :             : 
    8794                 :             :     /*
    8795                 :             :      * Write an XLOG_FPW_CHANGE record (only if XLogStandbyInfoActive() and not
    8796                 :             :      * in recovery), so full_page_writes can be tracked during archive recovery.
    8797                 :             :      */
    8798   [ +  +  -  + ]:         503 :     if (XLogStandbyInfoActive() && !recoveryInProgress)
    8799                 :             :     {
    8800                 :           0 :         XLogBeginInsert();
    8801                 :           0 :         XLogRegisterData(&fullPageWrites, sizeof(bool));
    8802                 :             : 
    8803                 :           0 :         XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
    8804                 :             :     }
    8805                 :             : 
    8806         [ +  + ]:         503 :     if (!fullPageWrites)
    8807                 :             :     {
    8808                 :          13 :         WALInsertLockAcquireExclusive();
    8809                 :          13 :         Insert->fullPageWrites = false;  /* turned off only after the record above */
    8810                 :          13 :         WALInsertLockRelease();
    8811                 :             :     }
    8812                 :         503 :     END_CRIT_SECTION();
    8813                 :             : }
    8814                 :             : 
    8815                 :             : /*
    8816                 :             :  * XLOG resource manager's routines
    8817                 :             :  *
    8818                 :             :  * Definitions of info values are in include/catalog/pg_control.h, though
    8819                 :             :  * not all record types are related to control file updates.
    8820                 :             :  *
    8821                 :             :  * NOTE: Some XLOG record types that are directly related to WAL recovery
    8822                 :             :  * are handled in xlogrecovery_redo().
    8823                 :             :  */
    8824                 :             : void
    8825                 :      117443 : xlog_redo(XLogReaderState *record)
    8826                 :             : {
    8827                 :      117443 :     uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    8828                 :      117443 :     XLogRecPtr  lsn = record->EndRecPtr;
    8829                 :             : 
    8830                 :             :     /*
    8831                 :             :      * In XLOG rmgr, backup blocks are only used by XLOG_FPI and
    8832                 :             :      * XLOG_FPI_FOR_HINT records.
    8833                 :             :      */
    8834                 :             :     Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
    8835                 :             :            !XLogRecHasAnyBlockRefs(record));
    8836                 :             : 
    8837         [ +  + ]:      117443 :     if (info == XLOG_NEXTOID)
    8838                 :             :     {
    8839                 :             :         Oid         nextOid;
    8840                 :             : 
    8841                 :             :         /*
    8842                 :             :          * We used to try to take the maximum of TransamVariables->nextOid and
    8843                 :             :          * the recorded nextOid, but that fails if the OID counter wraps
    8844                 :             :          * around.  Since no OID allocation should be happening during replay
    8845                 :             :          * anyway, better to just believe the record exactly.  We still take
    8846                 :             :          * OidGenLock while setting the variable, just in case.
    8847                 :             :          */
    8848                 :         100 :         memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
    8849                 :         100 :         LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
    8850                 :         100 :         TransamVariables->nextOid = nextOid;
    8851                 :         100 :         TransamVariables->oidCount = 0;
    8852                 :         100 :         LWLockRelease(OidGenLock);
    8853                 :             :     }
    8854         [ +  + ]:      117343 :     else if (info == XLOG_CHECKPOINT_SHUTDOWN)
    8855                 :             :     {
    8856                 :             :         CheckPoint  checkPoint;
    8857                 :             :         TimeLineID  replayTLI;
    8858                 :             : 
    8859                 :          43 :         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
    8860                 :             :         /* In a SHUTDOWN checkpoint, believe the counters exactly */
    8861                 :          43 :         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
    8862                 :          43 :         TransamVariables->nextXid = checkPoint.nextXid;
    8863                 :          43 :         LWLockRelease(XidGenLock);
    8864                 :          43 :         LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
    8865                 :          43 :         TransamVariables->nextOid = checkPoint.nextOid;
    8866                 :          43 :         TransamVariables->oidCount = 0;
    8867                 :          43 :         LWLockRelease(OidGenLock);
    8868                 :          43 :         MultiXactSetNextMXact(checkPoint.nextMulti,
    8869                 :             :                               checkPoint.nextMultiOffset);
    8870                 :             : 
    8871                 :          43 :         MultiXactAdvanceOldest(checkPoint.oldestMulti,
    8872                 :             :                                checkPoint.oldestMultiDB);
    8873                 :             : 
    8874                 :             :         /*
    8875                 :             :          * No need to set oldestClogXid here as well; it'll be set when we
    8876                 :             :          * redo an xl_clog_truncate if it changed since initialization.
    8877                 :             :          */
    8878                 :          43 :         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
    8879                 :             : 
    8880                 :             :         /*
    8881                 :             :          * If we see a shutdown checkpoint while waiting for an end-of-backup
    8882                 :             :          * record, the backup was canceled and the end-of-backup record will
    8883                 :             :          * never arrive.
    8884                 :             :          */
    8885         [ +  + ]:          43 :         if (ArchiveRecoveryRequested &&
    8886         [ -  + ]:          42 :             XLogRecPtrIsValid(ControlFile->backupStartPoint) &&
    8887         [ #  # ]:           0 :             !XLogRecPtrIsValid(ControlFile->backupEndPoint))
    8888         [ #  # ]:           0 :             ereport(PANIC,
    8889                 :             :                     (errmsg("online backup was canceled, recovery cannot continue")));
    8890                 :             : 
    8891                 :             :         /*
    8892                 :             :          * If we see a shutdown checkpoint, we know that nothing was running
    8893                 :             :          * on the primary at this point. So fake-up an empty running-xacts
    8894                 :             :          * record and use that here and now. Recover additional standby state
    8895                 :             :          * for prepared transactions.
    8896                 :             :          */
    8897         [ +  + ]:          43 :         if (standbyState >= STANDBY_INITIALIZED)
    8898                 :             :         {
    8899                 :             :             TransactionId *xids;
    8900                 :             :             int         nxids;
    8901                 :             :             TransactionId oldestActiveXID;
    8902                 :             :             TransactionId latestCompletedXid;
    8903                 :             :             RunningTransactionsData running;
    8904                 :             : 
    8905                 :          40 :             oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
    8906                 :             : 
    8907                 :             :             /* Update pg_subtrans entries for any prepared transactions */
    8908                 :          40 :             StandbyRecoverPreparedTransactions();
    8909                 :             : 
    8910                 :             :             /*
    8911                 :             :              * Construct a RunningTransactions snapshot representing a shut
    8912                 :             :              * down server, with only prepared transactions still alive. We're
    8913                 :             :              * never overflowed at this point because all subxids are listed
    8914                 :             :              * with their parent prepared transactions.
    8915                 :             :              */
    8916                 :          40 :             running.xcnt = nxids;
    8917                 :          40 :             running.subxcnt = 0;
    8918                 :          40 :             running.subxid_status = SUBXIDS_IN_SUBTRANS;
    8919                 :          40 :             running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
    8920                 :          40 :             running.oldestRunningXid = oldestActiveXID;
    8921                 :          40 :             latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
    8922         [ -  + ]:          40 :             TransactionIdRetreat(latestCompletedXid);
    8923                 :             :             Assert(TransactionIdIsNormal(latestCompletedXid));
    8924                 :          40 :             running.latestCompletedXid = latestCompletedXid;
    8925                 :          40 :             running.xids = xids;
    8926                 :             : 
    8927                 :          40 :             ProcArrayApplyRecoveryInfo(&running);
    8928                 :             :         }
    8929                 :             : 
    8930                 :             :         /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
    8931                 :          43 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    8932                 :          43 :         ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
    8933                 :          43 :         ControlFile->data_checksum_version = checkPoint.dataChecksumState;
    8934                 :             : 
    8935                 :          43 :         UpdateControlFile();
    8936                 :          43 :         LWLockRelease(ControlFileLock);
    8937                 :             : 
    8938                 :             :         /*
    8939                 :             :          * We should've already switched to the new TLI before replaying this
    8940                 :             :          * record.
    8941                 :             :          */
    8942                 :          43 :         (void) GetCurrentReplayRecPtr(&replayTLI);
    8943         [ -  + ]:          43 :         if (checkPoint.ThisTimeLineID != replayTLI)
    8944         [ #  # ]:           0 :             ereport(PANIC,
    8945                 :             :                     (errmsg("unexpected timeline ID %u (should be %u) in shutdown checkpoint record",
    8946                 :             :                             checkPoint.ThisTimeLineID, replayTLI)));
    8947                 :             : 
    8948                 :          43 :         RecoveryRestartPoint(&checkPoint, record);
    8949                 :             : 
    8950                 :             :         /*
    8951                 :             :          * After replaying a checkpoint record, free all smgr objects.
    8952                 :             :          * Otherwise we would never do so for dropped relations, as the
    8953                 :             :          * startup does not process shared invalidation messages or call
    8954                 :             :          * AtEOXact_SMgr().
    8955                 :             :          */
    8956                 :          43 :         smgrdestroyall();
    8957                 :             :     }
    8958         [ +  + ]:      117300 :     else if (info == XLOG_CHECKPOINT_ONLINE)
    8959                 :             :     {
    8960                 :             :         CheckPoint  checkPoint;
    8961                 :             :         TimeLineID  replayTLI;
    8962                 :             : 
    8963                 :         712 :         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
    8964                 :             :         /* In an ONLINE checkpoint, treat the XID counter as a minimum */
    8965                 :         712 :         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
    8966         [ -  + ]:         712 :         if (FullTransactionIdPrecedes(TransamVariables->nextXid,
    8967                 :             :                                       checkPoint.nextXid))
    8968                 :           0 :             TransamVariables->nextXid = checkPoint.nextXid;
    8969                 :         712 :         LWLockRelease(XidGenLock);
    8970                 :             : 
    8971                 :             :         /*
    8972                 :             :          * We ignore the nextOid counter in an ONLINE checkpoint, preferring
    8973                 :             :          * to track OID assignment through XLOG_NEXTOID records.  The nextOid
    8974                 :             :          * counter is from the start of the checkpoint and might well be stale
    8975                 :             :          * compared to later XLOG_NEXTOID records.  We could try to take the
    8976                 :             :          * maximum of the nextOid counter and our latest value, but since
    8977                 :             :          * there's no particular guarantee about the speed with which the OID
    8978                 :             :          * counter wraps around, that's a risky thing to do.  In any case,
    8979                 :             :          * users of the nextOid counter are required to avoid assignment of
    8980                 :             :          * duplicates, so that a somewhat out-of-date value should be safe.
    8981                 :             :          */
    8982                 :             : 
    8983                 :             :         /* Handle multixact */
    8984                 :         712 :         MultiXactAdvanceNextMXact(checkPoint.nextMulti,
    8985                 :             :                                   checkPoint.nextMultiOffset);
    8986                 :             : 
    8987                 :             :         /*
    8988                 :             :          * NB: This may perform multixact truncation when replaying WAL
    8989                 :             :          * generated by an older primary.
    8990                 :             :          */
    8991                 :         712 :         MultiXactAdvanceOldest(checkPoint.oldestMulti,
    8992                 :             :                                checkPoint.oldestMultiDB);
    8993         [ -  + ]:         712 :         if (TransactionIdPrecedes(TransamVariables->oldestXid,
    8994                 :             :                                   checkPoint.oldestXid))
    8995                 :           0 :             SetTransactionIdLimit(checkPoint.oldestXid,
    8996                 :             :                                   checkPoint.oldestXidDB);
    8997                 :             :         /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
    8998                 :         712 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    8999                 :         712 :         ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
    9000                 :         712 :         LWLockRelease(ControlFileLock);
    9001                 :             : 
    9002                 :             :         /* TLI should not change in an on-line checkpoint */
    9003                 :         712 :         (void) GetCurrentReplayRecPtr(&replayTLI);
    9004         [ -  + ]:         712 :         if (checkPoint.ThisTimeLineID != replayTLI)
    9005         [ #  # ]:           0 :             ereport(PANIC,
    9006                 :             :                     (errmsg("unexpected timeline ID %u (should be %u) in online checkpoint record",
    9007                 :             :                             checkPoint.ThisTimeLineID, replayTLI)));
    9008                 :             : 
    9009                 :         712 :         RecoveryRestartPoint(&checkPoint, record);
    9010                 :             : 
    9011                 :             :         /*
    9012                 :             :          * After replaying a checkpoint record, free all smgr objects.
    9013                 :             :          * Otherwise we would never do so for dropped relations, as the
    9014                 :             :          * startup does not process shared invalidation messages or call
    9015                 :             :          * AtEOXact_SMgr().
    9016                 :             :          */
    9017                 :         712 :         smgrdestroyall();
    9018                 :             :     }
    9019         [ +  + ]:      116588 :     else if (info == XLOG_OVERWRITE_CONTRECORD)
    9020                 :             :     {
    9021                 :             :         /* nothing to do here, handled in xlogrecovery_redo() */
    9022                 :             :     }
    9023         [ +  + ]:      116587 :     else if (info == XLOG_END_OF_RECOVERY)
    9024                 :             :     {
    9025                 :             :         xl_end_of_recovery xlrec;
    9026                 :             :         TimeLineID  replayTLI;
    9027                 :             : 
    9028                 :          13 :         memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
    9029                 :             : 
    9030                 :             :         /*
    9031                 :             :          * For Hot Standby, we could treat this like a Shutdown Checkpoint,
    9032                 :             :          * but this case is rarer and harder to test, so the benefit doesn't
    9033                 :             :          * outweigh the potential extra cost of maintenance.
    9034                 :             :          */
    9035                 :             : 
    9036                 :             :         /*
    9037                 :             :          * We should've already switched to the new TLI before replaying this
    9038                 :             :          * record.
    9039                 :             :          */
    9040                 :          13 :         (void) GetCurrentReplayRecPtr(&replayTLI);
    9041         [ -  + ]:          13 :         if (xlrec.ThisTimeLineID != replayTLI)
    9042         [ #  # ]:           0 :             ereport(PANIC,
    9043                 :             :                     (errmsg("unexpected timeline ID %u (should be %u) in end-of-recovery record",
    9044                 :             :                             xlrec.ThisTimeLineID, replayTLI)));
    9045                 :             :     }
    9046         [ +  - ]:      116574 :     else if (info == XLOG_NOOP)
    9047                 :             :     {
    9048                 :             :         /* nothing to do here */
    9049                 :             :     }
    9050         [ +  + ]:      116574 :     else if (info == XLOG_SWITCH)
    9051                 :             :     {
    9052                 :             :         /* nothing to do here */
    9053                 :             :     }
    9054         [ +  + ]:      116107 :     else if (info == XLOG_RESTORE_POINT)
    9055                 :             :     {
    9056                 :             :         /* nothing to do here, handled in xlogrecovery.c */
    9057                 :             :     }
    9058         [ +  + ]:      116102 :     else if (info == XLOG_ASSIGN_LSN)
    9059                 :             :     {
    9060                 :             :         /* nothing to do here, see XLogGetFakeLSN() */
    9061                 :             :     }
    9062   [ +  +  +  + ]:       53775 :     else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
    9063                 :             :     {
    9064                 :             :         /*
    9065                 :             :          * XLOG_FPI records contain nothing else but one or more block
    9066                 :             :          * references. Every block reference must include a full-page image
    9067                 :             :          * even if full_page_writes was disabled when the record was generated
    9068                 :             :          * - otherwise there would be no point in this record.
    9069                 :             :          *
    9070                 :             :          * XLOG_FPI_FOR_HINT records are generated when a page needs to be
    9071                 :             :          * WAL-logged because of a hint bit update. They are only generated
    9072                 :             :          * when checksums and/or wal_log_hints are enabled. They may include
    9073                 :             :          * no full-page images if full_page_writes was disabled when they were
    9074                 :             :          * generated. In this case there is nothing to do here.
    9075                 :             :          *
    9076                 :             :          * No recovery conflicts are generated by these generic records - if a
    9077                 :             :          * resource manager needs to generate conflicts, it has to define a
    9078                 :             :          * separate WAL record type and redo routine.
    9079                 :             :          */
    9080         [ +  + ]:      110692 :         for (uint8 block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
    9081                 :             :         {
    9082                 :             :             Buffer      buffer;
    9083                 :             : 
    9084         [ +  + ]:       57790 :             if (!XLogRecHasBlockImage(record, block_id))
    9085                 :             :             {
    9086         [ -  + ]:          66 :                 if (info == XLOG_FPI)
    9087         [ #  # ]:           0 :                     elog(ERROR, "XLOG_FPI record did not contain a full-page image");
    9088                 :          66 :                 continue;
    9089                 :             :             }
    9090                 :             : 
    9091         [ -  + ]:       57724 :             if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
    9092         [ #  # ]:           0 :                 elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
    9093                 :       57724 :             UnlockReleaseBuffer(buffer);
    9094                 :             :         }
    9095                 :             :     }
    9096         [ +  + ]:         873 :     else if (info == XLOG_BACKUP_END)
    9097                 :             :     {
    9098                 :             :         /* nothing to do here, handled in xlogrecovery_redo() */
    9099                 :             :     }
    9100         [ +  + ]:         770 :     else if (info == XLOG_PARAMETER_CHANGE)
    9101                 :             :     {
    9102                 :             :         xl_parameter_change xlrec;
    9103                 :             : 
    9104                 :             :         /* Update our copy of the parameters in pg_control */
    9105                 :          38 :         memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
    9106                 :             : 
    9107                 :          38 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    9108                 :          38 :         ControlFile->MaxConnections = xlrec.MaxConnections;
    9109                 :          38 :         ControlFile->max_worker_processes = xlrec.max_worker_processes;
    9110                 :          38 :         ControlFile->max_wal_senders = xlrec.max_wal_senders;
    9111                 :          38 :         ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
    9112                 :          38 :         ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
    9113                 :          38 :         ControlFile->wal_level = xlrec.wal_level;
    9114                 :          38 :         ControlFile->wal_log_hints = xlrec.wal_log_hints;
    9115                 :             : 
    9116                 :             :         /*
    9117                 :             :          * Update minRecoveryPoint to ensure that if recovery is aborted, we
    9118                 :             :          * recover back up to this point before allowing hot standby again.
    9119                 :             :          * This is important if the max_* settings are decreased, to ensure
    9120                 :             :          * you don't run queries against the WAL preceding the change. The
    9121                 :             :          * local copies cannot be updated as long as crash recovery is
    9122                 :             :          * happening and we expect all the WAL to be replayed.
    9123                 :             :          */
    9124         [ +  + ]:          38 :         if (InArchiveRecovery)
    9125                 :             :         {
    9126                 :          23 :             LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    9127                 :          23 :             LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    9128                 :             :         }
    9129   [ +  +  +  + ]:          38 :         if (XLogRecPtrIsValid(LocalMinRecoveryPoint) && LocalMinRecoveryPoint < lsn)
    9130                 :             :         {
    9131                 :             :             TimeLineID  replayTLI;
    9132                 :             : 
    9133                 :          12 :             (void) GetCurrentReplayRecPtr(&replayTLI);
    9134                 :          12 :             ControlFile->minRecoveryPoint = lsn;
    9135                 :          12 :             ControlFile->minRecoveryPointTLI = replayTLI;
    9136                 :             :         }
    9137                 :             : 
    9138                 :          38 :         CommitTsParameterChange(xlrec.track_commit_timestamp,
    9139                 :          38 :                                 ControlFile->track_commit_timestamp);
    9140                 :          38 :         ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
    9141                 :             : 
    9142                 :          38 :         UpdateControlFile();
    9143                 :          38 :         LWLockRelease(ControlFileLock);
    9144                 :             : 
    9145                 :             :         /* Check to see if any parameter change gives a problem on recovery */
    9146                 :          38 :         CheckRequiredParameterValues();
    9147                 :             :     }
    9148         [ -  + ]:         732 :     else if (info == XLOG_FPW_CHANGE)
    9149                 :             :     {
    9150                 :             :         bool        fpw;
    9151                 :             : 
    9152                 :           0 :         memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
    9153                 :             : 
    9154                 :             :         /*
    9155                 :             :          * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
    9156                 :             :          * do_pg_backup_start() and do_pg_backup_stop() can check whether
    9157                 :             :          * full_page_writes has been disabled during online backup.
    9158                 :             :          */
    9159         [ #  # ]:           0 :         if (!fpw)
    9160                 :             :         {
    9161                 :           0 :             SpinLockAcquire(&XLogCtl->info_lck);
    9162         [ #  # ]:           0 :             if (XLogCtl->lastFpwDisableRecPtr < record->ReadRecPtr)
    9163                 :           0 :                 XLogCtl->lastFpwDisableRecPtr = record->ReadRecPtr;
    9164                 :           0 :             SpinLockRelease(&XLogCtl->info_lck);
    9165                 :             :         }
    9166                 :             : 
    9167                 :             :         /* Keep track of full_page_writes */
    9168                 :           0 :         lastFullPageWrites = fpw;
    9169                 :             :     }
    9170         [ +  + ]:         732 :     else if (info == XLOG_CHECKPOINT_REDO)
    9171                 :             :     {
    9172                 :             :         xl_checkpoint_redo redo_rec;
    9173                 :         714 :         bool        new_state = false;
    9174                 :             : 
    9175                 :         714 :         memcpy(&redo_rec, XLogRecGetData(record), sizeof(xl_checkpoint_redo));
    9176                 :             : 
    9177                 :         714 :         SpinLockAcquire(&XLogCtl->info_lck);
    9178                 :         714 :         XLogCtl->data_checksum_version = redo_rec.data_checksum_version;
    9179                 :         714 :         SetLocalDataChecksumState(redo_rec.data_checksum_version);
    9180         [ -  + ]:         714 :         if (redo_rec.data_checksum_version != ControlFile->data_checksum_version)
    9181                 :           0 :             new_state = true;
    9182                 :         714 :         SpinLockRelease(&XLogCtl->info_lck);
    9183                 :             : 
    9184         [ -  + ]:         714 :         if (new_state)
    9185                 :           0 :             EmitAndWaitDataChecksumsBarrier(redo_rec.data_checksum_version);
    9186                 :             :     }
    9187         [ +  - ]:          18 :     else if (info == XLOG_LOGICAL_DECODING_STATUS_CHANGE)
    9188                 :             :     {
    9189                 :             :         bool        status;
    9190                 :             : 
    9191                 :          18 :         memcpy(&status, XLogRecGetData(record), sizeof(bool));
    9192                 :             : 
    9193                 :             :         /*
    9194                 :             :          * We need to toggle the logical decoding status and update the
    9195                 :             :          * XLogLogicalInfo cache of processes synchronously because
    9196                 :             :          * XLogLogicalInfoActive() is used even during read-only queries
    9197                 :             :          * (e.g., via RelationIsAccessibleInLogicalDecoding()). In the
    9198                 :             :          * 'disable' case, it is safe to invalidate existing slots after
    9199                 :             :          * disabling logical decoding because logical decoding cannot process
    9200                 :             :          * subsequent WAL records, which may not contain logical information.
    9201                 :             :          */
    9202         [ +  + ]:          18 :         if (status)
    9203                 :           9 :             EnableLogicalDecoding();
    9204                 :             :         else
    9205                 :           9 :             DisableLogicalDecoding();
    9206                 :             : 
    9207         [ +  + ]:          18 :         elog(DEBUG1, "update logical decoding status to %d during recovery",
    9208                 :             :              status);
    9209                 :             : 
    9210   [ +  -  +  + ]:          18 :         if (InRecovery && InHotStandby)
    9211                 :             :         {
    9212         [ +  + ]:          16 :             if (!status)
    9213                 :             :             {
    9214                 :             :                 /*
    9215                 :             :                  * Invalidate logical slots if we are in hot standby and the
    9216                 :             :                  * primary disabled logical decoding.
    9217                 :             :                  */
    9218                 :           9 :                 InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_LEVEL,
    9219                 :             :                                                    0, InvalidOid,
    9220                 :             :                                                    InvalidTransactionId);
    9221                 :             :             }
    9222         [ -  + ]:           7 :             else if (sync_replication_slots)
    9223                 :             :             {
    9224                 :             :                 /*
    9225                 :             :                  * Signal the postmaster to launch the slotsync worker.
    9226                 :             :                  *
    9227                 :             :                  * XXX: For simplicity, we keep the slotsync worker running
    9228                 :             :                  * even after logical decoding is disabled. A future
    9229                 :             :                  * improvement can consider starting and stopping the worker
    9230                 :             :                  * based on logical decoding status change.
    9231                 :             :                  */
    9232                 :           0 :                 kill(PostmasterPid, SIGUSR1);
    9233                 :             :             }
    9234                 :             :         }
    9235                 :             :     }
    9236                 :      117441 : }
    9237                 :             : 
    9238                 :             : void
    9239                 :           7 : xlog2_redo(XLogReaderState *record)
    9240                 :             : {
    9241                 :           7 :     uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    9242                 :             : 
    9243         [ +  - ]:           7 :     if (info == XLOG2_CHECKSUMS)
    9244                 :             :     {
    9245                 :             :         xl_checksum_state state;
    9246                 :             : 
    9247                 :           7 :         memcpy(&state, XLogRecGetData(record), sizeof(xl_checksum_state));
    9248                 :             : 
    9249                 :           7 :         SpinLockAcquire(&XLogCtl->info_lck);
    9250                 :           7 :         XLogCtl->data_checksum_version = state.new_checksum_state;
    9251                 :           7 :         SpinLockRelease(&XLogCtl->info_lck);
    9252                 :             : 
    9253                 :           7 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    9254                 :           7 :         ControlFile->data_checksum_version = state.new_checksum_state;
    9255                 :           7 :         UpdateControlFile();
    9256                 :           7 :         LWLockRelease(ControlFileLock);
    9257                 :             : 
    9258                 :             :         /*
    9259                 :             :          * Block on a procsignalbarrier to await all processes having seen the
    9260                 :             :          * change to checksum status. Once the barrier has been passed we can
    9261                 :             :          * initiate the corresponding processing.
    9262                 :             :          */
    9263                 :           7 :         EmitAndWaitDataChecksumsBarrier(state.new_checksum_state);
    9264                 :             :     }
    9265                 :           7 : }
    9266                 :             : 
    9267                 :             : /*
    9268                 :             :  * Return the extra open flags used for opening a file, depending on the
    9269                 :             :  * value of the GUCs wal_sync_method, fsync and debug_io_direct.
    9270                 :             :  */
    9271                 :             : static int
    9272                 :       17895 : get_sync_bit(int method)
    9273                 :             : {
    9274                 :       17895 :     int         o_direct_flag = 0;
    9275                 :             : 
    9276                 :             :     /*
    9277                 :             :      * Use O_DIRECT if requested, except in walreceiver process.  The WAL
    9278                 :             :      * written by walreceiver is normally read by the startup process soon
    9279                 :             :      * after it's written.  Also, walreceiver performs unaligned writes, which
    9280                 :             :      * don't work with O_DIRECT, so it is required for correctness too.
    9281                 :             :      */
    9282   [ +  +  +  - ]:       17895 :     if ((io_direct_flags & IO_DIRECT_WAL) && !AmWalReceiverProcess())
    9283                 :           7 :         o_direct_flag = PG_O_DIRECT;
    9284                 :             : 
    9285                 :             :     /* If fsync is disabled, never open in sync mode */
    9286         [ +  - ]:       17895 :     if (!enableFsync)
    9287                 :       17895 :         return o_direct_flag;
    9288                 :             : 
    9289   [ #  #  #  # ]:           0 :     switch (method)
    9290                 :             :     {
    9291                 :             :             /*
    9292                 :             :              * enum values for all sync options are defined even if they are
    9293                 :             :              * not supported on the current platform.  But if not, they are
    9294                 :             :              * not included in the enum option array, and therefore will never
    9295                 :             :              * be seen here.
    9296                 :             :              */
    9297                 :           0 :         case WAL_SYNC_METHOD_FSYNC:
    9298                 :             :         case WAL_SYNC_METHOD_FSYNC_WRITETHROUGH:
    9299                 :             :         case WAL_SYNC_METHOD_FDATASYNC:
    9300                 :           0 :             return o_direct_flag;
    9301                 :             : #ifdef O_SYNC
    9302                 :           0 :         case WAL_SYNC_METHOD_OPEN:
    9303                 :           0 :             return O_SYNC | o_direct_flag;
    9304                 :             : #endif
    9305                 :             : #ifdef O_DSYNC
    9306                 :           0 :         case WAL_SYNC_METHOD_OPEN_DSYNC:
    9307                 :           0 :             return O_DSYNC | o_direct_flag;
    9308                 :             : #endif
    9309                 :           0 :         default:
    9310                 :             :             /* can't happen (unless we are out of sync with option array) */
    9311         [ #  # ]:           0 :             elog(ERROR, "unrecognized \"wal_sync_method\": %d", method);
    9312                 :             :             return 0;           /* silence warning */
    9313                 :             :     }
    9314                 :             : }
    9315                 :             : 
    9316                 :             : /*
    9317                 :             :  * GUC support
    9318                 :             :  */
    9319                 :             : void
    9320                 :        1291 : assign_wal_sync_method(int new_wal_sync_method, void *extra)
    9321                 :             : {
    9322         [ -  + ]:        1291 :     if (wal_sync_method != new_wal_sync_method)
    9323                 :             :     {
    9324                 :             :         /*
    9325                 :             :          * To ensure that no blocks escape unsynced, force an fsync on the
    9326                 :             :          * currently open log segment (if any).  Also, if the open flag is
    9327                 :             :          * changing, close the log file so it will be reopened (with new flag
    9328                 :             :          * bit) at next use.
    9329                 :             :          */
    9330         [ #  # ]:           0 :         if (openLogFile >= 0)
    9331                 :             :         {
    9332                 :           0 :             pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
    9333         [ #  # ]:           0 :             if (pg_fsync(openLogFile) != 0)
    9334                 :             :             {
    9335                 :             :                 char        xlogfname[MAXFNAMELEN];
    9336                 :             :                 int         save_errno;
    9337                 :             : 
    9338                 :           0 :                 save_errno = errno;
    9339                 :           0 :                 XLogFileName(xlogfname, openLogTLI, openLogSegNo,
    9340                 :             :                              wal_segment_size);
    9341                 :           0 :                 errno = save_errno;
    9342         [ #  # ]:           0 :                 ereport(PANIC,
    9343                 :             :                         (errcode_for_file_access(),
    9344                 :             :                          errmsg("could not fsync file \"%s\": %m", xlogfname)));
    9345                 :             :             }
    9346                 :             : 
    9347                 :           0 :             pgstat_report_wait_end();
    9348         [ #  # ]:           0 :             if (get_sync_bit(wal_sync_method) != get_sync_bit(new_wal_sync_method))
    9349                 :           0 :                 XLogFileClose();
    9350                 :             :         }
    9351                 :             :     }
    9352                 :        1291 : }
    9353                 :             : 
    9354                 :             : 
    9355                 :             : /*
    9356                 :             :  * Issue appropriate kind of fsync (if any) for an XLOG output file.
    9357                 :             :  *
    9358                 :             :  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
    9359                 :             :  * 'segno' and 'tli' are for error reporting purposes.
    9360                 :             :  */
    9361                 :             : void
    9362                 :      217376 : issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli)
    9363                 :             : {
    9364                 :      217376 :     char       *msg = NULL;
    9365                 :             :     instr_time  start;
    9366                 :             : 
    9367                 :             :     Assert(tli != 0);
    9368                 :             : 
    9369                 :             :     /*
    9370                 :             :      * Quick exit if fsync is disabled or write() has already synced the WAL
    9371                 :             :      * file.
    9372                 :             :      */
    9373         [ -  + ]:      217376 :     if (!enableFsync ||
    9374         [ #  # ]:           0 :         wal_sync_method == WAL_SYNC_METHOD_OPEN ||
    9375         [ #  # ]:           0 :         wal_sync_method == WAL_SYNC_METHOD_OPEN_DSYNC)
    9376                 :      217376 :         return;
    9377                 :             : 
    9378                 :             :     /*
    9379                 :             :      * Measure I/O timing to sync the WAL file for pg_stat_io.
    9380                 :             :      */
    9381                 :           0 :     start = pgstat_prepare_io_time(track_wal_io_timing);
    9382                 :             : 
    9383                 :           0 :     pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
    9384   [ #  #  #  # ]:           0 :     switch (wal_sync_method)
    9385                 :             :     {
    9386                 :           0 :         case WAL_SYNC_METHOD_FSYNC:
    9387         [ #  # ]:           0 :             if (pg_fsync_no_writethrough(fd) != 0)
    9388                 :           0 :                 msg = _("could not fsync file \"%s\": %m");
    9389                 :           0 :             break;
    9390                 :             : #ifdef HAVE_FSYNC_WRITETHROUGH
    9391                 :             :         case WAL_SYNC_METHOD_FSYNC_WRITETHROUGH:
    9392                 :             :             if (pg_fsync_writethrough(fd) != 0)
    9393                 :             :                 msg = _("could not fsync write-through file \"%s\": %m");
    9394                 :             :             break;
    9395                 :             : #endif
    9396                 :           0 :         case WAL_SYNC_METHOD_FDATASYNC:
    9397         [ #  # ]:           0 :             if (pg_fdatasync(fd) != 0)
    9398                 :           0 :                 msg = _("could not fdatasync file \"%s\": %m");
    9399                 :           0 :             break;
    9400                 :           0 :         case WAL_SYNC_METHOD_OPEN:
    9401                 :             :         case WAL_SYNC_METHOD_OPEN_DSYNC:
    9402                 :             :             /* not reachable */
    9403                 :             :             Assert(false);
    9404                 :           0 :             break;
    9405                 :           0 :         default:
    9406         [ #  # ]:           0 :             ereport(PANIC,
    9407                 :             :                     errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    9408                 :             :                     errmsg_internal("unrecognized \"wal_sync_method\": %d", wal_sync_method));
    9409                 :             :             break;
    9410                 :             :     }
    9411                 :             : 
    9412                 :             :     /* PANIC if failed to fsync */
    9413         [ #  # ]:           0 :     if (msg)
    9414                 :             :     {
    9415                 :             :         char        xlogfname[MAXFNAMELEN];
    9416                 :           0 :         int         save_errno = errno;
    9417                 :             : 
    9418                 :           0 :         XLogFileName(xlogfname, tli, segno, wal_segment_size);
    9419                 :           0 :         errno = save_errno;
    9420         [ #  # ]:           0 :         ereport(PANIC,
    9421                 :             :                 (errcode_for_file_access(),
    9422                 :             :                  errmsg(msg, xlogfname)));
    9423                 :             :     }
    9424                 :             : 
    9425                 :           0 :     pgstat_report_wait_end();
    9426                 :             : 
    9427                 :           0 :     pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_FSYNC,
    9428                 :             :                             start, 1, 0);
    9429                 :             : }
    9430                 :             : 
    9431                 :             : /*
    9432                 :             :  * do_pg_backup_start is the workhorse of the user-visible pg_backup_start()
    9433                 :             :  * function. It creates the necessary starting checkpoint and constructs the
    9434                 :             :  * backup state and tablespace map.
    9435                 :             :  *
    9436                 :             :  * Input parameters are "state" (the backup state), "fast" (if true, we do
    9437                 :             :  * the checkpoint in fast mode), and "tablespaces" (if non-NULL, indicates a
    9438                 :             :  * list of tablespaceinfo structs describing the cluster's tablespaces).
    9439                 :             :  *
    9440                 :             :  * The tablespace map contents are appended to the passed-in parameter
    9441                 :             :  * tblspcmapfile, and the caller is responsible for including it in the backup
    9442                 :             :  * archive as 'tablespace_map'. The tablespace_map file is required mainly for
    9443                 :             :  * tar format on Windows, as native Windows utilities are not able to create
    9444                 :             :  * symlinks while extracting files from tar. However, for consistency and
    9445                 :             :  * platform-independence, we do it the same way everywhere.
    9446                 :             :  *
    9447                 :             :  * It fills in "state" with the information required for the backup, such
    9448                 :             :  * as the minimum WAL location that must be present to restore from this
    9449                 :             :  * backup (startpoint) and the corresponding timeline ID (starttli).
    9450                 :             :  *
    9451                 :             :  * Every successfully started backup must be stopped by calling
    9452                 :             :  * do_pg_backup_stop() or do_pg_abort_backup(). There can be many
    9453                 :             :  * backups active at the same time.
    9454                 :             :  *
    9455                 :             :  * It is the responsibility of the caller of this function to verify the
    9456                 :             :  * permissions of the calling user!
    9457                 :             :  */
    9458                 :             : void
    9459                 :         178 : do_pg_backup_start(const char *backupidstr, bool fast, List **tablespaces,
    9460                 :             :                    BackupState *state, StringInfo tblspcmapfile)
    9461                 :             : {
    9462                 :             :     bool        backup_started_in_recovery;
    9463                 :             : 
    9464                 :             :     Assert(state != NULL);
    9465                 :         178 :     backup_started_in_recovery = RecoveryInProgress();
    9466                 :             : 
    9467                 :             :     /*
    9468                 :             :      * During recovery, we don't need to check the WAL level, because if it
    9469                 :             :      * is not sufficient, it's impossible to get here during recovery.
    9470                 :             :      */
    9471   [ +  +  -  + ]:         178 :     if (!backup_started_in_recovery && !XLogIsNeeded())
    9472         [ #  # ]:           0 :         ereport(ERROR,
    9473                 :             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    9474                 :             :                  errmsg("WAL level not sufficient for making an online backup"),
    9475                 :             :                  errhint("\"wal_level\" must be set to \"replica\" or \"logical\" at server start.")));
    9476                 :             : 
    9477         [ +  + ]:         178 :     if (strlen(backupidstr) > MAXPGPATH)
    9478         [ +  - ]:           1 :         ereport(ERROR,
    9479                 :             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    9480                 :             :                  errmsg("backup label too long (max %d bytes)",
    9481                 :             :                         MAXPGPATH)));
    9482                 :             : 
    9483                 :         177 :     strlcpy(state->name, backupidstr, sizeof(state->name));
    9484                 :             : 
    9485                 :             :     /*
    9486                 :             :      * Mark backup active in shared memory.  We must do full-page WAL writes
    9487                 :             :      * during an on-line backup even if not doing so at other times, because
    9488                 :             :      * it's quite possible for the backup dump to obtain a "torn" (partially
    9489                 :             :      * written) copy of a database page if it reads the page concurrently with
    9490                 :             :      * our write to the same page.  This can be fixed as long as the first
    9491                 :             :      * write to the page in the WAL sequence is a full-page write. Hence, we
    9492                 :             :      * increment runningBackups then force a CHECKPOINT, to ensure there are
    9493                 :             :      * no dirty pages in shared memory that might get dumped while the backup
    9494                 :             :      * is in progress without having a corresponding WAL record.  (Once the
    9495                 :             :      * backup is complete, we need not force full-page writes anymore, since
    9496                 :             :      * we expect that any pages not modified during the backup interval must
    9497                 :             :      * have been correctly captured by the backup.)
    9498                 :             :      *
    9499                 :             :      * Note that forcing full-page writes has no effect during an online
    9500                 :             :      * backup from the standby.
    9501                 :             :      *
    9502                 :             :      * We must hold all the insertion locks to change the value of
    9503                 :             :      * runningBackups, to ensure adequate interlocking against
    9504                 :             :      * XLogInsertRecord().
    9505                 :             :      */
    9506                 :         177 :     WALInsertLockAcquireExclusive();
    9507                 :         177 :     XLogCtl->Insert.runningBackups++;
    9508                 :         177 :     WALInsertLockRelease();
    9509                 :             : 
    9510                 :             :     /*
    9511                 :             :      * Ensure we decrement runningBackups if we fail below. NB -- for this to
    9512                 :             :      * work correctly, it is critical that sessionBackupState is only updated
    9513                 :             :      * after this block is over.
    9514                 :             :      */
    9515         [ +  - ]:         177 :     PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(true));
    9516                 :             :     {
    9517                 :         177 :         bool        gotUniqueStartpoint = false;
    9518                 :             :         DIR        *tblspcdir;
    9519                 :             :         struct dirent *de;
    9520                 :             :         tablespaceinfo *ti;
    9521                 :             :         int         datadirpathlen;
    9522                 :             : 
    9523                 :             :         /*
    9524                 :             :          * Force an XLOG file switch before the checkpoint, to ensure that the
    9525                 :             :          * WAL segment the checkpoint is written to doesn't contain pages with
    9526                 :             :          * old timeline IDs.  That would otherwise happen if you called
    9527                 :             :          * pg_backup_start() right after restoring from a PITR archive: the
    9528                 :             :          * first WAL segment containing the startup checkpoint has pages in
    9529                 :             :          * the beginning with the old timeline ID.  That can cause trouble at
    9530                 :             :          * recovery: we won't have a history file covering the old timeline if
    9531                 :             :          * pg_wal directory was not included in the base backup and the WAL
    9532                 :             :          * archive was cleared too before starting the backup.
    9533                 :             :          *
    9534                 :             :          * During recovery, we skip forcing XLOG file switch, which means that
    9535                 :             :          * the backup taken during recovery is not available for the special
    9536                 :             :          * recovery case described above.
    9537                 :             :          */
    9538         [ +  + ]:         177 :         if (!backup_started_in_recovery)
    9539                 :         168 :             RequestXLogSwitch(false);
    9540                 :             : 
    9541                 :             :         do
    9542                 :             :         {
    9543                 :             :             bool        checkpointfpw;
    9544                 :             : 
    9545                 :             :             /*
    9546                 :             :              * Force a CHECKPOINT.  Aside from being necessary to prevent torn
    9547                 :             :              * page problems, this guarantees that two successive backup runs
    9548                 :             :              * will have different checkpoint positions and hence different
    9549                 :             :              * history file names, even if nothing happened in between.
    9550                 :             :              *
    9551                 :             :              * During recovery, establish a restartpoint if possible. We use
    9552                 :             :              * the last restartpoint as the backup starting checkpoint. This
    9553                 :             :              * means that two successive backup runs can have same checkpoint
    9554                 :             :              * positions.
    9555                 :             :              *
    9556                 :             :              * Since the fact that we are executing do_pg_backup_start()
    9557                 :             :              * during recovery means that checkpointer is running, we can use
    9558                 :             :              * RequestCheckpoint() to establish a restartpoint.
    9559                 :             :              *
    9560                 :             :              * We use CHECKPOINT_FAST only if requested by user (via passing
    9561                 :             :              * fast = true).  Otherwise this can take awhile.
    9562                 :             :              */
    9563         [ +  + ]:         177 :             RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
    9564                 :             :                               (fast ? CHECKPOINT_FAST : 0));
    9565                 :             : 
    9566                 :             :             /*
    9567                 :             :              * Now we need to fetch the checkpoint record location, and also
    9568                 :             :              * its REDO pointer.  The oldest point in WAL that would be needed
    9569                 :             :              * to restore starting from the checkpoint is precisely the REDO
    9570                 :             :              * pointer.
    9571                 :             :              */
    9572                 :         177 :             LWLockAcquire(ControlFileLock, LW_SHARED);
    9573                 :         177 :             state->checkpointloc = ControlFile->checkPoint;
    9574                 :         177 :             state->startpoint = ControlFile->checkPointCopy.redo;
    9575                 :         177 :             state->starttli = ControlFile->checkPointCopy.ThisTimeLineID;
    9576                 :         177 :             checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
    9577                 :         177 :             LWLockRelease(ControlFileLock);
    9578                 :             : 
    9579         [ +  + ]:         177 :             if (backup_started_in_recovery)
    9580                 :             :             {
    9581                 :             :                 XLogRecPtr  recptr;
    9582                 :             : 
    9583                 :             :                 /*
    9584                 :             :                  * Check to see if all WAL replayed during online backup
    9585                 :             :                  * (i.e., since last restartpoint used as backup starting
    9586                 :             :                  * checkpoint) contain full-page writes.
    9587                 :             :                  */
    9588                 :           9 :                 SpinLockAcquire(&XLogCtl->info_lck);
    9589                 :           9 :                 recptr = XLogCtl->lastFpwDisableRecPtr;
    9590                 :           9 :                 SpinLockRelease(&XLogCtl->info_lck);
    9591                 :             : 
    9592   [ +  -  -  + ]:           9 :                 if (!checkpointfpw || state->startpoint <= recptr)
    9593         [ #  # ]:           0 :                     ereport(ERROR,
    9594                 :             :                             (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    9595                 :             :                              errmsg("WAL generated with \"full_page_writes=off\" was replayed "
    9596                 :             :                                     "since last restartpoint"),
    9597                 :             :                              errhint("This means that the backup being taken on the standby "
    9598                 :             :                                      "is corrupt and should not be used. "
    9599                 :             :                                      "Enable \"full_page_writes\" and run CHECKPOINT on the primary, "
    9600                 :             :                                      "and then try an online backup again.")));
    9601                 :             : 
    9602                 :             :                 /*
    9603                 :             :                  * During recovery, since we don't use the end-of-backup WAL
    9604                 :             :                  * record and don't write the backup history file, the
    9605                 :             :                  * starting WAL location doesn't need to be unique. This means
    9606                 :             :                  * that two base backups started at the same time might use
    9607                 :             :                  * the same checkpoint as starting locations.
    9608                 :             :                  */
    9609                 :           9 :                 gotUniqueStartpoint = true;
    9610                 :             :             }
    9611                 :             : 
    9612                 :             :             /*
    9613                 :             :              * If two base backups are started at the same time (in WAL sender
    9614                 :             :              * processes), we need to make sure that they use different
    9615                 :             :              * checkpoints as starting locations, because we use the starting
    9616                 :             :              * WAL location as a unique identifier for the base backup in the
    9617                 :             :              * end-of-backup WAL record and when we write the backup history
    9618                 :             :              * file. Perhaps it would be better to generate a separate unique
    9619                 :             :              * ID for each backup instead of forcing another checkpoint, but
    9620                 :             :              * taking a checkpoint right after another is not that expensive
    9621                 :             :              * either because only a few buffers have been dirtied yet.
    9622                 :             :              */
    9623                 :         177 :             WALInsertLockAcquireExclusive();
    9624         [ +  - ]:         177 :             if (XLogCtl->Insert.lastBackupStart < state->startpoint)
    9625                 :             :             {
    9626                 :         177 :                 XLogCtl->Insert.lastBackupStart = state->startpoint;
    9627                 :         177 :                 gotUniqueStartpoint = true;
    9628                 :             :             }
    9629                 :         177 :             WALInsertLockRelease();
    9630         [ -  + ]:         177 :         } while (!gotUniqueStartpoint);
    9631                 :             : 
    9632                 :             :         /*
    9633                 :             :          * Construct tablespace_map file.
    9634                 :             :          */
    9635                 :         177 :         datadirpathlen = strlen(DataDir);
    9636                 :             : 
    9637                 :             :         /* Collect information about all tablespaces */
    9638                 :         177 :         tblspcdir = AllocateDir(PG_TBLSPC_DIR);
    9639         [ +  + ]:         568 :         while ((de = ReadDir(tblspcdir, PG_TBLSPC_DIR)) != NULL)
    9640                 :             :         {
    9641                 :             :             char        fullpath[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
    9642                 :             :             char        linkpath[MAXPGPATH];
    9643                 :         391 :             char       *relpath = NULL;
    9644                 :             :             char       *s;
    9645                 :             :             PGFileType  de_type;
    9646                 :             :             char       *badp;
    9647                 :             :             Oid         tsoid;
    9648                 :             : 
    9649                 :             :             /*
    9650                 :             :              * Try to parse the directory name as an unsigned integer.
    9651                 :             :              *
    9652                 :             :              * Tablespace directories should be positive integers that can be
    9653                 :             :              * represented in 32 bits, with no leading zeroes or trailing
    9654                 :             :              * garbage. If we come across a name that doesn't meet those
    9655                 :             :              * criteria, skip it.
    9656                 :             :              */
    9657   [ +  +  -  + ]:         391 :             if (de->d_name[0] < '1' || de->d_name[1] > '9')
    9658                 :         354 :                 continue;
    9659                 :          37 :             errno = 0;
    9660                 :          37 :             tsoid = strtoul(de->d_name, &badp, 10);
    9661   [ +  -  +  -  :          37 :             if (*badp != '\0' || errno == EINVAL || errno == ERANGE)
                   -  + ]
    9662                 :           0 :                 continue;
    9663                 :             : 
    9664                 :          37 :             snprintf(fullpath, sizeof(fullpath), "%s/%s", PG_TBLSPC_DIR, de->d_name);
    9665                 :             : 
    9666                 :          37 :             de_type = get_dirent_type(fullpath, de, false, ERROR);
    9667                 :             : 
    9668         [ +  + ]:          37 :             if (de_type == PGFILETYPE_LNK)
    9669                 :             :             {
    9670                 :             :                 StringInfoData escapedpath;
    9671                 :             :                 int         rllen;
    9672                 :             : 
    9673                 :          23 :                 rllen = readlink(fullpath, linkpath, sizeof(linkpath));
    9674         [ -  + ]:          23 :                 if (rllen < 0)
    9675                 :             :                 {
    9676         [ #  # ]:           0 :                     ereport(WARNING,
    9677                 :             :                             (errmsg("could not read symbolic link \"%s\": %m",
    9678                 :             :                                     fullpath)));
    9679                 :           0 :                     continue;
    9680                 :             :                 }
    9681         [ -  + ]:          23 :                 else if (rllen >= sizeof(linkpath))
    9682                 :             :                 {
    9683         [ #  # ]:           0 :                     ereport(WARNING,
    9684                 :             :                             (errmsg("symbolic link \"%s\" target is too long",
    9685                 :             :                                     fullpath)));
    9686                 :           0 :                     continue;
    9687                 :             :                 }
    9688                 :          23 :                 linkpath[rllen] = '\0';
    9689                 :             : 
    9690                 :             :                 /*
    9691                 :             :                  * Relpath holds the relative path of the tablespace directory
    9692                 :             :                  * when it's located within PGDATA, or NULL if it's located
    9693                 :             :                  * elsewhere.
    9694                 :             :                  */
    9695         [ +  + ]:          23 :                 if (rllen > datadirpathlen &&
    9696         [ -  + ]:           1 :                     strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
    9697         [ #  # ]:           0 :                     IS_DIR_SEP(linkpath[datadirpathlen]))
    9698                 :           0 :                     relpath = pstrdup(linkpath + datadirpathlen + 1);
    9699                 :             : 
    9700                 :             :                 /*
    9701                 :             :                  * Add a backslash-escaped version of the link path to the
    9702                 :             :                  * tablespace map file.
    9703                 :             :                  */
    9704                 :          23 :                 initStringInfo(&escapedpath);
    9705         [ +  + ]:         562 :                 for (s = linkpath; *s; s++)
    9706                 :             :                 {
    9707   [ +  -  +  -  :         539 :                     if (*s == '\n' || *s == '\r' || *s == '\\')
                   -  + ]
    9708                 :           0 :                         appendStringInfoChar(&escapedpath, '\\');
    9709                 :         539 :                     appendStringInfoChar(&escapedpath, *s);
    9710                 :             :                 }
    9711                 :          23 :                 appendStringInfo(tblspcmapfile, "%s %s\n",
    9712                 :          23 :                                  de->d_name, escapedpath.data);
    9713                 :          23 :                 pfree(escapedpath.data);
    9714                 :             :             }
    9715         [ +  - ]:          14 :             else if (de_type == PGFILETYPE_DIR)
    9716                 :             :             {
    9717                 :             :                 /*
    9718                 :             :                  * It's possible to use allow_in_place_tablespaces to create
    9719                 :             :                  * directories directly under pg_tblspc, for testing purposes
    9720                 :             :                  * only.
    9721                 :             :                  *
    9722                 :             :                  * In this case, we store a relative path rather than an
    9723                 :             :                  * absolute path into the tablespaceinfo.
    9724                 :             :                  */
    9725                 :          14 :                 snprintf(linkpath, sizeof(linkpath), "%s/%s",
    9726                 :          14 :                          PG_TBLSPC_DIR, de->d_name);
    9727                 :          14 :                 relpath = pstrdup(linkpath);
    9728                 :             :             }
    9729                 :             :             else
    9730                 :             :             {
    9731                 :             :                 /* Skip any other file type that appears here. */
    9732                 :           0 :                 continue;
    9733                 :             :             }
    9734                 :             : 
    9735                 :          37 :             ti = palloc_object(tablespaceinfo);
    9736                 :          37 :             ti->oid = tsoid;
    9737                 :          37 :             ti->path = pstrdup(linkpath);
    9738                 :          37 :             ti->rpath = relpath;
    9739                 :          37 :             ti->size = -1;
    9740                 :             : 
    9741         [ +  - ]:          37 :             if (tablespaces)
    9742                 :          37 :                 *tablespaces = lappend(*tablespaces, ti);
    9743                 :             :         }
    9744                 :         177 :         FreeDir(tblspcdir);
    9745                 :             : 
    9746                 :         177 :         state->starttime = (pg_time_t) time(NULL);
    9747                 :             :     }
    9748         [ -  + ]:         177 :     PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(true));
    9749                 :             : 
    9750                 :         177 :     state->started_in_recovery = backup_started_in_recovery;
    9751                 :             : 
    9752                 :             :     /*
    9753                 :             :      * Mark that the start phase has correctly finished for the backup.
    9754                 :             :      */
    9755                 :         177 :     sessionBackupState = SESSION_BACKUP_RUNNING;
    9756                 :         177 : }
    9757                 :             : 
    9758                 :             : /*
    9759                 :             :  * Utility routine to fetch the session-level status of a running backup.
                         :             :  *
                         :             :  * Returns the current value of sessionBackupState.  In this file that
                         :             :  * variable is set to SESSION_BACKUP_RUNNING at the very end of
                         :             :  * do_pg_backup_start() and reset to SESSION_BACKUP_NONE by
                         :             :  * do_pg_backup_stop().  The value is returned as-is; nothing is modified.
    9760                 :             :  */
    9761                 :             : SessionBackupState
    9762                 :         198 : get_backup_status(void)
    9763                 :             : {
    9764                 :         198 :     return sessionBackupState;
    9765                 :             : }
    9766                 :             : 
    9767                 :             : /*
    9768                 :             :  * do_pg_backup_stop
    9769                 :             :  *
    9770                 :             :  * Utility function called at the end of an online backup.  It creates a
    9771                 :             :  * history file (if required), resets sessionBackupState and so on.  It can
    9772                 :             :  * optionally wait for WAL segments to be archived.
    9773                 :             :  *
    9774                 :             :  * "state" is filled with the information necessary to restore from this
    9775                 :             :  * backup with its stop LSN (stoppoint), its timeline ID (stoptli), etc.
    9776                 :             :  *
    9777                 :             :  * It is the responsibility of the caller of this function to verify the
    9778                 :             :  * permissions of the calling user!
    9779                 :             :  */
    9780                 :             : void
    9781                 :         171 : do_pg_backup_stop(BackupState *state, bool waitforarchive)
    9782                 :             : {
    9783                 :         171 :     bool        backup_stopped_in_recovery = false;
    9784                 :             :     char        histfilepath[MAXPGPATH];
    9785                 :             :     char        lastxlogfilename[MAXFNAMELEN];
    9786                 :             :     char        histfilename[MAXFNAMELEN];
    9787                 :             :     XLogSegNo   _logSegNo;
    9788                 :             :     FILE       *fp;
    9789                 :             :     int         seconds_before_warning;
    9790                 :         171 :     int         waits = 0;
    9791                 :         171 :     bool        reported_waiting = false;
    9792                 :             : 
    9793                 :             :     Assert(state != NULL);
    9794                 :             : 
    9795                 :         171 :     backup_stopped_in_recovery = RecoveryInProgress();
    9796                 :             : 
    9797                 :             :     /*
    9798                 :             :      * During recovery, we don't need to check WAL level. Because, if WAL
    9799                 :             :      * level is not sufficient, it's impossible to get here during recovery.
    9800                 :             :      */
    9801   [ +  +  -  + ]:         171 :     if (!backup_stopped_in_recovery && !XLogIsNeeded())
    9802         [ #  # ]:           0 :         ereport(ERROR,
    9803                 :             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    9804                 :             :                  errmsg("WAL level not sufficient for making an online backup"),
    9805                 :             :                  errhint("\"wal_level\" must be set to \"replica\" or \"logical\" at server start.")));
    9806                 :             : 
    9807                 :             :     /*
    9808                 :             :      * OK to update backup counter and session-level lock.
    9809                 :             :      *
    9810                 :             :      * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them,
    9811                 :             :      * otherwise they can be updated inconsistently, which might cause
    9812                 :             :      * do_pg_abort_backup() to fail.
    9813                 :             :      */
    9814                 :         171 :     WALInsertLockAcquireExclusive();
    9815                 :             : 
    9816                 :             :     /*
    9817                 :             :      * It is expected that each do_pg_backup_start() call is matched by
    9818                 :             :      * exactly one do_pg_backup_stop() call.
    9819                 :             :      */
    9820                 :             :     Assert(XLogCtl->Insert.runningBackups > 0);
    9821                 :         171 :     XLogCtl->Insert.runningBackups--;
    9822                 :             : 
    9823                 :             :     /*
    9824                 :             :      * Clean up session-level lock.
    9825                 :             :      *
    9826                 :             :      * You might think that WALInsertLockRelease() can be called before
    9827                 :             :      * cleaning up session-level lock because session-level lock doesn't need
    9828                 :             :      * to be protected with WAL insertion lock. But since
    9829                 :             :      * CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
    9830                 :             :      * cleaned up before it.
    9831                 :             :      */
    9832                 :         171 :     sessionBackupState = SESSION_BACKUP_NONE;
    9833                 :             : 
    9834                 :         171 :     WALInsertLockRelease();
    9835                 :             : 
    9836                 :             :     /*
    9837                 :             :      * If we are taking an online backup from the standby, we confirm that the
    9838                 :             :      * standby has not been promoted during the backup.
    9839                 :             :      */
    9840   [ +  +  -  + ]:         171 :     if (state->started_in_recovery && !backup_stopped_in_recovery)
    9841         [ #  # ]:           0 :         ereport(ERROR,
    9842                 :             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    9843                 :             :                  errmsg("the standby was promoted during online backup"),
    9844                 :             :                  errhint("This means that the backup being taken is corrupt "
    9845                 :             :                          "and should not be used. "
    9846                 :             :                          "Try taking another online backup.")));
    9847                 :             : 
    9848                 :             :     /*
    9849                 :             :      * During recovery, we don't write an end-of-backup record. We assume that
    9850                 :             :      * pg_control was backed up last and its minimum recovery point can be
    9851                 :             :      * available as the backup end location. Since we don't have an
    9852                 :             :      * end-of-backup record, we use the pg_control value to check whether
    9853                 :             :      * we've reached the end of backup when starting recovery from this
    9854                 :             :      * backup. We have no way of checking if pg_control wasn't backed up last
    9855                 :             :      * however.
    9856                 :             :      *
    9857                 :             :      * We don't force a switch to new WAL file but it is still possible to
    9858                 :             :      * wait for all the required files to be archived if waitforarchive is
    9859                 :             :      * true. This is okay if we use the backup to start a standby and fetch
    9860                 :             :      * the missing WAL using streaming replication. But in the case of an
    9861                 :             :      * archive recovery, a user should set waitforarchive to true and wait for
    9862                 :             :      * them to be archived to ensure that all the required files are
    9863                 :             :      * available.
    9864                 :             :      *
    9865                 :             :      * We return the current minimum recovery point as the backup end
    9866                 :             :      * location. Note that it can be greater than the exact backup end
    9867                 :             :      * location if the minimum recovery point is updated after the backup of
    9868                 :             :      * pg_control. This is harmless for current uses.
    9869                 :             :      *
    9870                 :             :      * XXX currently a backup history file is for informational and debug
    9871                 :             :      * purposes only. It's not essential for an online backup. Furthermore,
    9872                 :             :      * even if it's created, it will not be archived during recovery because
    9873                 :             :      * an archiver is not invoked. So it doesn't seem worthwhile to write a
    9874                 :             :      * backup history file during recovery.
    9875                 :             :      */
    9876         [ +  + ]:         171 :     if (backup_stopped_in_recovery)
    9877                 :             :     {
    9878                 :             :         XLogRecPtr  recptr;
    9879                 :             : 
    9880                 :             :         /*
    9881                 :             :          * Check to see if all WAL replayed during online backup contain
    9882                 :             :          * full-page writes.
    9883                 :             :          */
    9884                 :           9 :         SpinLockAcquire(&XLogCtl->info_lck);
    9885                 :           9 :         recptr = XLogCtl->lastFpwDisableRecPtr;
    9886                 :           9 :         SpinLockRelease(&XLogCtl->info_lck);
    9887                 :             : 
    9888         [ -  + ]:           9 :         if (state->startpoint <= recptr)
    9889         [ #  # ]:           0 :             ereport(ERROR,
    9890                 :             :                     (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    9891                 :             :                      errmsg("WAL generated with \"full_page_writes=off\" was replayed "
    9892                 :             :                             "during online backup"),
    9893                 :             :                      errhint("This means that the backup being taken on the standby "
    9894                 :             :                              "is corrupt and should not be used. "
    9895                 :             :                              "Enable \"full_page_writes\" and run CHECKPOINT on the primary, "
    9896                 :             :                              "and then try an online backup again.")));
    9897                 :             : 
    9898                 :             : 
    9899                 :           9 :         LWLockAcquire(ControlFileLock, LW_SHARED);
    9900                 :           9 :         state->stoppoint = ControlFile->minRecoveryPoint;
    9901                 :           9 :         state->stoptli = ControlFile->minRecoveryPointTLI;
    9902                 :           9 :         LWLockRelease(ControlFileLock);
    9903                 :             :     }
    9904                 :             :     else
    9905                 :             :     {
    9906                 :             :         char       *history_file;
    9907                 :             : 
    9908                 :             :         /*
    9909                 :             :          * Write the backup-end xlog record
    9910                 :             :          */
    9911                 :         162 :         XLogBeginInsert();
    9912                 :         162 :         XLogRegisterData(&state->startpoint,
    9913                 :             :                          sizeof(state->startpoint));
    9914                 :         162 :         state->stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
    9915                 :             : 
    9916                 :             :         /*
    9917                 :             :          * Given that we're not in recovery, InsertTimeLineID is set and can't
    9918                 :             :          * change, so we can read it without a lock.
    9919                 :             :          */
    9920                 :         162 :         state->stoptli = XLogCtl->InsertTimeLineID;
    9921                 :             : 
    9922                 :             :         /*
    9923                 :             :          * Force a switch to a new xlog segment file, so that the backup is
    9924                 :             :          * valid as soon as archiver moves out the current segment file.
    9925                 :             :          */
    9926                 :         162 :         RequestXLogSwitch(false);
    9927                 :             : 
    9928                 :         162 :         state->stoptime = (pg_time_t) time(NULL);
    9929                 :             : 
    9930                 :             :         /*
    9931                 :             :          * Write the backup history file
    9932                 :             :          */
    9933                 :         162 :         XLByteToSeg(state->startpoint, _logSegNo, wal_segment_size);
    9934                 :         162 :         BackupHistoryFilePath(histfilepath, state->stoptli, _logSegNo,
    9935                 :             :                               state->startpoint, wal_segment_size);
    9936                 :         162 :         fp = AllocateFile(histfilepath, "w");
    9937         [ -  + ]:         162 :         if (!fp)
    9938         [ #  # ]:           0 :             ereport(ERROR,
    9939                 :             :                     (errcode_for_file_access(),
    9940                 :             :                      errmsg("could not create file \"%s\": %m",
    9941                 :             :                             histfilepath)));
    9942                 :             : 
    9943                 :             :         /* Build and save the contents of the backup history file */
    9944                 :         162 :         history_file = build_backup_content(state, true);
    9945                 :         162 :         fprintf(fp, "%s", history_file);
    9946                 :         162 :         pfree(history_file);
    9947                 :             : 
    9948   [ +  -  +  -  :         162 :         if (fflush(fp) || ferror(fp) || FreeFile(fp))
                   -  + ]
    9949         [ #  # ]:           0 :             ereport(ERROR,
    9950                 :             :                     (errcode_for_file_access(),
    9951                 :             :                      errmsg("could not write file \"%s\": %m",
    9952                 :             :                             histfilepath)));
    9953                 :             : 
    9954                 :             :         /*
    9955                 :             :          * Clean out any no-longer-needed history files.  As a side effect,
    9956                 :             :          * this will post a .ready file for the newly created history file,
    9957                 :             :          * notifying the archiver that the history file may be archived
    9958                 :             :          * immediately.
    9959                 :             :          */
    9960                 :         162 :         CleanupBackupHistory();
    9961                 :             :     }
    9962                 :             : 
    9963                 :             :     /*
    9964                 :             :      * If archiving is enabled, wait for all the required WAL files to be
    9965                 :             :      * archived before returning. If archiving isn't enabled, the required WAL
    9966                 :             :      * needs to be transported via streaming replication (hopefully with
    9967                 :             :      * wal_keep_size set high enough), or some more exotic mechanism like
    9968                 :             :      * polling and copying files from pg_wal with a script. We have no knowledge
    9969                 :             :      * of those mechanisms, so it's up to the user to ensure that they get all
    9970                 :             :      * the required WAL.
    9971                 :             :      *
    9972                 :             :      * We wait until both the last WAL file filled during backup and the
    9973                 :             :      * history file have been archived, and assume that the alphabetic sorting
    9974                 :             :      * property of the WAL files ensures any earlier WAL files are safely
    9975                 :             :      * archived as well.
    9976                 :             :      *
    9977                 :             :      * We wait forever, since archive_command is supposed to work and we
    9978                 :             :      * assume the admin wanted the backup to work completely. If you don't
    9979                 :             :      * wish to wait, then either waitforarchive should be passed in as false,
    9980                 :             :      * or you can set statement_timeout.  Also, some notices are issued to
    9981                 :             :      * clue in anyone who might be doing this interactively.
    9982                 :             :      */
    9983                 :             : 
    9984         [ +  + ]:         171 :     if (waitforarchive &&
    9985   [ +  +  +  +  :          10 :         ((!backup_stopped_in_recovery && XLogArchivingActive()) ||
                   +  + ]
    9986         [ -  + ]:           1 :          (backup_stopped_in_recovery && XLogArchivingAlways())))
    9987                 :             :     {
    9988                 :           4 :         XLByteToPrevSeg(state->stoppoint, _logSegNo, wal_segment_size);
    9989                 :           4 :         XLogFileName(lastxlogfilename, state->stoptli, _logSegNo,
    9990                 :             :                      wal_segment_size);
    9991                 :             : 
    9992                 :           4 :         XLByteToSeg(state->startpoint, _logSegNo, wal_segment_size);
    9993                 :           4 :         BackupHistoryFileName(histfilename, state->stoptli, _logSegNo,
    9994                 :             :                               state->startpoint, wal_segment_size);
    9995                 :             : 
    9996                 :           4 :         seconds_before_warning = 60;
    9997                 :           4 :         waits = 0;
    9998                 :             : 
    9999   [ +  +  -  + ]:          12 :         while (XLogArchiveIsBusy(lastxlogfilename) ||
   10000                 :           4 :                XLogArchiveIsBusy(histfilename))
   10001                 :             :         {
   10002         [ -  + ]:           4 :             CHECK_FOR_INTERRUPTS();
   10003                 :             : 
   10004   [ +  -  -  + ]:           4 :             if (!reported_waiting && waits > 5)
   10005                 :             :             {
   10006         [ #  # ]:           0 :                 ereport(NOTICE,
   10007                 :             :                         (errmsg("base backup done, waiting for required WAL segments to be archived")));
   10008                 :           0 :                 reported_waiting = true;
   10009                 :             :             }
   10010                 :             : 
   10011                 :           4 :             (void) WaitLatch(MyLatch,
   10012                 :             :                              WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
   10013                 :             :                              1000L,
   10014                 :             :                              WAIT_EVENT_BACKUP_WAIT_WAL_ARCHIVE);
   10015                 :           4 :             ResetLatch(MyLatch);
   10016                 :             : 
   10017         [ -  + ]:           4 :             if (++waits >= seconds_before_warning)
   10018                 :             :             {
   10019                 :           0 :                 seconds_before_warning *= 2;    /* This wraps in >10 years... */
   10020         [ #  # ]:           0 :                 ereport(WARNING,
   10021                 :             :                         (errmsg("still waiting for all required WAL segments to be archived (%d seconds elapsed)",
   10022                 :             :                                 waits),
   10023                 :             :                          errhint("Check that your \"archive_command\" is executing properly.  "
   10024                 :             :                                  "You can safely cancel this backup, "
   10025                 :             :                                  "but the database backup will not be usable without all the WAL segments.")));
   10026                 :             :             }
   10027                 :             :         }
   10028                 :             : 
   10029         [ +  + ]:           4 :         ereport(NOTICE,
   10030                 :             :                 (errmsg("all required WAL segments have been archived")));
   10031                 :             :     }
   10032         [ +  + ]:         167 :     else if (waitforarchive)
   10033         [ +  - ]:           6 :         ereport(NOTICE,
   10034                 :             :                 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
   10035                 :         171 : }
   10036                 :             : 
   10037                 :             : 
   10038                 :             : /*
   10039                 :             :  * do_pg_abort_backup: abort a running backup
   10040                 :             :  *
   10041                 :             :  * This does just the most basic steps of do_pg_backup_stop(), by taking the
   10042                 :             :  * system out of backup mode, thus making it a lot more safe to call from
   10043                 :             :  * an error handler.
   10044                 :             :  *
   10045                 :             :  * A true 'arg' indicates that it's being called during backup setup; so
   10046                 :             :  * sessionBackupState has not been modified yet, but runningBackups has
   10047                 :             :  * already been incremented.  When it's false, then it's invoked as a
   10048                 :             :  * before_shmem_exit handler, and therefore we must not change state
   10049                 :             :  * unless sessionBackupState indicates that a backup is actually running.
   10050                 :             :  *
   10051                 :             :  * NB: This gets used as a PG_ENSURE_ERROR_CLEANUP callback and
   10052                 :             :  * before_shmem_exit handler, hence the odd-looking signature.
   10053                 :             :  */
   10054                 :             : void
   10055                 :           8 : do_pg_abort_backup(int code, Datum arg)
   10056                 :             : {
   10057                 :           8 :     bool        during_backup_start = DatumGetBool(arg);
   10058                 :             : 
   10059                 :             :     /* If called during backup start, there shouldn't be one already running */
   10060                 :             :     Assert(!during_backup_start || sessionBackupState == SESSION_BACKUP_NONE);
   10061                 :             : 
   10062   [ +  -  +  + ]:           8 :     if (during_backup_start || sessionBackupState != SESSION_BACKUP_NONE)
   10063                 :             :     {
   10064                 :           6 :         WALInsertLockAcquireExclusive();
   10065                 :             :         Assert(XLogCtl->Insert.runningBackups > 0);
   10066                 :           6 :         XLogCtl->Insert.runningBackups--;
   10067                 :             : 
   10068                 :           6 :         sessionBackupState = SESSION_BACKUP_NONE;
   10069                 :           6 :         WALInsertLockRelease();
   10070                 :             : 
   10071         [ +  - ]:           6 :         if (!during_backup_start)
   10072         [ +  - ]:           6 :             ereport(WARNING,
   10073                 :             :                     errmsg("aborting backup due to backend exiting before pg_backup_stop was called"));
   10074                 :             :     }
   10075                 :           8 : }
   10076                 :             : 
   10077                 :             : /*
   10078                 :             :  * Register a handler that will warn about unterminated backups at end of
   10079                 :             :  * session, unless this has already been done.
   10080                 :             :  */
   10081                 :             : void
   10082                 :           4 : register_persistent_abort_backup_handler(void)
   10083                 :             : {
   10084                 :             :     static bool already_done = false;
   10085                 :             : 
   10086         [ +  + ]:           4 :     if (already_done)
   10087                 :           1 :         return;
   10088                 :           3 :     before_shmem_exit(do_pg_abort_backup, BoolGetDatum(false));
   10089                 :           3 :     already_done = true;
   10090                 :             : }
   10091                 :             : 
   10092                 :             : /*
   10093                 :             :  * Get latest WAL insert pointer
   10094                 :             :  */
   10095                 :             : XLogRecPtr
   10096                 :        2137 : GetXLogInsertRecPtr(void)
   10097                 :             : {
   10098                 :        2137 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
   10099                 :             :     uint64      current_bytepos;
   10100                 :             : 
   10101                 :        2137 :     SpinLockAcquire(&Insert->insertpos_lck);
   10102                 :        2137 :     current_bytepos = Insert->CurrBytePos;
   10103                 :        2137 :     SpinLockRelease(&Insert->insertpos_lck);
   10104                 :             : 
   10105                 :        2137 :     return XLogBytePosToRecPtr(current_bytepos);
   10106                 :             : }
   10107                 :             : 
   10108                 :             : /*
   10109                 :             :  * Get latest WAL record end pointer
   10110                 :             :  */
   10111                 :             : XLogRecPtr
   10112                 :        1676 : GetXLogInsertEndRecPtr(void)
   10113                 :             : {
   10114                 :        1676 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
   10115                 :             :     uint64      current_bytepos;
   10116                 :             : 
   10117                 :        1676 :     SpinLockAcquire(&Insert->insertpos_lck);
   10118                 :        1676 :     current_bytepos = Insert->CurrBytePos;
   10119                 :        1676 :     SpinLockRelease(&Insert->insertpos_lck);
   10120                 :             : 
   10121                 :        1676 :     return XLogBytePosToEndRecPtr(current_bytepos);
   10122                 :             : }
   10123                 :             : 
   10124                 :             : /*
   10125                 :             :  * Get latest WAL write pointer
   10126                 :             :  */
   10127                 :             : XLogRecPtr
   10128                 :        3587 : GetXLogWriteRecPtr(void)
   10129                 :             : {
   10130                 :        3587 :     RefreshXLogWriteResult(LogwrtResult);
   10131                 :             : 
   10132                 :        3587 :     return LogwrtResult.Write;
   10133                 :             : }
   10134                 :             : 
   10135                 :             : /*
   10136                 :             :  * Returns the redo pointer of the last checkpoint or restartpoint. This is
   10137                 :             :  * the oldest point in WAL that we still need, if we have to restart recovery.
   10138                 :             :  */
   10139                 :             : void
   10140                 :         404 : GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
   10141                 :             : {
   10142                 :         404 :     LWLockAcquire(ControlFileLock, LW_SHARED);
   10143                 :         404 :     *oldrecptr = ControlFile->checkPointCopy.redo;
   10144                 :         404 :     *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
   10145                 :         404 :     LWLockRelease(ControlFileLock);
   10146                 :         404 : }
   10147                 :             : 
   10148                 :             : /* Thin wrapper around ShutdownWalRcv(). */
   10149                 :             : void
   10150                 :        1085 : XLogShutdownWalRcv(void)
   10151                 :             : {
   10152                 :             :     Assert(AmStartupProcess() || !IsUnderPostmaster);
   10153                 :             : 
   10154                 :        1085 :     ShutdownWalRcv();
   10155                 :        1085 :     ResetInstallXLogFileSegmentActive();
   10156                 :        1085 : }
   10157                 :             : 
   10158                 :             : /* Enable WAL file recycling and preallocation. */
   10159                 :             : void
   10160                 :        1302 : SetInstallXLogFileSegmentActive(void)
   10161                 :             : {
   10162                 :        1302 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
   10163                 :        1302 :     XLogCtl->InstallXLogFileSegmentActive = true;
   10164                 :        1302 :     LWLockRelease(ControlFileLock);
   10165                 :        1302 : }
   10166                 :             : 
   10167                 :             : /* Disable WAL file recycling and preallocation. */
   10168                 :             : void
   10169                 :        1265 : ResetInstallXLogFileSegmentActive(void)
   10170                 :             : {
   10171                 :        1265 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
   10172                 :        1265 :     XLogCtl->InstallXLogFileSegmentActive = false;
   10173                 :        1265 :     LWLockRelease(ControlFileLock);
   10174                 :        1265 : }
   10175                 :             : 
   10176                 :             : bool
   10177                 :           0 : IsInstallXLogFileSegmentActive(void)
   10178                 :             : {
   10179                 :             :     bool        result;
   10180                 :             : 
   10181                 :           0 :     LWLockAcquire(ControlFileLock, LW_SHARED);
   10182                 :           0 :     result = XLogCtl->InstallXLogFileSegmentActive;
   10183                 :           0 :     LWLockRelease(ControlFileLock);
   10184                 :             : 
   10185                 :           0 :     return result;
   10186                 :             : }
   10187                 :             : 
   10188                 :             : /*
   10189                 :             :  * Update the WalWriterSleeping flag.
   10190                 :             :  */
   10191                 :             : void
   10192                 :         589 : SetWalWriterSleeping(bool sleeping)
   10193                 :             : {
   10194                 :         589 :     SpinLockAcquire(&XLogCtl->info_lck);
   10195                 :         589 :     XLogCtl->WalWriterSleeping = sleeping;
   10196                 :         589 :     SpinLockRelease(&XLogCtl->info_lck);
   10197                 :         589 : }
        

Generated by: LCOV version 2.0-1