Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * xlog.c
4 : * PostgreSQL write-ahead log manager
5 : *
6 : * The Write-Ahead Log (WAL) functionality is split into several source
7 : * files, in addition to this one:
8 : *
9 : * xloginsert.c - Functions for constructing WAL records
10 : * xlogrecovery.c - WAL recovery and standby code
11 : * xlogreader.c - Facility for reading WAL files and parsing WAL records
12 : * xlogutils.c - Helper functions for WAL redo routines
13 : *
14 : * This file contains functions for coordinating database startup and
15 : * checkpointing, and managing the write-ahead log buffers when the
16 : * system is running.
17 : *
18 : * StartupXLOG() is the main entry point of the startup process. It
19 : * coordinates database startup, performing WAL recovery, and the
20 : * transition from WAL recovery into normal operations.
21 : *
22 : * XLogInsertRecord() inserts a WAL record into the WAL buffers. Most
23 : * callers should not call this directly, but use the functions in
24 : * xloginsert.c to construct the WAL record. XLogFlush() can be used
25 : * to force the WAL to disk.
26 : *
27 : * In addition to those, there are many other functions for interrogating
28 : * the current system state, and for starting/stopping backups.
29 : *
30 : *
31 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
32 : * Portions Copyright (c) 1994, Regents of the University of California
33 : *
34 : * src/backend/access/transam/xlog.c
35 : *
36 : *-------------------------------------------------------------------------
37 : */
38 :
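 : /*
 : * Editor's note: an illustrative sketch (not part of xlog.c) of the call
 : * sequence described above.  A caller builds its record with the
 : * xloginsert.c API rather than calling XLogInsertRecord() directly, and
 : * forces the WAL to disk only when durability requires it; "payload" is a
 : * hypothetical struct standing in for a real resource manager's data.
 : *
 : *     XLogRecPtr  lsn;
 : *
 : *     XLogBeginInsert();
 : *     XLogRegisterData((char *) &payload, sizeof(payload));
 : *     lsn = XLogInsert(RM_XLOG_ID, XLOG_NOOP);   (calls XLogInsertRecord)
 : *     XLogFlush(lsn);                            (log before data)
 : */
 :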
39 : #include "postgres.h"
40 :
41 : #include <ctype.h>
42 : #include <math.h>
43 : #include <time.h>
44 : #include <fcntl.h>
45 : #include <sys/stat.h>
46 : #include <sys/time.h>
47 : #include <unistd.h>
48 :
49 : #include "access/clog.h"
50 : #include "access/commit_ts.h"
51 : #include "access/heaptoast.h"
52 : #include "access/multixact.h"
53 : #include "access/rewriteheap.h"
54 : #include "access/subtrans.h"
55 : #include "access/timeline.h"
56 : #include "access/transam.h"
57 : #include "access/twophase.h"
58 : #include "access/xact.h"
59 : #include "access/xlog_internal.h"
60 : #include "access/xlogarchive.h"
61 : #include "access/xloginsert.h"
62 : #include "access/xlogreader.h"
63 : #include "access/xlogrecovery.h"
64 : #include "access/xlogutils.h"
65 : #include "backup/basebackup.h"
66 : #include "catalog/catversion.h"
67 : #include "catalog/pg_control.h"
68 : #include "catalog/pg_database.h"
69 : #include "common/controldata_utils.h"
70 : #include "common/file_utils.h"
71 : #include "executor/instrument.h"
72 : #include "miscadmin.h"
73 : #include "pg_trace.h"
74 : #include "pgstat.h"
75 : #include "port/atomics.h"
76 : #include "postmaster/bgwriter.h"
77 : #include "postmaster/startup.h"
78 : #include "postmaster/walsummarizer.h"
79 : #include "postmaster/walwriter.h"
80 : #include "replication/origin.h"
81 : #include "replication/slot.h"
82 : #include "replication/snapbuild.h"
83 : #include "replication/walreceiver.h"
84 : #include "replication/walsender.h"
85 : #include "storage/bufmgr.h"
86 : #include "storage/fd.h"
87 : #include "storage/ipc.h"
88 : #include "storage/large_object.h"
89 : #include "storage/latch.h"
90 : #include "storage/predicate.h"
91 : #include "storage/proc.h"
92 : #include "storage/procarray.h"
93 : #include "storage/reinit.h"
94 : #include "storage/spin.h"
95 : #include "storage/sync.h"
96 : #include "utils/guc_hooks.h"
97 : #include "utils/guc_tables.h"
98 : #include "utils/injection_point.h"
99 : #include "utils/pgstat_internal.h"
100 : #include "utils/ps_status.h"
101 : #include "utils/relmapper.h"
102 : #include "utils/snapmgr.h"
103 : #include "utils/timeout.h"
104 : #include "utils/timestamp.h"
105 : #include "utils/varlena.h"
106 :
107 : #ifdef WAL_DEBUG
108 : #include "utils/memutils.h"
109 : #endif
110 :
111 : /* timeline ID to be used when bootstrapping */
112 : #define BootstrapTimeLineID 1
113 :
114 : /* User-settable parameters */
115 : int max_wal_size_mb = 1024; /* 1 GB */
116 : int min_wal_size_mb = 80; /* 80 MB */
117 : int wal_keep_size_mb = 0;
118 : int XLOGbuffers = -1;
119 : int XLogArchiveTimeout = 0;
120 : int XLogArchiveMode = ARCHIVE_MODE_OFF;
121 : char *XLogArchiveCommand = NULL;
122 : bool EnableHotStandby = false;
123 : bool fullPageWrites = true;
124 : bool wal_log_hints = false;
125 : int wal_compression = WAL_COMPRESSION_NONE;
126 : char *wal_consistency_checking_string = NULL;
127 : bool *wal_consistency_checking = NULL;
128 : bool wal_init_zero = true;
129 : bool wal_recycle = true;
130 : bool log_checkpoints = true;
131 : int wal_sync_method = DEFAULT_WAL_SYNC_METHOD;
132 : int wal_level = WAL_LEVEL_REPLICA;
133 : int CommitDelay = 0; /* precommit delay in microseconds */
134 : int CommitSiblings = 5; /* # concurrent xacts needed to sleep */
135 : int wal_retrieve_retry_interval = 5000;
136 : int max_slot_wal_keep_size_mb = -1;
137 : int wal_decode_buffer_size = 512 * 1024;
138 : bool track_wal_io_timing = false;
139 :
140 : #ifdef WAL_DEBUG
141 : bool XLOG_DEBUG = false;
142 : #endif
143 :
144 : int wal_segment_size = DEFAULT_XLOG_SEG_SIZE;
145 :
146 : /*
147 : * Number of WAL insertion locks to use. A higher value allows more insertions
148 : * to happen concurrently, but adds some CPU overhead to flushing the WAL,
149 : * which needs to iterate all the locks.
150 : */
151 : #define NUM_XLOGINSERT_LOCKS 8
152 :
153 : /*
154 : * Max distance from last checkpoint, before triggering a new xlog-based
155 : * checkpoint.
156 : */
157 : int CheckPointSegments;
158 :
159 : /* Estimated distance between checkpoints, in bytes */
160 : static double CheckPointDistanceEstimate = 0;
161 : static double PrevCheckPointDistance = 0;
162 :
163 : /*
164 : * Track whether there were any deferred checks for custom resource managers
165 : * specified in wal_consistency_checking.
166 : */
167 : static bool check_wal_consistency_checking_deferred = false;
168 :
169 : /*
170 : * GUC support
171 : */
172 : const struct config_enum_entry wal_sync_method_options[] = {
173 : {"fsync", WAL_SYNC_METHOD_FSYNC, false},
174 : #ifdef HAVE_FSYNC_WRITETHROUGH
175 : {"fsync_writethrough", WAL_SYNC_METHOD_FSYNC_WRITETHROUGH, false},
176 : #endif
177 : {"fdatasync", WAL_SYNC_METHOD_FDATASYNC, false},
178 : #ifdef O_SYNC
179 : {"open_sync", WAL_SYNC_METHOD_OPEN, false},
180 : #endif
181 : #ifdef O_DSYNC
182 : {"open_datasync", WAL_SYNC_METHOD_OPEN_DSYNC, false},
183 : #endif
184 : {NULL, 0, false}
185 : };
186 :
187 :
188 : /*
189 : * Although only "on", "off", and "always" are documented,
190 : * we accept all the likely variants of "on" and "off".
191 : */
192 : const struct config_enum_entry archive_mode_options[] = {
193 : {"always", ARCHIVE_MODE_ALWAYS, false},
194 : {"on", ARCHIVE_MODE_ON, false},
195 : {"off", ARCHIVE_MODE_OFF, false},
196 : {"true", ARCHIVE_MODE_ON, true},
197 : {"false", ARCHIVE_MODE_OFF, true},
198 : {"yes", ARCHIVE_MODE_ON, true},
199 : {"no", ARCHIVE_MODE_OFF, true},
200 : {"1", ARCHIVE_MODE_ON, true},
201 : {"0", ARCHIVE_MODE_OFF, true},
202 : {NULL, 0, false}
203 : };
204 :
205 : /*
206 : * Statistics for current checkpoint are collected in this global struct.
207 : * Because only the checkpointer or a stand-alone backend can perform
208 : * checkpoints, this will be unused in normal backends.
209 : */
210 : CheckpointStatsData CheckpointStats;
211 :
212 : /*
213 : * During recovery, lastFullPageWrites keeps track of full_page_writes that
214 : * the replayed WAL records indicate. It's initialized with full_page_writes
215 : * that the recovery starting checkpoint record indicates, and then updated
216 : * each time an XLOG_FPW_CHANGE record is replayed.
217 : */
218 : static bool lastFullPageWrites;
219 :
220 : /*
221 : * Local copy of the state tracked by SharedRecoveryState in shared memory,
222 : * It is false if SharedRecoveryState is RECOVERY_STATE_DONE. True actually
223 : * means "not known, need to check the shared state".
224 : */
225 : static bool LocalRecoveryInProgress = true;
226 :
227 : /*
228 : * Local state for XLogInsertAllowed():
229 : * 1: unconditionally allowed to insert XLOG
230 : * 0: unconditionally not allowed to insert XLOG
231 : * -1: must check RecoveryInProgress(); disallow until it is false
232 : * Most processes start with -1 and transition to 1 after seeing that recovery
233 : * is not in progress. But we can also force the value for special cases.
234 : * The coding in XLogInsertAllowed() depends on the first two of these states
235 : * being numerically the same as bool true and false.
236 : */
237 : static int LocalXLogInsertAllowed = -1;
238 :
239 : /*
240 : * ProcLastRecPtr points to the start of the last XLOG record inserted by the
241 : * current backend. It is updated for all inserts. XactLastRecEnd points to
242 : * end+1 of the last record, and is reset when we end a top-level transaction,
243 : * or start a new one; so it can be used to tell if the current transaction has
244 : * created any XLOG records.
245 : *
246 : * While in parallel mode, this may not be fully up to date. When committing,
247 : * a transaction can assume this covers all xlog records written either by the
248 : * user backend or by any parallel worker which was present at any point during
249 : * the transaction. But when aborting, or when still in parallel mode, other
250 : * parallel backends may have written WAL records at later LSNs than the value
251 : * stored here. The parallel leader advances its own copy, when necessary,
252 : * in WaitForParallelWorkersToFinish.
253 : */
254 : XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr;
255 : XLogRecPtr XactLastRecEnd = InvalidXLogRecPtr;
256 : XLogRecPtr XactLastCommitEnd = InvalidXLogRecPtr;
257 :
258 : /*
259 : * RedoRecPtr is this backend's local copy of the REDO record pointer
260 : * (which is almost but not quite the same as a pointer to the most recent
261 : * CHECKPOINT record). We update this from the shared-memory copy,
262 : * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
263 : * hold an insertion lock). See XLogInsertRecord for details. We are also
264 : * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
265 : * see GetRedoRecPtr.
266 : *
267 : * NB: Code that uses this variable must be prepared not only for the
268 : * possibility that it may be arbitrarily out of date, but also for the
269 : * possibility that it might be set to InvalidXLogRecPtr. We used to
270 : * initialize it as a side effect of the first call to RecoveryInProgress(),
271 : * which meant that most code that might use it could assume that it had a
272 : * real if perhaps stale value. That's no longer the case.
273 : */
274 : static XLogRecPtr RedoRecPtr;
275 :
276 : /*
277 : * doPageWrites is this backend's local copy of (fullPageWrites ||
278 : * runningBackups > 0). It is used together with RedoRecPtr to decide whether
279 : * a full-page image of a page need to be taken.
280 : *
281 : * NB: Initially this is false, and there's no guarantee that it will be
282 : * initialized to any other value before it is first used. Any code that
283 : * makes use of it must recheck the value after obtaining a WALInsertLock,
284 : * and respond appropriately if it turns out that the previous value wasn't
285 : * accurate.
286 : */
287 : static bool doPageWrites;
288 :
289 : /*----------
290 : * Shared-memory data structures for XLOG control
291 : *
292 : * LogwrtRqst indicates a byte position that we need to write and/or fsync
293 : * the log up to (all records before that point must be written or fsynced).
294 : * The positions already written/fsynced are maintained in logWriteResult
295 : * and logFlushResult using atomic access.
296 : * In addition to the shared variables, each backend has a private copy of
297 : * both in LogwrtResult, which is updated when convenient.
298 : *
299 : * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
300 : * (protected by info_lck), but we don't need to cache any copies of it.
301 : *
302 : * info_lck is only held long enough to read/update the protected variables,
303 : * so it's a plain spinlock. The other locks are held longer (potentially
304 : * over I/O operations), so we use LWLocks for them. These locks are:
305 : *
306 : * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
307 : * XLogFlush).
308 : *
309 : * ControlFileLock: must be held to read/update control file or create
310 : * new log file.
311 : *
312 : *----------
313 : */
314 :
315 : typedef struct XLogwrtRqst
316 : {
317 : XLogRecPtr Write; /* last byte + 1 to write out */
318 : XLogRecPtr Flush; /* last byte + 1 to flush */
319 : } XLogwrtRqst;
320 :
321 : typedef struct XLogwrtResult
322 : {
323 : XLogRecPtr Write; /* last byte + 1 written out */
324 : XLogRecPtr Flush; /* last byte + 1 flushed */
325 : } XLogwrtResult;
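 :
 : /*
 : * Editor's note: a worked statement of the invariant the above bookkeeping
 : * maintains (my reading of the comments in this file, not normative):
 : *
 : *     logFlushResult <= logWriteResult <= logInsertResult
 : *
 : * Nothing can be fsynced before it is written, and nothing can be written
 : * before it has been copied into the WAL buffers; the Rqst values may run
 : * ahead of the Result values, never the other way around.
 : */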
326 :
327 : /*
328 : * Inserting to WAL is protected by a small fixed number of WAL insertion
329 : * locks. To insert to the WAL, you must hold one of the locks - it doesn't
330 : * matter which one. To lock out other concurrent insertions, you must hold
331 : * all of them. Each WAL insertion lock consists of a lightweight lock, plus an
332 : * indicator of how far the insertion has progressed (insertingAt).
333 : *
334 : * The insertingAt values are read when a process wants to flush WAL from
335 : * the in-memory buffers to disk, to check that all the insertions to the
336 : * region the process is about to write out have finished. You could simply
337 : * wait for all currently in-progress insertions to finish, but the
338 : * insertingAt indicator allows you to ignore insertions later in the WAL,
339 : * so that you only wait for the insertions that are modifying the buffers
340 : * you're about to write out.
341 : *
342 : * This isn't just an optimization. If all the WAL buffers are dirty, an
343 : * inserter that's holding a WAL insert lock might need to evict an old WAL
344 : * buffer, which requires flushing the WAL. If it's possible for an inserter
345 : * to block on another inserter unnecessarily, deadlock can arise when two
346 : * inserters holding a WAL insert lock wait for each other to finish their
347 : * insertion.
348 : *
349 : * Small WAL records that don't cross a page boundary never update the value;
350 : * the WAL record is just copied to the page and the lock is released. But
351 : * to avoid the deadlock-scenario explained above, the indicator is always
352 : * updated before sleeping while holding an insertion lock.
353 : *
354 : * lastImportantAt contains the LSN of the last important WAL record inserted
355 : * using a given lock. This value is used to detect if there has been
356 : * important WAL activity since the last time some action, like a checkpoint,
357 : * was performed - allowing the action to be skipped if there was none. The LSN is
358 : * updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
359 : * set. lastImportantAt is never cleared, only overwritten by the LSN of newer
360 : * records. Tracking the WAL activity directly in WALInsertLock has the
361 : * advantage of not needing any additional locks to update the value.
362 : */
363 : typedef struct
364 : {
365 : LWLock lock;
366 : pg_atomic_uint64 insertingAt;
367 : XLogRecPtr lastImportantAt;
368 : } WALInsertLock;
369 :
370 : /*
371 : * All the WAL insertion locks are allocated as an array in shared memory. We
372 : * force the array stride to be a power of 2, which saves a few cycles in
373 : * indexing, but more importantly also ensures that individual slots don't
374 : * cross cache line boundaries. (Of course, we have to also ensure that the
375 : * array start address is suitably aligned.)
376 : */
377 : typedef union WALInsertLockPadded
378 : {
379 : WALInsertLock l;
380 : char pad[PG_CACHE_LINE_SIZE];
381 : } WALInsertLockPadded;
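 :
 : /*
 : * Editor's sketch of the insertion-lock protocol described above, using
 : * the static helpers declared later in this file (a hand-wavy outline,
 : * not a verbatim excerpt):
 : *
 : *     WALInsertLockAcquire();        (grab one of NUM_XLOGINSERT_LOCKS locks)
 : *     ...copy record data into the WAL buffers...
 : *     WALInsertLockUpdateInsertingAt(CurrPos);  (publish progress before any
 : *                                                sleep that may evict buffers)
 : *     WALInsertLockRelease();        (resets insertingAt to 0)
 : */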
382 :
383 : /*
384 : * Session status of running backup, used for sanity checks in SQL-callable
385 : * functions to start and stop backups.
386 : */
387 : static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
388 :
389 : /*
390 : * Shared state data for WAL insertion.
391 : */
392 : typedef struct XLogCtlInsert
393 : {
394 : slock_t insertpos_lck; /* protects CurrBytePos and PrevBytePos */
395 :
396 : /*
397 : * CurrBytePos is the end of reserved WAL. The next record will be
398 : * inserted at that position. PrevBytePos is the start position of the
399 : * previously inserted (or rather, reserved) record - it is copied to the
400 : * prev-link of the next record. These are stored as "usable byte
401 : * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
402 : */
403 : uint64 CurrBytePos;
404 : uint64 PrevBytePos;
405 :
406 : /*
407 : * Make sure the above heavily-contended spinlock and byte positions are
408 : * on their own cache line. In particular, the RedoRecPtr and full page
409 : * write variables below should be on a different cache line. They are
410 : * read on every WAL insertion, but updated rarely, and we don't want
411 : * those reads to steal the cache line containing Curr/PrevBytePos.
412 : */
413 : char pad[PG_CACHE_LINE_SIZE];
414 :
415 : /*
416 : * fullPageWrites is the authoritative value used by all backends to
417 : * determine whether to write a full-page image to WAL. This shared value,
418 : * instead of the process-local fullPageWrites, is required because, when
419 : * full_page_writes is changed by SIGHUP, we must WAL-log it before it
420 : * actually affects WAL-logging by backends. The checkpointer sets it at
421 : * startup or after SIGHUP.
422 : *
423 : * To read these fields, you must hold an insertion lock. To modify them,
424 : * you must hold ALL the locks.
425 : */
426 : XLogRecPtr RedoRecPtr; /* current redo point for insertions */
427 : bool fullPageWrites;
428 :
429 : /*
430 : * runningBackups is a counter indicating the number of backups currently
431 : * in progress. lastBackupStart is the latest checkpoint redo location
432 : * used as a starting point for an online backup.
433 : */
434 : int runningBackups;
435 : XLogRecPtr lastBackupStart;
436 :
437 : /*
438 : * WAL insertion locks.
439 : */
440 : WALInsertLockPadded *WALInsertLocks;
441 : } XLogCtlInsert;
442 :
443 : /*
444 : * Total shared-memory state for XLOG.
445 : */
446 : typedef struct XLogCtlData
447 : {
448 : XLogCtlInsert Insert;
449 :
450 : /* Protected by info_lck: */
451 : XLogwrtRqst LogwrtRqst;
452 : XLogRecPtr RedoRecPtr; /* a recent copy of Insert->RedoRecPtr */
453 : XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */
454 : XLogRecPtr replicationSlotMinLSN; /* oldest LSN needed by any slot */
455 :
456 : XLogSegNo lastRemovedSegNo; /* latest removed/recycled XLOG segment */
457 :
458 : /* Fake LSN counter, for unlogged relations. */
459 : pg_atomic_uint64 unloggedLSN;
460 :
461 : /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
462 : pg_time_t lastSegSwitchTime;
463 : XLogRecPtr lastSegSwitchLSN;
464 :
465 : /* These are accessed using atomics -- info_lck not needed */
466 : pg_atomic_uint64 logInsertResult; /* last byte + 1 inserted to buffers */
467 : pg_atomic_uint64 logWriteResult; /* last byte + 1 written out */
468 : pg_atomic_uint64 logFlushResult; /* last byte + 1 flushed */
469 :
470 : /*
471 : * First initialized page in the cache (first byte position).
472 : */
473 : XLogRecPtr InitializedFrom;
474 :
475 : /*
476 : * Latest page reserved for initialization in the cache (last byte
477 : * position + 1).
478 : *
479 : * To change the identity of a buffer, you need to advance
480 : * InitializeReserved first. To change the identity of a buffer that's
481 : * still dirty, the old page needs to be written out first, and for that
482 : * you need WALWriteLock, and you need to ensure that there are no
483 : * in-progress insertions to the page by calling
484 : * WaitXLogInsertionsToFinish().
485 : */
486 : pg_atomic_uint64 InitializeReserved;
487 :
488 : /*
489 : * Latest initialized page in the cache (last byte position + 1).
490 : *
491 : * InitializedUpTo is updated after the buffer initialization. After
492 : * an update, waiters are notified using InitializedUpToCondVar.
493 : */
494 : pg_atomic_uint64 InitializedUpTo;
495 : ConditionVariable InitializedUpToCondVar;
496 :
497 : /*
498 : * These values do not change after startup, although the pointed-to pages
499 : * and xlblocks values certainly do. xlblocks values are changed
500 : * lock-free according to the check for the xlog write position and are
501 : * accompanied by changes of InitializeReserved and InitializedUpTo.
502 : */
503 : char *pages; /* buffers for unwritten XLOG pages */
504 : pg_atomic_uint64 *xlblocks; /* 1st byte position of each page + XLOG_BLCKSZ */
505 : int XLogCacheBlck; /* highest allocated xlog buffer index */
506 :
507 : /*
508 : * InsertTimeLineID is the timeline into which new WAL is being inserted
509 : * and flushed. It is zero during recovery, and does not change once set.
510 : *
511 : * If we created a new timeline when the system was started up,
512 : * PrevTimeLineID is the old timeline's ID that we forked off from.
513 : * Otherwise it's equal to InsertTimeLineID.
514 : *
515 : * We set these fields while holding info_lck. Most that reads these
516 : * values knows that recovery is no longer in progress and so can safely
517 : * read the value without a lock, but code that could be run either during
518 : * or after recovery can take info_lck while reading these values.
519 : */
520 : TimeLineID InsertTimeLineID;
521 : TimeLineID PrevTimeLineID;
522 :
523 : /*
524 : * SharedRecoveryState indicates if we're still in crash or archive
525 : * recovery. Protected by info_lck.
526 : */
527 : RecoveryState SharedRecoveryState;
528 :
529 : /*
530 : * InstallXLogFileSegmentActive indicates whether the checkpointer should
531 : * arrange for future segments by recycling and/or PreallocXlogFiles().
532 : * Protected by ControlFileLock. Only the startup process changes it. If
533 : * true, anyone can use InstallXLogFileSegment(). If false, the startup
534 : * process owns the exclusive right to install segments, by reading from
535 : * the archive and possibly replacing existing files.
536 : */
537 : bool InstallXLogFileSegmentActive;
538 :
539 : /*
540 : * WalWriterSleeping indicates whether the WAL writer is currently in
541 : * low-power mode (and hence should be nudged if an async commit occurs).
542 : * Protected by info_lck.
543 : */
544 : bool WalWriterSleeping;
545 :
546 : /*
547 : * During recovery, we keep a copy of the latest checkpoint record here.
548 : * lastCheckPointRecPtr points to start of checkpoint record and
549 : * lastCheckPointEndPtr points to end+1 of checkpoint record. Used by the
550 : * checkpointer when it wants to create a restartpoint.
551 : *
552 : * Protected by info_lck.
553 : */
554 : XLogRecPtr lastCheckPointRecPtr;
555 : XLogRecPtr lastCheckPointEndPtr;
556 : CheckPoint lastCheckPoint;
557 :
558 : /*
559 : * lastFpwDisableRecPtr points to the start of the last replayed
560 : * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
561 : */
562 : XLogRecPtr lastFpwDisableRecPtr;
563 :
564 : slock_t info_lck; /* locks shared variables shown above */
565 : } XLogCtlData;
566 :
567 : /*
568 : * Classification of XLogInsertRecord operations.
569 : */
570 : typedef enum
571 : {
572 : WALINSERT_NORMAL,
573 : WALINSERT_SPECIAL_SWITCH,
574 : WALINSERT_SPECIAL_CHECKPOINT
575 : } WalInsertClass;
576 :
577 : static XLogCtlData *XLogCtl = NULL;
578 :
579 : /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
580 : static WALInsertLockPadded *WALInsertLocks = NULL;
581 :
582 : /*
583 : * We maintain an image of pg_control in shared memory.
584 : */
585 : static ControlFileData *ControlFile = NULL;
586 :
587 : /*
588 : * Calculate the amount of space left on the page after 'endptr'. Beware
589 : * multiple evaluation!
590 : */
591 : #define INSERT_FREESPACE(endptr) \
592 : (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
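 :
 : /*
 : * Editor's note: because the macro argument is evaluated more than once,
 : * passing an expression with side effects would be wrong, e.g.
 : *
 : *     freespace = INSERT_FREESPACE(CurrPos++);   (WRONG: CurrPos bumped twice)
 : *
 : * Callers in this file always pass a plain variable.
 : */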
593 :
594 : /* Macro to advance to next buffer index. */
595 : #define NextBufIdx(idx) \
596 : (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
597 :
598 : /*
599 : * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
600 : * would hold if it were in cache, the page containing 'recptr'.
601 : */
602 : #define XLogRecPtrToBufIdx(recptr) \
603 : (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
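 :
 : /*
 : * Editor's worked example (assuming XLOG_BLCKSZ = 8192 and 512 WAL
 : * buffers, so XLogCacheBlck = 511): recptr 0x2000000 = 33554432 lies on
 : * page 33554432 / 8192 = 4096, and 4096 % 512 = 0, so that page maps to
 : * buffer index 0.
 : */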
604 :
605 : /*
606 : * These are the number of bytes in a WAL page usable for WAL data.
607 : */
608 : #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
609 :
610 : /*
611 : * Convert values of GUCs measured in megabytes to equiv. segment count.
612 : * Rounds down.
613 : */
614 : #define ConvertToXSegs(x, segsize) XLogMBVarToSegs((x), (segsize))
615 :
616 : /* The number of bytes in a WAL segment usable for WAL data. */
617 : static int UsableBytesInSegment;
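 :
 : /*
 : * Editor's worked example for the "usable bytes" accounting (my
 : * arithmetic, assuming the default 8 kB pages and 16 MB segments, where
 : * SizeOfXLogShortPHD is 24 bytes and SizeOfXLogLongPHD is 40 bytes):
 : *
 : *     UsableBytesInPage    = 8192 - 24 = 8168
 : *     UsableBytesInSegment = 2048 pages * 8168
 : *                            - (40 - 24) extra for the first (long) header
 : *                          = 16728048
 : */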
618 :
619 : /*
620 : * Private, possibly out-of-date copy of shared LogwrtResult.
621 : * See discussion above.
622 : */
623 : static XLogwrtResult LogwrtResult = {0, 0};
624 :
625 : /*
626 : * Update local copy of shared XLogCtl->log{Write,Flush}Result
627 : *
628 : * It's critical that Flush always trails Write, so the order of the reads is
629 : * important, as is the barrier. See also XLogWrite.
630 : */
631 : #define RefreshXLogWriteResult(_target) \
632 : do { \
633 : _target.Flush = pg_atomic_read_u64(&XLogCtl->logFlushResult); \
634 : pg_read_barrier(); \
635 : _target.Write = pg_atomic_read_u64(&XLogCtl->logWriteResult); \
636 : } while (0)
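 :
 : /*
 : * Editor's note on the barrier pairing (my reading of "See also
 : * XLogWrite"): the writer side advances Write first, issues a write
 : * barrier, then advances Flush; the reader above loads Flush first and
 : * Write only after the read barrier.  A reader therefore can never
 : * observe Flush > Write, preserving the invariant that Flush trails
 : * Write.
 : */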
637 :
638 : /*
639 : * openLogFile is -1 or a kernel FD for an open log file segment.
640 : * openLogSegNo identifies the segment, and openLogTLI the corresponding TLI.
641 : * These variables are only used to write the XLOG, and so will normally refer
642 : * to the active segment.
643 : *
644 : * Note: call Reserve/ReleaseExternalFD to track consumption of this FD.
645 : */
646 : static int openLogFile = -1;
647 : static XLogSegNo openLogSegNo = 0;
648 : static TimeLineID openLogTLI = 0;
649 :
650 : /*
651 : * Local copies of equivalent fields in the control file. When running
652 : * crash recovery, LocalMinRecoveryPoint is set to InvalidXLogRecPtr as we
653 : * expect to replay all the WAL available, and updateMinRecoveryPoint is
654 : * switched to false to prevent any updates while replaying records.
655 : * Those values are kept consistent as long as crash recovery runs.
656 : */
657 : static XLogRecPtr LocalMinRecoveryPoint;
658 : static TimeLineID LocalMinRecoveryPointTLI;
659 : static bool updateMinRecoveryPoint = true;
660 :
661 : /* For WALInsertLockAcquire/Release functions */
662 : static int MyLockNo = 0;
663 : static bool holdingAllLocks = false;
664 :
665 : #ifdef WAL_DEBUG
666 : static MemoryContext walDebugCxt = NULL;
667 : #endif
668 :
669 : static void CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI,
670 : XLogRecPtr EndOfLog,
671 : TimeLineID newTLI);
672 : static void CheckRequiredParameterValues(void);
673 : static void XLogReportParameters(void);
674 : static int LocalSetXLogInsertAllowed(void);
675 : static void CreateEndOfRecoveryRecord(void);
676 : static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn,
677 : XLogRecPtr pagePtr,
678 : TimeLineID newTLI);
679 : static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
680 : static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
681 : static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
682 :
683 : static void AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli,
684 : bool opportunistic);
685 : static void XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible);
686 : static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
687 : bool find_free, XLogSegNo max_segno,
688 : TimeLineID tli);
689 : static void XLogFileClose(void);
690 : static void PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli);
691 : static void RemoveTempXlogFiles(void);
692 : static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr,
693 : XLogRecPtr endptr, TimeLineID insertTLI);
694 : static void RemoveXlogFile(const struct dirent *segment_de,
695 : XLogSegNo recycleSegNo, XLogSegNo *endlogSegNo,
696 : TimeLineID insertTLI);
697 : static void UpdateLastRemovedPtr(char *filename);
698 : static void ValidateXLOGDirectoryStructure(void);
699 : static void CleanupBackupHistory(void);
700 : static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
701 : static bool PerformRecoveryXLogAction(void);
702 : static void InitControlFile(uint64 sysidentifier, uint32 data_checksum_version);
703 : static void WriteControlFile(void);
704 : static void ReadControlFile(void);
705 : static void UpdateControlFile(void);
706 : static char *str_time(pg_time_t tnow, char *buf, size_t bufsize);
707 :
708 : static int get_sync_bit(int method);
709 :
710 : static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
711 : XLogRecData *rdata,
712 : XLogRecPtr StartPos, XLogRecPtr EndPos,
713 : TimeLineID tli);
714 : static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
715 : XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
716 : static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
717 : XLogRecPtr *PrevPtr);
718 : static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
719 : static char *GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli);
720 : static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
721 : static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
722 : static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
723 :
724 : static void WALInsertLockAcquire(void);
725 : static void WALInsertLockAcquireExclusive(void);
726 : static void WALInsertLockRelease(void);
727 : static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
728 :
729 : /*
730 : * Insert an XLOG record represented by an already-constructed chain of data
731 : * chunks. This is a low-level routine; to construct the WAL record header
732 : * and data, use the higher-level routines in xloginsert.c.
733 : *
734 : * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
735 : * WAL record applies to, that were not included in the record as full page
736 : * images. If fpw_lsn <= RedoRecPtr, the function does not perform the
737 : * insertion and returns InvalidXLogRecPtr. The caller can then recalculate
738 : * which pages need a full-page image, and retry. If fpw_lsn is invalid, the
739 : * record is always inserted.
740 : *
741 : * 'flags' gives more in-depth control on the record being inserted. See
742 : * XLogSetRecordFlags() for details.
743 : *
744 : * 'topxid_included' tells whether the top-transaction id is logged along with
745 : * current subtransaction. See XLogRecordAssemble().
746 : *
747 : * The first XLogRecData in the chain must be for the record header, and its
748 : * data must be MAXALIGNed. XLogInsertRecord fills in the xl_prev and
749 : * xl_crc fields in the header, the rest of the header must already be filled
750 : * by the caller.
751 : *
752 : * Returns XLOG pointer to end of record (beginning of next record).
753 : * This can be used as LSN for data pages affected by the logged action.
754 : * (LSN is the XLOG point up to which the XLOG must be flushed to disk
755 : * before the data page can be written out. This implements the basic
756 : * WAL rule "write the log before the data".)
757 : */
758 : XLogRecPtr
759 29081848 : XLogInsertRecord(XLogRecData *rdata,
760 : XLogRecPtr fpw_lsn,
761 : uint8 flags,
762 : int num_fpi,
763 : bool topxid_included)
764 : {
765 29081848 : XLogCtlInsert *Insert = &XLogCtl->Insert;
766 : pg_crc32c rdata_crc;
767 : bool inserted;
768 29081848 : XLogRecord *rechdr = (XLogRecord *) rdata->data;
769 29081848 : uint8 info = rechdr->xl_info & ~XLR_INFO_MASK;
770 29081848 : WalInsertClass class = WALINSERT_NORMAL;
771 : XLogRecPtr StartPos;
772 : XLogRecPtr EndPos;
773 29081848 : bool prevDoPageWrites = doPageWrites;
774 : TimeLineID insertTLI;
775 :
776 : /* Does this record type require special handling? */
777 29081848 : if (unlikely(rechdr->xl_rmid == RM_XLOG_ID))
778 : {
779 437904 : if (info == XLOG_SWITCH)
780 1546 : class = WALINSERT_SPECIAL_SWITCH;
781 436358 : else if (info == XLOG_CHECKPOINT_REDO)
782 1788 : class = WALINSERT_SPECIAL_CHECKPOINT;
783 : }
784 :
785 : /* we assume that all of the record header is in the first chunk */
786 : Assert(rdata->len >= SizeOfXLogRecord);
787 :
788 : /* cross-check on whether we should be here or not */
789 29081848 : if (!XLogInsertAllowed())
790 0 : elog(ERROR, "cannot make new WAL entries during recovery");
791 :
792 : /*
793 : * Given that we're not in recovery, InsertTimeLineID is set and can't
794 : * change, so we can read it without a lock.
795 : */
796 29081848 : insertTLI = XLogCtl->InsertTimeLineID;
797 :
798 : /*----------
799 : *
800 : * We have now done all the preparatory work we can without holding a
801 : * lock or modifying shared state. From here on, inserting the new WAL
802 : * record to the shared WAL buffer cache is a two-step process:
803 : *
804 : * 1. Reserve the right amount of space from the WAL. The current head of
805 : * reserved space is kept in Insert->CurrBytePos, and is protected by
806 : * insertpos_lck.
807 : *
808 : * 2. Copy the record to the reserved WAL space. This involves finding the
809 : * correct WAL buffer containing the reserved space, and copying the
810 : * record in place. This can be done concurrently in multiple processes.
811 : *
812 : * To keep track of which insertions are still in-progress, each concurrent
813 : * inserter acquires an insertion lock. In addition to just indicating that
814 : * an insertion is in progress, the lock tells others how far the inserter
815 : * has progressed. There is a small fixed number of insertion locks,
816 : * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
817 : * boundary, it updates the value stored in the lock to how far it has
818 : * inserted, to allow the previous buffer to be flushed.
819 : *
820 : * Holding onto an insertion lock also protects RedoRecPtr and
821 : * fullPageWrites from changing until the insertion is finished.
822 : *
823 : * Step 2 can usually be done completely in parallel. If the required WAL
824 : * page is not initialized yet, you have to go through AdvanceXLInsertBuffer,
825 : * which will ensure it is initialized. But the WAL writer tries to do that
826 : * ahead of insertions to avoid that happening in the critical path.
827 : *
828 : *----------
829 : */
830 29081848 : START_CRIT_SECTION();
831 :
832 29081848 : if (likely(class == WALINSERT_NORMAL))
833 : {
834 29078514 : WALInsertLockAcquire();
835 :
836 : /*
837 : * Check to see if my copy of RedoRecPtr is out of date. If so, we may
838 : * have to go back and have the caller recompute everything. This can
839 : * only happen just after a checkpoint, so it's better to be slow in
840 : * this case and fast otherwise.
841 : *
842 : * Also check to see if fullPageWrites was just turned on or there's a
843 : * running backup (which forces full-page writes); if we weren't
844 : * already doing full-page writes then go back and recompute.
845 : *
846 : * If we aren't doing full-page writes then RedoRecPtr doesn't
847 : * actually affect the contents of the XLOG record, so we'll update
848 : * our local copy but not force a recomputation. (If doPageWrites was
849 : * just turned off, we could recompute the record without full pages,
850 : * but we choose not to bother.)
851 : */
852 29078514 : if (RedoRecPtr != Insert->RedoRecPtr)
853 : {
854 : Assert(RedoRecPtr < Insert->RedoRecPtr);
855 13608 : RedoRecPtr = Insert->RedoRecPtr;
856 : }
857 29078514 : doPageWrites = (Insert->fullPageWrites || Insert->runningBackups > 0);
858 :
859 29078514 : if (doPageWrites &&
860 28594840 : (!prevDoPageWrites ||
861 26253554 : (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr)))
862 : {
863 : /*
864 : * Oops, some buffer now needs to be backed up that the caller
865 : * didn't back up. Start over.
866 : */
867 15014 : WALInsertLockRelease();
868 15014 : END_CRIT_SECTION();
869 15014 : return InvalidXLogRecPtr;
870 : }
871 :
872 : /*
873 : * Reserve space for the record in the WAL. This also sets the xl_prev
874 : * pointer.
875 : */
876 29063500 : ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
877 : &rechdr->xl_prev);
878 :
879 : /* Normal records are always inserted. */
880 29063500 : inserted = true;
881 : }
882 3334 : else if (class == WALINSERT_SPECIAL_SWITCH)
883 : {
884 : /*
885 : * In order to insert an XLOG_SWITCH record, we need to hold all of
886 : * the WAL insertion locks, not just one, so that no one else can
887 : * begin inserting a record until we've figured out how much space
888 : * remains in the current WAL segment and claimed all of it.
889 : *
890 : * Nonetheless, this case is simpler than the normal cases handled
891 : * above, which must check for changes in doPageWrites and RedoRecPtr.
892 : * Those checks are only needed for records that can contain buffer
893 : * references, and an XLOG_SWITCH record never does.
894 : */
895 : Assert(fpw_lsn == InvalidXLogRecPtr);
896 1546 : WALInsertLockAcquireExclusive();
897 1546 : inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
898 : }
899 : else
900 : {
901 : Assert(class == WALINSERT_SPECIAL_CHECKPOINT);
902 :
903 : /*
904 : * We need to update both the local and shared copies of RedoRecPtr,
905 : * which means that we need to hold all the WAL insertion locks.
906 : * However, there can't be any buffer references, so as above, we need
907 : * not check RedoRecPtr before inserting the record; we just need to
908 : * update it afterwards.
909 : */
910 : Assert(fpw_lsn == InvalidXLogRecPtr);
911 1788 : WALInsertLockAcquireExclusive();
912 1788 : ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
913 : &rechdr->xl_prev);
914 1788 : RedoRecPtr = Insert->RedoRecPtr = StartPos;
915 1788 : inserted = true;
916 : }
917 :
918 29066834 : if (inserted)
919 : {
920 : /*
921 : * Now that xl_prev has been filled in, calculate CRC of the record
922 : * header.
923 : */
924 29066722 : rdata_crc = rechdr->xl_crc;
925 29066722 : COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
926 29066722 : FIN_CRC32C(rdata_crc);
927 29066722 : rechdr->xl_crc = rdata_crc;
928 :
929 : /*
930 : * All the record data, including the header, is now ready to be
931 : * inserted. Copy the record in the space reserved.
932 : */
933 29066722 : CopyXLogRecordToWAL(rechdr->xl_tot_len,
934 : class == WALINSERT_SPECIAL_SWITCH, rdata,
935 : StartPos, EndPos, insertTLI);
936 :
937 : /*
938 : * Unless record is flagged as not important, update LSN of last
939 : * important record in the current slot. When holding all locks, just
940 : * update the first one.
941 : */
942 29066722 : if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
943 : {
944 28887156 : int lockno = holdingAllLocks ? 0 : MyLockNo;
945 :
946 28887156 : WALInsertLocks[lockno].l.lastImportantAt = StartPos;
947 : }
948 : }
949 : else
950 : {
951 : /*
952 : * This was an xlog-switch record, but the current insert location was
953 : * already exactly at the beginning of a segment, so there was no need
954 : * to do anything.
955 : */
956 : }
957 :
958 : /*
959 : * Done! Let others know that we're finished.
960 : */
961 29066834 : WALInsertLockRelease();
962 :
963 29066834 : END_CRIT_SECTION();
964 :
965 29066834 : MarkCurrentTransactionIdLoggedIfAny();
966 :
967 : /*
968 : * Mark the top transaction id as logged (if needed) so that we do not try
969 : * to log it again with the next WAL record in the current subtransaction.
970 : */
971 29066834 : if (topxid_included)
972 438 : MarkSubxactTopXidLogged();
973 :
974 : /*
975 : * Update shared LogwrtRqst.Write, if we crossed page boundary.
976 : */
977 29066834 : if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
978 : {
979 3301626 : SpinLockAcquire(&XLogCtl->info_lck);
980 : /* advance global request to include new block(s) */
981 3301626 : if (XLogCtl->LogwrtRqst.Write < EndPos)
982 3162672 : XLogCtl->LogwrtRqst.Write = EndPos;
983 3301626 : SpinLockRelease(&XLogCtl->info_lck);
984 3301626 : RefreshXLogWriteResult(LogwrtResult);
985 : }
986 :
987 : /*
988 : * If this was an XLOG_SWITCH record, flush the record and the empty
989 : * padding space that fills the rest of the segment, and perform
990 : * end-of-segment actions (eg, notifying archiver).
991 : */
992 29066834 : if (class == WALINSERT_SPECIAL_SWITCH)
993 : {
994 : TRACE_POSTGRESQL_WAL_SWITCH();
995 1546 : XLogFlush(EndPos);
996 :
997 : /*
998 : * Even though we reserved the rest of the segment for us, which is
999 : * reflected in EndPos, we return a pointer to just the end of the
1000 : * xlog-switch record.
1001 : */
1002 1546 : if (inserted)
1003 : {
1004 1434 : EndPos = StartPos + SizeOfXLogRecord;
1005 1434 : if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1006 : {
1007 0 : uint64 offset = XLogSegmentOffset(EndPos, wal_segment_size);
1008 :
1009 0 : if (offset == EndPos % XLOG_BLCKSZ)
1010 0 : EndPos += SizeOfXLogLongPHD;
1011 : else
1012 0 : EndPos += SizeOfXLogShortPHD;
1013 : }
1014 : }
1015 : }
1016 :
1017 : #ifdef WAL_DEBUG
1018 : if (XLOG_DEBUG)
1019 : {
1020 : static XLogReaderState *debug_reader = NULL;
1021 : XLogRecord *record;
1022 : DecodedXLogRecord *decoded;
1023 : StringInfoData buf;
1024 : StringInfoData recordBuf;
1025 : char *errormsg = NULL;
1026 : MemoryContext oldCxt;
1027 :
1028 : oldCxt = MemoryContextSwitchTo(walDebugCxt);
1029 :
1030 : initStringInfo(&buf);
1031 : appendStringInfo(&buf, "INSERT @ %X/%08X: ", LSN_FORMAT_ARGS(EndPos));
1032 :
1033 : /*
1034 : * We have to piece together the WAL record data from the XLogRecData
1035 : * entries, so that we can pass it to the rm_desc function as one
1036 : * contiguous chunk.
1037 : */
1038 : initStringInfo(&recordBuf);
1039 : for (; rdata != NULL; rdata = rdata->next)
1040 : appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);
1041 :
1042 : /* We also need temporary space to decode the record. */
1043 : record = (XLogRecord *) recordBuf.data;
1044 : decoded = (DecodedXLogRecord *)
1045 : palloc(DecodeXLogRecordRequiredSpace(record->xl_tot_len));
1046 :
1047 : if (!debug_reader)
1048 : debug_reader = XLogReaderAllocate(wal_segment_size, NULL,
1049 : XL_ROUTINE(.page_read = NULL,
1050 : .segment_open = NULL,
1051 : .segment_close = NULL),
1052 : NULL);
1053 : if (!debug_reader)
1054 : {
1055 : appendStringInfoString(&buf, "error decoding record: out of memory while allocating a WAL reading processor");
1056 : }
1057 : else if (!DecodeXLogRecord(debug_reader,
1058 : decoded,
1059 : record,
1060 : EndPos,
1061 : &errormsg))
1062 : {
1063 : appendStringInfo(&buf, "error decoding record: %s",
1064 : errormsg ? errormsg : "no error message");
1065 : }
1066 : else
1067 : {
1068 : appendStringInfoString(&buf, " - ");
1069 :
1070 : debug_reader->record = decoded;
1071 : xlog_outdesc(&buf, debug_reader);
1072 : debug_reader->record = NULL;
1073 : }
1074 : elog(LOG, "%s", buf.data);
1075 :
1076 : pfree(decoded);
1077 : pfree(buf.data);
1078 : pfree(recordBuf.data);
1079 : MemoryContextSwitchTo(oldCxt);
1080 : }
1081 : #endif
1082 :
1083 : /*
1084 : * Update our global variables
1085 : */
1086 29066834 : ProcLastRecPtr = StartPos;
1087 29066834 : XactLastRecEnd = EndPos;
1088 :
1089 : /* Report WAL traffic to the instrumentation. */
1090 29066834 : if (inserted)
1091 : {
1092 29066722 : pgWalUsage.wal_bytes += rechdr->xl_tot_len;
1093 29066722 : pgWalUsage.wal_records++;
1094 29066722 : pgWalUsage.wal_fpi += num_fpi;
1095 :
1096 : /* Required for the flush of pending stats WAL data */
1097 29066722 : pgstat_report_fixed = true;
1098 : }
1099 :
1100 29066834 : return EndPos;
1101 : }
1102 :
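 : /*
 : * Editor's sketch (modeled on xloginsert.c's XLogInsert) of how a caller
 : * honors the fpw_lsn contract described above; assemble_record() is a
 : * hypothetical stand-in for the record-construction step:
 : *
 : *     XLogRecPtr  EndPos;
 : *
 : *     do
 : *     {
 : *         XLogRecPtr   fpw_lsn;
 : *         XLogRecData *rdt = assemble_record(..., &fpw_lsn);
 : *
 : *         EndPos = XLogInsertRecord(rdt, fpw_lsn, flags, num_fpi,
 : *                                   topxid_included);
 : *     } while (EndPos == InvalidXLogRecPtr);  (RedoRecPtr moved: redo
 : *                                              full-page-image choices)
 : */
 :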
1103 : /*
1104 : * Reserves the right amount of space for a record of given size from the WAL.
1105 : * *StartPos is set to the beginning of the reserved section, *EndPos to
1106 : * its end+1. *PrevPtr is set to the beginning of the previous record; it is
1107 : * used to set the xl_prev of this record.
1108 : *
1109 : * This is the performance critical part of XLogInsert that must be serialized
1110 : * across backends. The rest can happen mostly in parallel. Try to keep this
1111 : * section as short as possible, insertpos_lck can be heavily contended on a
1112 : * busy system.
1113 : *
1114 : * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
1115 : * where we actually copy the record to the reserved space.
1116 : *
1117 : * NB: Testing shows that XLogInsertRecord runs faster if this code is inlined;
1118 : * however, because there are two call sites, the compiler is reluctant to
1119 : * inline. We use pg_attribute_always_inline here to try to convince it.
1120 : */
1121 : static pg_attribute_always_inline void
1122 29065288 : ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
1123 : XLogRecPtr *PrevPtr)
1124 : {
1125 29065288 : XLogCtlInsert *Insert = &XLogCtl->Insert;
1126 : uint64 startbytepos;
1127 : uint64 endbytepos;
1128 : uint64 prevbytepos;
1129 :
1130 29065288 : size = MAXALIGN(size);
1131 :
1132 : /* All (non xlog-switch) records should contain data. */
1133 : Assert(size > SizeOfXLogRecord);
1134 :
1135 : /*
1136 : * The duration the spinlock needs to be held is minimized by minimizing
1137 : * the calculations that have to be done while holding the lock. The
1138 : * current tip of reserved WAL is kept in CurrBytePos, as a byte position
1139 : * that only counts "usable" bytes in WAL, that is, it excludes all WAL
1140 : * page headers. The mapping between "usable" byte positions and physical
1141 : * positions (XLogRecPtrs) can be done outside the locked region, and
1142 : * because the usable byte position doesn't include any headers, reserving
1143 : * X bytes from WAL is almost as simple as "CurrBytePos += X".
1144 : */
1145 29065288 : SpinLockAcquire(&Insert->insertpos_lck);
1146 :
1147 29065288 : startbytepos = Insert->CurrBytePos;
1148 29065288 : endbytepos = startbytepos + size;
1149 29065288 : prevbytepos = Insert->PrevBytePos;
1150 29065288 : Insert->CurrBytePos = endbytepos;
1151 29065288 : Insert->PrevBytePos = startbytepos;
1152 :
1153 29065288 : SpinLockRelease(&Insert->insertpos_lck);
1154 :
1155 29065288 : *StartPos = XLogBytePosToRecPtr(startbytepos);
1156 29065288 : *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1157 29065288 : *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1158 :
1159 : /*
1160 : * Check that the conversions between "usable byte positions" and
1161 : * XLogRecPtrs work consistently in both directions.
1162 : */
1163 : Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1164 : Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1165 : Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1166 29065288 : }
1167 :
1168 : /*
1169 : * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
1170 : *
1171 : * A log-switch record is handled slightly differently. The rest of the
1172 : * segment will be reserved for this insertion, as indicated by the returned
1173 : * *EndPos value. However, if we are already at the beginning of the current
1174 : * segment, *StartPos and *EndPos are set to the current location without
1175 : * reserving any space, and the function returns false.
1176 : */
1177 : static bool
1178 1546 : ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
1179 : {
1180 1546 : XLogCtlInsert *Insert = &XLogCtl->Insert;
1181 : uint64 startbytepos;
1182 : uint64 endbytepos;
1183 : uint64 prevbytepos;
1184 1546 : uint32 size = MAXALIGN(SizeOfXLogRecord);
1185 : XLogRecPtr ptr;
1186 : uint32 segleft;
1187 :
1188 : /*
1189 : * These calculations are a bit heavy-weight to be done while holding a
1190 : * spinlock, but since we're holding all the WAL insertion locks, there
1191 : * are no other inserters competing for it. GetXLogInsertRecPtr() does
1192 : * compete for it, but that's not called very frequently.
1193 : */
1194 1546 : SpinLockAcquire(&Insert->insertpos_lck);
1195 :
1196 1546 : startbytepos = Insert->CurrBytePos;
1197 :
1198 1546 : ptr = XLogBytePosToEndRecPtr(startbytepos);
1199 1546 : if (XLogSegmentOffset(ptr, wal_segment_size) == 0)
1200 : {
1201 112 : SpinLockRelease(&Insert->insertpos_lck);
1202 112 : *EndPos = *StartPos = ptr;
1203 112 : return false;
1204 : }
1205 :
1206 1434 : endbytepos = startbytepos + size;
1207 1434 : prevbytepos = Insert->PrevBytePos;
1208 :
1209 1434 : *StartPos = XLogBytePosToRecPtr(startbytepos);
1210 1434 : *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1211 :
1212 1434 : segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size);
1213 1434 : if (segleft != wal_segment_size)
1214 : {
1215 : /* consume the rest of the segment */
1216 1434 : *EndPos += segleft;
1217 1434 : endbytepos = XLogRecPtrToBytePos(*EndPos);
1218 : }
1219 1434 : Insert->CurrBytePos = endbytepos;
1220 1434 : Insert->PrevBytePos = startbytepos;
1221 :
1222 1434 : SpinLockRelease(&Insert->insertpos_lck);
1223 :
1224 1434 : *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1225 :
1226 : Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0);
1227 : Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1228 : Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1229 : Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1230 :
1231 1434 : return true;
1232 : }
1233 :
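 : /*
 : * Editor's note on the arithmetic above: when the insert position is not
 : * already on a segment boundary, the switch record itself consumes
 : * MAXALIGN(SizeOfXLogRecord) usable bytes and segleft then pads *EndPos
 : * out to the next segment boundary, which the Assert on
 : * XLogSegmentOffset(*EndPos, wal_segment_size) == 0 verifies.
 : */
 :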
1234 : /*
1235 : * Subroutine of XLogInsertRecord. Copies a WAL record to an already-reserved
1236 : * area in the WAL.
1237 : */
1238 : static void
1239 29066722 : CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
1240 : XLogRecPtr StartPos, XLogRecPtr EndPos, TimeLineID tli)
1241 : {
1242 : char *currpos;
1243 : int freespace;
1244 : int written;
1245 : XLogRecPtr CurrPos;
1246 : XLogPageHeader pagehdr;
1247 :
1248 : /*
1249 : * Get a pointer to the right place in the right WAL buffer to start
1250 : * inserting to.
1251 : */
1252 29066722 : CurrPos = StartPos;
1253 29066722 : currpos = GetXLogBuffer(CurrPos, tli);
1254 29066722 : freespace = INSERT_FREESPACE(CurrPos);
1255 :
1256 : /*
1257 : * there should be enough space for at least the first field (xl_tot_len)
1258 : * on this page.
1259 : */
1260 : Assert(freespace >= sizeof(uint32));
1261 :
1262 : /* Copy record data */
1263 29066722 : written = 0;
1264 137309022 : while (rdata != NULL)
1265 : {
1266 108242300 : const char *rdata_data = rdata->data;
1267 108242300 : int rdata_len = rdata->len;
1268 :
1269 111772158 : while (rdata_len > freespace)
1270 : {
1271 : /*
1272 : * Write what fits on this page, and continue on the next page.
1273 : */
1274 : Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
1275 3529858 : memcpy(currpos, rdata_data, freespace);
1276 3529858 : rdata_data += freespace;
1277 3529858 : rdata_len -= freespace;
1278 3529858 : written += freespace;
1279 3529858 : CurrPos += freespace;
1280 :
1281 : /*
1282 : * Get pointer to beginning of next page, and set the xlp_rem_len
1283 : * in the page header. Set XLP_FIRST_IS_CONTRECORD.
1284 : *
1285 : * It's safe to set the contrecord flag and xlp_rem_len without a
1286 : * lock on the page. All the other flags were already set when the
1287 : * page was initialized, in AdvanceXLInsertBuffer, and we're the
1288 : * only backend that needs to set the contrecord flag.
1289 : */
1290 3529858 : currpos = GetXLogBuffer(CurrPos, tli);
1291 3529858 : pagehdr = (XLogPageHeader) currpos;
1292 3529858 : pagehdr->xlp_rem_len = write_len - written;
1293 3529858 : pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1294 :
1295 : /* skip over the page header */
1296 3529858 : if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
1297 : {
1298 2360 : CurrPos += SizeOfXLogLongPHD;
1299 2360 : currpos += SizeOfXLogLongPHD;
1300 : }
1301 : else
1302 : {
1303 3527498 : CurrPos += SizeOfXLogShortPHD;
1304 3527498 : currpos += SizeOfXLogShortPHD;
1305 : }
1306 3529858 : freespace = INSERT_FREESPACE(CurrPos);
1307 : }
1308 :
1309 : Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
1310 108242300 : memcpy(currpos, rdata_data, rdata_len);
1311 108242300 : currpos += rdata_len;
1312 108242300 : CurrPos += rdata_len;
1313 108242300 : freespace -= rdata_len;
1314 108242300 : written += rdata_len;
1315 :
1316 108242300 : rdata = rdata->next;
1317 : }
1318 : Assert(written == write_len);
1319 :
1320 : /*
1321 : * If this was an xlog-switch, it's not enough to write the switch record,
1322 : * we also have to consume all the remaining space in the WAL segment. We
1323 : * have already reserved that space, but we need to actually fill it.
1324 : */
1325 29066722 : if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0)
1326 : {
1327 : /* An xlog-switch record doesn't contain any data besides the header */
1328 : Assert(write_len == SizeOfXLogRecord);
1329 :
1330 : /* Assert that we did reserve the right amount of space */
1331 : Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0);
1332 :
1333 : /* Use up all the remaining space on the current page */
1334 1434 : CurrPos += freespace;
1335 :
1336 : /*
1337 : * Cause all remaining pages in the segment to be flushed, leaving the
1338 : * XLog position where it should be, at the start of the next segment.
1339 : * We do this one page at a time, to make sure we don't deadlock
1340 : * against ourselves if wal_buffers < wal_segment_size.
1341 : */
1342 1371378 : while (CurrPos < EndPos)
1343 : {
1344 : /*
1345 : * The minimal action to flush the page would be to call
1346 : * WALInsertLockUpdateInsertingAt(CurrPos) followed by
1347 : * AdvanceXLInsertBuffer(...). The page would be left initialized
1348 : * mostly to zeros, except for the page header (always the short
1349 : * variant, as this is never a segment's first page).
1350 : *
1351 : * The large vistas of zeros are good for compressibility, but the
1352 : * headers interrupting them every XLOG_BLCKSZ (with values that
1353 : * differ from page to page) are not. The effect varies with
1354 : * compression tool, but bzip2 for instance compresses about an
1355 : * order of magnitude worse if those headers are left in place.
1356 : *
1357 : * Rather than complicating AdvanceXLInsertBuffer itself (which is
1358 : * called in heavily-loaded circumstances as well as this lightly-
1359 : * loaded one) with variant behavior, we just use GetXLogBuffer
1360 : * (which itself calls the two methods we need) to get the pointer
1361 : * and zero most of the page. Then we just zero the page header.
1362 : */
1363 1369944 : currpos = GetXLogBuffer(CurrPos, tli);
1364 5479776 : MemSet(currpos, 0, SizeOfXLogShortPHD);
1365 :
1366 1369944 : CurrPos += XLOG_BLCKSZ;
1367 : }
1368 : }
1369 : else
1370 : {
1371 : /* Align the end position, so that the next record starts aligned */
1372 29065288 : CurrPos = MAXALIGN64(CurrPos);
1373 : }
1374 :
1375 29066722 : if (CurrPos != EndPos)
1376 0 : ereport(PANIC,
1377 : errcode(ERRCODE_DATA_CORRUPTED),
1378 : errmsg_internal("space reserved for WAL record does not match what was written"));
1379 29066722 : }
1380 :
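 : /*
 : * Editor's worked example of the continuation-record arithmetic in
 : * CopyXLogRecordToWAL above (assuming XLOG_BLCKSZ = 8192 and a 24-byte
 : * short page header): a record with xl_tot_len = 10000 starting at page
 : * offset 8000 has 192 bytes of freespace on its first page; the next
 : * page gets xlp_rem_len = 10000 - 192 = 9808 and XLP_FIRST_IS_CONTRECORD,
 : * offers 8168 usable bytes, and leaves 9808 - 8168 = 1640 bytes for a
 : * third page.
 : */
 :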
1381 : /*
1382 : * Acquire a WAL insertion lock, for inserting to WAL.
1383 : */
1384 : static void
1385 29078538 : WALInsertLockAcquire(void)
1386 : {
1387 : bool immed;
1388 :
1389 : /*
1390 : * It doesn't matter which of the WAL insertion locks we acquire, so try
1391 : * the one we used last time. If the system isn't particularly busy, it's
1392 : * a good bet that it's still available, and it's good to have some
1393 : * affinity to a particular lock so that you don't unnecessarily bounce
1394 : * cache lines between processes when there's no contention.
1395 : *
1396 : * If this is the first time through in this backend, pick a lock
1397 : * (semi-)randomly. This allows the locks to be used evenly if you have a
1398 : * lot of very short connections.
1399 : */
1400 : static int lockToTry = -1;
1401 :
1402 29078538 : if (lockToTry == -1)
1403 15772 : lockToTry = MyProcNumber % NUM_XLOGINSERT_LOCKS;
1404 29078538 : MyLockNo = lockToTry;
1405 :
1406 : /*
1407 : * The insertingAt value is initially set to 0, as we don't know our
1408 : * insert location yet.
1409 : */
1410 29078538 : immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
1411 29078538 : if (!immed)
1412 : {
1413 : /*
1414 : * If we couldn't get the lock immediately, try another lock next
1415 : * time. On a system with more insertion locks than concurrent
1416 : * inserters, this causes all the inserters to eventually migrate to a
1417 : * lock that no-one else is using. On a system with more inserters
1418 : * than locks, it still helps to distribute the inserters evenly
1419 : * across the locks.
1420 : */
1421 71170 : lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
1422 : }
1423 29078538 : }
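             :
             : /*
             :  * Illustrative sketch (editor's note, not part of xlog.c): the lock
             :  * affinity scheme above, reduced to a standalone toy. N_LOCKS and
             :  * acquire_lock() are hypothetical stand-ins; acquire_lock() is assumed
             :  * to block until the lock is held and to return true only if no wait
             :  * was needed, like LWLockAcquire().
             :  */
             : #include <stdbool.h>
             :
             : #define N_LOCKS 8
             :
             : extern bool acquire_lock(int lockno);    /* hypothetical */
             :
             : static int lock_to_try = -1;             /* sticky per-backend choice */
             :
             : static int
             : pick_insert_lock(int my_proc_number)
             : {
             :     int mylock;
             :
             :     if (lock_to_try == -1)
             :         lock_to_try = my_proc_number % N_LOCKS;  /* semi-random first pick */
             :     mylock = lock_to_try;
             :     if (!acquire_lock(mylock))                   /* had to wait: contended */
             :         lock_to_try = (lock_to_try + 1) % N_LOCKS;   /* migrate next time */
             :     return mylock;
             : }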
1424 :
1425 : /*
1426 : * Acquire all WAL insertion locks, to prevent other backends from inserting
1427 : * to WAL.
1428 : */
1429 : static void
1430 8466 : WALInsertLockAcquireExclusive(void)
1431 : {
1432 : int i;
1433 :
1434 : /*
1435 : * When holding all the locks, all but the last lock's insertingAt
1436 : * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
1437 : * XLogRecPtr value, to make sure that no-one blocks waiting on those.
1438 : */
1439 67728 : for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
1440 : {
1441 59262 : LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1442 59262 : LWLockUpdateVar(&WALInsertLocks[i].l.lock,
1443 59262 : &WALInsertLocks[i].l.insertingAt,
1444 : PG_UINT64_MAX);
1445 : }
1446 : /* Variable value reset to 0 at release */
1447 8466 : LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1448 :
1449 8466 : holdingAllLocks = true;
1450 8466 : }
1451 :
1452 : /*
1453 : * Release our insertion lock (or locks, if we're holding them all).
1454 : *
1455 : * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
1456 : * next time the lock is acquired.
1457 : */
1458 : static void
1459 29087004 : WALInsertLockRelease(void)
1460 : {
1461 29087004 : if (holdingAllLocks)
1462 : {
1463 : int i;
1464 :
1465 76194 : for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1466 67728 : LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
1467 67728 : &WALInsertLocks[i].l.insertingAt,
1468 : 0);
1469 :
1470 8466 : holdingAllLocks = false;
1471 : }
1472 : else
1473 : {
1474 29078538 : LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
1475 29078538 : &WALInsertLocks[MyLockNo].l.insertingAt,
1476 : 0);
1477 : }
1478 29087004 : }
1479 :
1480 : /*
1481 : * Update our insertingAt value, to let others know that we've finished
1482 : * inserting up to that point.
1483 : */
1484 : static void
1485 5260460 : WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
1486 : {
1487 5260460 : if (holdingAllLocks)
1488 : {
1489 : /*
1490 : * We use the last lock to mark our actual position, see comments in
1491 : * WALInsertLockAcquireExclusive.
1492 : */
1493 1362736 : LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
1494 1362736 : &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
1495 : insertingAt);
1496 : }
1497 : else
1498 3897724 : LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
1499 3897724 : &WALInsertLocks[MyLockNo].l.insertingAt,
1500 : insertingAt);
1501 5260460 : }
1502 :
1503 : /*
1504 : * Wait for any WAL insertions < upto to finish.
1505 : *
1506 : * Returns the location of the oldest insertion that is still in-progress.
1507 : * Any WAL prior to that point has been fully copied into WAL buffers, and
1508 : * can be flushed out to disk. Because this waits for any insertions older
1509 : * than 'upto' to finish, the return value is always >= 'upto'.
1510 : *
1511 : * Note: When you are about to write out WAL, you must call this function
1512 : * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
1513 : * need to wait for an insertion to finish (or at least advance to the next
1514 : * uninitialized page), and the inserter might need to evict an old WAL buffer
1515 : * to make room for a new one, which in turn requires WALWriteLock.
1516 : */
1517 : static XLogRecPtr
1518 4202504 : WaitXLogInsertionsToFinish(XLogRecPtr upto)
1519 : {
1520 : uint64 bytepos;
1521 : XLogRecPtr inserted;
1522 : XLogRecPtr reservedUpto;
1523 : XLogRecPtr finishedUpto;
1524 4202504 : XLogCtlInsert *Insert = &XLogCtl->Insert;
1525 : int i;
1526 :
1527 4202504 : if (MyProc == NULL)
1528 0 : elog(PANIC, "cannot wait without a PGPROC structure");
1529 :
1530 : /*
1531 : * Check if there's any work to do. Use a barrier to ensure we get the
1532 : * freshest value.
1533 : */
1534 4202504 : inserted = pg_atomic_read_membarrier_u64(&XLogCtl->logInsertResult);
1535 4202504 : if (upto <= inserted)
1536 3436032 : return inserted;
1537 :
1538 : /* Read the current insert position */
1539 766472 : SpinLockAcquire(&Insert->insertpos_lck);
1540 766472 : bytepos = Insert->CurrBytePos;
1541 766472 : SpinLockRelease(&Insert->insertpos_lck);
1542 766472 : reservedUpto = XLogBytePosToEndRecPtr(bytepos);
1543 :
1544 : /*
1545 : * No-one should request to flush a piece of WAL that hasn't even been
1546 : * reserved yet. However, it can happen if there is a block with a bogus
1547 : * LSN on disk, for example. XLogFlush checks for that situation and
1548 : * complains, but only after the flush. Here we just assume that to mean
1549 : * that all WAL that has been reserved needs to be finished. In this
1550 : * corner-case, the return value can be smaller than the 'upto' argument.
1551 : */
1552 766472 : if (upto > reservedUpto)
1553 : {
1554 0 : ereport(LOG,
1555 : errmsg("request to flush past end of generated WAL; request %X/%08X, current position %X/%08X",
1556 : LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto)));
1557 0 : upto = reservedUpto;
1558 : }
1559 :
1560 : /*
1561 : * Loop through all the locks, sleeping on any in-progress insert older
1562 : * than 'upto'.
1563 : *
1564 : * finishedUpto is our return value, indicating the point up to which all
1565 : * the WAL insertions have been finished. Initialize it to the head of
1566 : * reserved WAL, and as we iterate through the insertion locks, back it
1567 : * out for any insertion that's still in progress.
1568 : */
1569 766472 : finishedUpto = reservedUpto;
1570 6898248 : for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1571 : {
1572 6131776 : XLogRecPtr insertingat = InvalidXLogRecPtr;
1573 :
1574 : do
1575 : {
1576 : /*
1577 : * See if this insertion is in progress. LWLockWaitForVar will
1578 : * wait for the lock to be released, or for the 'value' to be set
1579 : * by a LWLockUpdateVar call. When a lock is initially acquired,
1580 : * its value is 0 (InvalidXLogRecPtr), which means that we don't
1581 : * know where it's inserting yet. We will have to wait for it. If
1582 : * it's a small insertion, the record will most likely fit on the
1583 : * same page and the inserter will release the lock without ever
1584 : * calling LWLockUpdateVar. But if it has to sleep, it will
1585 : * advertise the insertion point with LWLockUpdateVar before
1586 : * sleeping.
1587 : *
1588 : * In this loop we are only waiting for insertions that started
1589 : * before WaitXLogInsertionsToFinish was called. The lack of
1590 : * memory barriers in the loop means that we might see locks as
1591 : * "unused" that have since become used. This is fine because
1592 : * they only can be used for later insertions that we would not
1593 : * they can only be used for later insertions that we would not
1594 : * current insertingAt value means that we might see older
1595 : * insertingAt values. This is also fine, because if we read a
1596 : * value too old, we will add ourselves to the wait queue, which
1597 : * contains atomic operations.
1598 : */
1599 6207518 : if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
1600 6207518 : &WALInsertLocks[i].l.insertingAt,
1601 : insertingat, &insertingat))
1602 : {
1603 : /* the lock was free, so no insertion in progress */
1604 4536494 : insertingat = InvalidXLogRecPtr;
1605 4536494 : break;
1606 : }
1607 :
1608 : /*
1609 : * This insertion is still in progress. Have to wait, unless the
1610 : * inserter has proceeded past 'upto'.
1611 : */
1612 1671024 : } while (insertingat < upto);
1613 :
1614 6131776 : if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
1615 610766 : finishedUpto = insertingat;
1616 : }
1617 :
1618 : /*
1619 : * Advance the limit we know to have been inserted and return the freshest
1620 : * value we know of, which might be beyond what we requested if somebody
1621 : * is concurrently doing this with an 'upto' pointer ahead of us.
1622 : */
1623 766472 : finishedUpto = pg_atomic_monotonic_advance_u64(&XLogCtl->logInsertResult,
1624 : finishedUpto);
1625 :
1626 766472 : return finishedUpto;
1627 : }
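             :
             : /*
             :  * Illustrative sketch (editor's note, not part of xlog.c): the reduction
             :  * step above, in isolation. Given the advertised insertingAt value of
             :  * each lock (0 meaning "idle"), the flush horizon is the minimum
             :  * in-progress position, starting from the head of reserved WAL. The
             :  * waiting via LWLockWaitForVar is omitted here.
             :  */
             : typedef unsigned long long RecPtrDemo;
             :
             : static RecPtrDemo
             : oldest_in_progress(RecPtrDemo reservedUpto,
             :                    const RecPtrDemo *insertingAt, int nlocks)
             : {
             :     RecPtrDemo finishedUpto = reservedUpto;
             :
             :     for (int i = 0; i < nlocks; i++)
             :     {
             :         /* 0 == lock idle, nothing in progress there; skip it */
             :         if (insertingAt[i] != 0 && insertingAt[i] < finishedUpto)
             :             finishedUpto = insertingAt[i];
             :     }
             :     return finishedUpto;
             : }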
1628 :
1629 : /*
1630 : * Get a pointer to the right location in the WAL buffer containing the
1631 : * given XLogRecPtr.
1632 : *
1633 : * If the page is not initialized yet, it is initialized. That might require
1634 : * evicting an old dirty buffer from the buffer cache, which means I/O.
1635 : *
1636 : * The caller must ensure that the page containing the requested location
1637 : * isn't evicted yet, and won't be evicted. The way to ensure that is to
1638 : * hold onto a WAL insertion lock with the insertingAt position set to
1639 : * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
1640 : * to evict an old page from the buffer. (This means that once you call
1641 : * GetXLogBuffer() with a given 'ptr', you must not access anything before
1642 : * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
1643 : * later, because older buffers might already have been recycled.)
1644 : */
1645 : static char *
1646 33966548 : GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli)
1647 : {
1648 : int idx;
1649 : XLogRecPtr endptr;
1650 : static uint64 cachedPage = 0;
1651 : static char *cachedPos = NULL;
1652 : XLogRecPtr expectedEndPtr;
1653 :
1654 : /*
1655 : * Fast path for the common case where we need to access the same
1656 : * page as last time.
1657 : */
1658 33966548 : if (ptr / XLOG_BLCKSZ == cachedPage)
1659 : {
1660 : Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1661 : Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1662 27531126 : return cachedPos + ptr % XLOG_BLCKSZ;
1663 : }
1664 :
1665 : /*
1666 : * The XLog buffer cache is organized so that a page is always loaded to a
1667 : * particular buffer. That way we can easily calculate the buffer a given
1668 : * page must be loaded into, from the XLogRecPtr alone.
1669 : */
1670 6435422 : idx = XLogRecPtrToBufIdx(ptr);
1671 :
1672 : /*
1673 : * See what page is loaded in the buffer at the moment. It could be the
1674 : * page we're looking for, or something older. It can't be anything newer
1675 : * - that would imply the page we're looking for has already been written
1676 : * out to disk and evicted, and the caller is responsible for making sure
1677 : * that doesn't happen.
1678 : *
1679 : * We don't hold a lock while we read the value. If someone is just about
1680 : * to initialize or has just initialized the page, it's possible that we
1681 : * get InvalidXLogRecPtr. That's ok, we'll grab the mapping lock (in
1682 : * AdvanceXLInsertBuffer) and retry if we see anything other than the page
1683 : * we're looking for.
1684 : */
1685 6435422 : expectedEndPtr = ptr;
1686 6435422 : expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
1687 :
1688 6435422 : endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
1689 6435422 : if (expectedEndPtr != endptr)
1690 : {
1691 : XLogRecPtr initializedUpto;
1692 :
1693 : /*
1694 : * Before calling AdvanceXLInsertBuffer(), which can block, let others
1695 : * know how far we're finished with inserting the record.
1696 : * know how far we have gotten with inserting the record.
1697 : * NB: If 'ptr' points to just after the page header, advertise a
1698 : * position at the beginning of the page rather than 'ptr' itself. If
1699 : * there are no other insertions running, someone might try to flush
1700 : * up to our advertised location. If we advertised a position after
1701 : * the page header, someone might try to flush the page header, even
1702 : * though the page might actually not be initialized yet. As the first
1703 : * inserter on the page, we are effectively responsible for making
1704 : * sure that it's initialized, before we let insertingAt move past
1705 : * the page header.
1706 : */
1707 5260460 : if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
1708 12242 : XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ)
1709 12242 : initializedUpto = ptr - SizeOfXLogShortPHD;
1710 5248218 : else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
1711 2574 : XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ)
1712 1198 : initializedUpto = ptr - SizeOfXLogLongPHD;
1713 : else
1714 5247020 : initializedUpto = ptr;
1715 :
1716 5260460 : WALInsertLockUpdateInsertingAt(initializedUpto);
1717 :
1718 5260460 : AdvanceXLInsertBuffer(ptr, tli, false);
1719 5260460 : endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
1720 :
1721 5260460 : if (expectedEndPtr != endptr)
1722 0 : elog(PANIC, "could not find WAL buffer for %X/%08X",
1723 : LSN_FORMAT_ARGS(ptr));
1724 : }
1725 : else
1726 : {
1727 : /*
1728 : * Make sure the initialization of the page is visible to us, and
1729 : * won't arrive later to overwrite the WAL data we write on the page.
1730 : */
1731 1174962 : pg_memory_barrier();
1732 : }
1733 :
1734 : /*
1735 : * Found the buffer holding this page. Return a pointer to the right
1736 : * offset within the page.
1737 : */
1738 6435422 : cachedPage = ptr / XLOG_BLCKSZ;
1739 6435422 : cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
1740 :
1741 : Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1742 : Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1743 :
1744 6435422 : return cachedPos + ptr % XLOG_BLCKSZ;
1745 : }
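             :
             : /*
             :  * Illustrative sketch (editor's note, not part of xlog.c): the
             :  * direct-mapped buffer lookup that makes GetXLogBuffer() lock-free in
             :  * the common case. BLCKSZ_DEMO and NBUFFERS_DEMO are assumed example
             :  * values; the real code uses XLOG_BLCKSZ and the configured number of
             :  * WAL buffers.
             :  */
             : #define BLCKSZ_DEMO   8192
             : #define NBUFFERS_DEMO 512
             :
             : static inline int
             : rec_ptr_to_buf_idx(unsigned long long ptr)
             : {
             :     /* a given page can only ever live in one slot */
             :     return (int) ((ptr / BLCKSZ_DEMO) % NBUFFERS_DEMO);
             : }
             :
             : static inline unsigned long long
             : expected_end_ptr(unsigned long long ptr)
             : {
             :     /* end LSN of the page containing ptr; compare against xlblocks[idx] */
             :     return ptr + (BLCKSZ_DEMO - ptr % BLCKSZ_DEMO);
             : }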
1746 :
1747 : /*
1748 : * Read WAL data directly from WAL buffers, if available. Returns the number
1749 : * of bytes read successfully.
1750 : *
1751 : * Fewer than 'count' bytes may be read if some of the requested WAL data has
1752 : * already been evicted.
1753 : *
1754 : * No locks are taken.
1755 : *
1756 : * Caller should ensure that it reads no further than LogwrtResult.Write
1757 : * (which should have been updated by the caller when determining how far to
1758 : * read). The 'tli' argument is only used as a convenient safety check so that
1759 : * callers do not read from WAL buffers on a historical timeline.
1760 : */
1761 : Size
1762 196018 : WALReadFromBuffers(char *dstbuf, XLogRecPtr startptr, Size count,
1763 : TimeLineID tli)
1764 : {
1765 196018 : char *pdst = dstbuf;
1766 196018 : XLogRecPtr recptr = startptr;
1767 : XLogRecPtr inserted;
1768 196018 : Size nbytes = count;
1769 :
1770 196018 : if (RecoveryInProgress() || tli != GetWALInsertionTimeLine())
1771 1828 : return 0;
1772 :
1773 : Assert(!XLogRecPtrIsInvalid(startptr));
1774 :
1775 : /*
1776 : * Caller should ensure that the requested data has been inserted into WAL
1777 : * buffers before we try to read it.
1778 : */
1779 194190 : inserted = pg_atomic_read_u64(&XLogCtl->logInsertResult);
1780 194190 : if (startptr + count > inserted)
1781 0 : ereport(ERROR,
1782 : errmsg("cannot read past end of generated WAL: requested %X/%08X, current position %X/%08X",
1783 : LSN_FORMAT_ARGS(startptr + count),
1784 : LSN_FORMAT_ARGS(inserted)));
1785 :
1786 : /*
1787 : * Loop through the buffers without a lock. For each buffer, atomically
1788 : * read and verify the end pointer, then copy the data out, and finally
1789 : * re-read and re-verify the end pointer.
1790 : *
1791 : * Once a page is evicted, it never returns to the WAL buffers, so if the
1792 : * end pointer matches the expected end pointer before and after we copy
1793 : * the data, then the right page must have been present during the data
1794 : * copy. Read barriers are necessary to ensure that the data copy actually
1795 : * happens between the two verification steps.
1796 : *
1797 : * If either verification fails, we simply terminate the loop and return
1798 : * with the data that had been already copied out successfully.
1799 : */
1800 215608 : while (nbytes > 0)
1801 : {
1802 210444 : uint32 offset = recptr % XLOG_BLCKSZ;
1803 210444 : int idx = XLogRecPtrToBufIdx(recptr);
1804 : XLogRecPtr expectedEndPtr;
1805 : XLogRecPtr endptr;
1806 : const char *page;
1807 : const char *psrc;
1808 : Size npagebytes;
1809 :
1810 : /*
1811 : * Calculate the end pointer we expect in the xlblocks array if the
1812 : * correct page is present.
1813 : */
1814 210444 : expectedEndPtr = recptr + (XLOG_BLCKSZ - offset);
1815 :
1816 : /*
1817 : * First verification step: check that the correct page is present in
1818 : * the WAL buffers.
1819 : */
1820 210444 : endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
1821 210444 : if (expectedEndPtr != endptr)
1822 189022 : break;
1823 :
1824 : /*
1825 : * The correct page is present (or was at the time the endptr was
1826 : * read; must re-verify later). Calculate pointer to source data and
1827 : * determine how much data to read from this page.
1828 : */
1829 21422 : page = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
1830 21422 : psrc = page + offset;
1831 21422 : npagebytes = Min(nbytes, XLOG_BLCKSZ - offset);
1832 :
1833 : /*
1834 : * Ensure that the data copy and the first verification step are not
1835 : * reordered.
1836 : */
1837 21422 : pg_read_barrier();
1838 :
1839 : /* data copy */
1840 21422 : memcpy(pdst, psrc, npagebytes);
1841 :
1842 : /*
1843 : * Ensure that the data copy and the second verification step are not
1844 : * reordered.
1845 : */
1846 21422 : pg_read_barrier();
1847 :
1848 : /*
1849 : * Second verification step: check that the page we read from wasn't
1850 : * evicted while we were copying the data.
1851 : */
1852 21422 : endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
1853 21422 : if (expectedEndPtr != endptr)
1854 4 : break;
1855 :
1856 21418 : pdst += npagebytes;
1857 21418 : recptr += npagebytes;
1858 21418 : nbytes -= npagebytes;
1859 : }
1860 :
1861 : Assert(pdst - dstbuf <= count);
1862 :
1863 194190 : return pdst - dstbuf;
1864 : }
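             :
             : /*
             :  * Illustrative sketch (editor's note, not part of xlog.c): the
             :  * verify / copy / re-verify protocol above, for one page, using C11
             :  * atomics. The acquire fences stand in approximately for
             :  * pg_read_barrier(); all names here are hypothetical.
             :  */
             : #include <stdatomic.h>
             : #include <string.h>
             :
             : static size_t
             : copy_page_if_present(char *dst, const char *page, size_t len,
             :                      _Atomic unsigned long long *endptr_slot,
             :                      unsigned long long expected_endptr)
             : {
             :     if (atomic_load(endptr_slot) != expected_endptr)
             :         return 0;               /* wrong page is resident: give up */
             :
             :     atomic_thread_fence(memory_order_acquire);  /* check before copy */
             :     memcpy(dst, page, len);
             :     atomic_thread_fence(memory_order_acquire);  /* copy before re-check */
             :
             :     if (atomic_load(endptr_slot) != expected_endptr)
             :         return 0;               /* page was evicted mid-copy: discard */
             :
             :     return len;                 /* both checks passed: copy is good */
             : }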
1865 :
1866 : /*
1867 : * Converts a "usable byte position" to XLogRecPtr. A usable byte position
1868 : * is the position starting from the beginning of WAL, excluding all WAL
1869 : * page headers.
1870 : */
1871 : static XLogRecPtr
1872 58138592 : XLogBytePosToRecPtr(uint64 bytepos)
1873 : {
1874 : uint64 fullsegs;
1875 : uint64 fullpages;
1876 : uint64 bytesleft;
1877 : uint32 seg_offset;
1878 : XLogRecPtr result;
1879 :
1880 58138592 : fullsegs = bytepos / UsableBytesInSegment;
1881 58138592 : bytesleft = bytepos % UsableBytesInSegment;
1882 :
1883 58138592 : if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1884 : {
1885 : /* fits on first page of segment */
1886 100860 : seg_offset = bytesleft + SizeOfXLogLongPHD;
1887 : }
1888 : else
1889 : {
1890 : /* account for the first page on segment with long header */
1891 58037732 : seg_offset = XLOG_BLCKSZ;
1892 58037732 : bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1893 :
1894 58037732 : fullpages = bytesleft / UsableBytesInPage;
1895 58037732 : bytesleft = bytesleft % UsableBytesInPage;
1896 :
1897 58037732 : seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1898 : }
1899 :
1900 58138592 : XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
1901 :
1902 58138592 : return result;
1903 : }
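             :
             : /*
             :  * Worked example (editor's note): with XLOG_BLCKSZ = 8192 and, on a
             :  * typical 64-bit build, SizeOfXLogShortPHD = 24 and SizeOfXLogLongPHD = 40,
             :  * UsableBytesInPage = 8192 - 24 = 8168 and the first page of a segment
             :  * holds 8192 - 40 = 8152 usable bytes. For bytepos = 10000:
             :  *
             :  *   bytesleft = 10000, which is >= 8152, so skip the first page:
             :  *     seg_offset = 8192, bytesleft = 10000 - 8152 = 1848
             :  *   fullpages = 1848 / 8168 = 0, so
             :  *     seg_offset = 8192 + 1848 + 24 = 10064
             :  *
             :  * i.e. usable byte 10000 lives at offset 10064 of the segment, just past
             :  * the short header of the second page.
             :  */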
1904 :
1905 : /*
1906 : * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
1907 : * returns a pointer to the beginning of the page (i.e. before the page header),
1908 : * not to where the first xlog record on that page would go. This is used
1909 : * when converting a pointer to the end of a record.
1910 : */
1911 : static XLogRecPtr
1912 29834740 : XLogBytePosToEndRecPtr(uint64 bytepos)
1913 : {
1914 : uint64 fullsegs;
1915 : uint64 fullpages;
1916 : uint64 bytesleft;
1917 : uint32 seg_offset;
1918 : XLogRecPtr result;
1919 :
1920 29834740 : fullsegs = bytepos / UsableBytesInSegment;
1921 29834740 : bytesleft = bytepos % UsableBytesInSegment;
1922 :
1923 29834740 : if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1924 : {
1925 : /* fits on first page of segment */
1926 160688 : if (bytesleft == 0)
1927 107726 : seg_offset = 0;
1928 : else
1929 52962 : seg_offset = bytesleft + SizeOfXLogLongPHD;
1930 : }
1931 : else
1932 : {
1933 : /* account for the first page on segment with long header */
1934 29674052 : seg_offset = XLOG_BLCKSZ;
1935 29674052 : bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1936 :
1937 29674052 : fullpages = bytesleft / UsableBytesInPage;
1938 29674052 : bytesleft = bytesleft % UsableBytesInPage;
1939 :
1940 29674052 : if (bytesleft == 0)
1941 28938 : seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
1942 : else
1943 29645114 : seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1944 : }
1945 :
1946 29834740 : XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
1947 :
1948 29834740 : return result;
1949 : }
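             :
             : /*
             :  * Worked example (editor's note), using the same assumed sizes as above:
             :  * for bytepos = 8152, exactly the end of the first page's usable space,
             :  *
             :  *   XLogBytePosToRecPtr()    gives seg_offset = 8192 + 0 + 24 = 8216,
             :  *                            where the next record would start;
             :  *   XLogBytePosToEndRecPtr() gives seg_offset = 8192, the page boundary
             :  *                            before the header.
             :  *
             :  * So a record that exactly fills the first page ends at 0/2000 while its
             :  * successor starts at 0/2018.
             :  */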
1950 :
1951 : /*
1952 : * Convert an XLogRecPtr to a "usable byte position".
1953 : */
1954 : static uint64
1955 4886 : XLogRecPtrToBytePos(XLogRecPtr ptr)
1956 : {
1957 : uint64 fullsegs;
1958 : uint32 fullpages;
1959 : uint32 offset;
1960 : uint64 result;
1961 :
1962 4886 : XLByteToSeg(ptr, fullsegs, wal_segment_size);
1963 :
1964 4886 : fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
1965 4886 : offset = ptr % XLOG_BLCKSZ;
1966 :
1967 4886 : if (fullpages == 0)
1968 : {
1969 1954 : result = fullsegs * UsableBytesInSegment;
1970 1954 : if (offset > 0)
1971 : {
1972 : Assert(offset >= SizeOfXLogLongPHD);
1973 486 : result += offset - SizeOfXLogLongPHD;
1974 : }
1975 : }
1976 : else
1977 : {
1978 2932 : result = fullsegs * UsableBytesInSegment +
1979 2932 : (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
1980 2932 : (fullpages - 1) * UsableBytesInPage; /* full pages */
1981 2932 : if (offset > 0)
1982 : {
1983 : Assert(offset >= SizeOfXLogShortPHD);
1984 2910 : result += offset - SizeOfXLogShortPHD;
1985 : }
1986 : }
1987 :
1988 4886 : return result;
1989 : }
1990 :
1991 : /*
1992 : * Initialize XLOG buffers, writing out old buffers if they still contain
1993 : * unwritten data, up to the page containing 'upto'. Or if 'opportunistic' is
1994 : * true, initialize as many pages as we can without having to write out
1995 : * unwritten data. Any new pages are initialized to zeros, with page headers
1996 : * initialized properly.
1997 : */
1998 : static void
1999 5269316 : AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
2000 : {
2001 5269316 : XLogCtlInsert *Insert = &XLogCtl->Insert;
2002 : int nextidx;
2003 : XLogRecPtr OldPageRqstPtr;
2004 : XLogwrtRqst WriteRqst;
2005 5269316 : XLogRecPtr NewPageEndPtr = InvalidXLogRecPtr;
2006 : XLogRecPtr NewPageBeginPtr;
2007 : XLogPageHeader NewPage;
2008 : XLogRecPtr ReservedPtr;
2009 5269316 : int npages pg_attribute_unused() = 0;
2010 :
2011 : /*
2012 : * We must run the loop below inside a critical section, as we expect
2013 : * XLogCtl->InitializedUpTo to eventually keep up. Most callers already
2014 : * run inside a critical section. The exception is the WAL writer, which
2015 : * passes 'opportunistic == true'; in that case we don't perform any
2016 : * operations that could error out.
2017 : *
2018 : * Start an explicit critical section anyway though.
2019 : */
2020 : Assert(CritSectionCount > 0 || opportunistic);
2021 5269316 : START_CRIT_SECTION();
2022 :
2023 : /*--
2024 : * Loop till we get all the pages in the WAL buffer before 'upto' reserved
2025 : * for initialization. Multiple processes can initialize different buffers
2026 : * with this loop in parallel, as follows:
2027 : *
2028 : * 1. Reserve page for initialization using XLogCtl->InitializeReserved.
2029 : * 2. Initialize the reserved page.
2030 : * 3. Attempt to advance XLogCtl->InitializedUpTo.
2031 : */
2032 5269316 : ReservedPtr = pg_atomic_read_u64(&XLogCtl->InitializeReserved);
2033 15293834 : while (upto >= ReservedPtr || opportunistic)
2034 : {
2035 : Assert(ReservedPtr % XLOG_BLCKSZ == 0);
2036 :
2037 : /*
2038 : * Get ending-offset of the buffer page we need to replace.
2039 : *
2040 : * We don't look into xlblocks; instead we calculate the position we
2041 : * must wait to have been written out. If that page was initialized,
2042 : * xlblocks will hold this position (or InvalidXLogRecPtr).
2043 : */
2044 10033374 : if (ReservedPtr + XLOG_BLCKSZ > XLogCtl->InitializedFrom + XLOG_BLCKSZ * XLOGbuffers)
2045 9479862 : OldPageRqstPtr = ReservedPtr + XLOG_BLCKSZ - (XLogRecPtr) XLOG_BLCKSZ * XLOGbuffers;
2046 : else
2047 553512 : OldPageRqstPtr = InvalidXLogRecPtr;
2048 :
2049 10033374 : if (LogwrtResult.Write < OldPageRqstPtr && opportunistic)
2050 : {
2051 : /*
2052 : * If we just want to pre-initialize as much as we can without
2053 : * flushing, give up now.
2054 : */
2055 8856 : upto = ReservedPtr - 1;
2056 8856 : break;
2057 : }
2058 :
2059 : /*
2060 : * Attempt to reserve the page for initialization. Failure means that
2061 : * this page got reserved by another process.
2062 : */
2063 10024518 : if (!pg_atomic_compare_exchange_u64(&XLogCtl->InitializeReserved,
2064 : &ReservedPtr,
2065 : ReservedPtr + XLOG_BLCKSZ))
2066 5017932 : continue;
2067 :
2068 : /*
2069 : * Wait till the page gets correctly initialized up to OldPageRqstPtr.
2070 : */
2071 5006586 : nextidx = XLogRecPtrToBufIdx(ReservedPtr);
2072 5006942 : while (pg_atomic_read_u64(&XLogCtl->InitializedUpTo) < OldPageRqstPtr)
2073 356 : ConditionVariableSleep(&XLogCtl->InitializedUpToCondVar, WAIT_EVENT_WAL_BUFFER_INIT);
2074 5006586 : ConditionVariableCancelSleep();
2075 : Assert(pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]) == OldPageRqstPtr);
2076 :
2077 : /* Fall through if it's already written out. */
2078 5006586 : if (LogwrtResult.Write < OldPageRqstPtr)
2079 : {
2080 : /* Nope, got work to do. */
2081 :
2082 : /* Advance shared memory write request position */
2083 3953612 : SpinLockAcquire(&XLogCtl->info_lck);
2084 3953612 : if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
2085 1232964 : XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
2086 3953612 : SpinLockRelease(&XLogCtl->info_lck);
2087 :
2088 : /*
2089 : * Acquire an up-to-date LogwrtResult value and see if we still
2090 : * need to write it or if someone else already did.
2091 : */
2092 3953612 : RefreshXLogWriteResult(LogwrtResult);
2093 3953612 : if (LogwrtResult.Write < OldPageRqstPtr)
2094 : {
2095 3929646 : WaitXLogInsertionsToFinish(OldPageRqstPtr);
2096 :
2097 3929646 : LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2098 :
2099 3929646 : RefreshXLogWriteResult(LogwrtResult);
2100 3929646 : if (LogwrtResult.Write >= OldPageRqstPtr)
2101 : {
2102 : /* OK, someone wrote it already */
2103 54474 : LWLockRelease(WALWriteLock);
2104 : }
2105 : else
2106 : {
2107 : /* Have to write it ourselves */
2108 : TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
2109 3875172 : WriteRqst.Write = OldPageRqstPtr;
2110 3875172 : WriteRqst.Flush = 0;
2111 3875172 : XLogWrite(WriteRqst, tli, false);
2112 3875172 : LWLockRelease(WALWriteLock);
2113 3875172 : pgWalUsage.wal_buffers_full++;
2114 : TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
2115 :
2116 : /*
2117 : * Required for the flush of pending stats WAL data, per
2118 : * update of pgWalUsage.
2119 : */
2120 3875172 : pgstat_report_fixed = true;
2121 : }
2122 : }
2123 : }
2124 :
2125 : /*
2126 : * Now the next buffer slot is free and we can set it up to be the
2127 : * next output page.
2128 : */
2129 5006586 : NewPageBeginPtr = ReservedPtr;
2130 5006586 : NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
2131 :
2132 5006586 : NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
2133 :
2134 : /*
2135 : * Mark the xlblock with InvalidXLogRecPtr and issue a write barrier
2136 : * before initializing. Otherwise, the old page may be partially
2137 : * zeroed but look valid.
2138 : */
2139 5006586 : pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], InvalidXLogRecPtr);
2140 5006586 : pg_write_barrier();
2141 :
2142 : /*
2143 : * Be sure to re-zero the buffer so that bytes beyond what we've
2144 : * written will look like zeroes and not valid XLOG records...
2145 : */
2146 5006586 : MemSet(NewPage, 0, XLOG_BLCKSZ);
2147 :
2148 : /*
2149 : * Fill the new page's header
2150 : */
2151 5006586 : NewPage->xlp_magic = XLOG_PAGE_MAGIC;
2152 :
2153 : /* NewPage->xlp_info = 0; */ /* done by memset */
2154 5006586 : NewPage->xlp_tli = tli;
2155 5006586 : NewPage->xlp_pageaddr = NewPageBeginPtr;
2156 :
2157 : /* NewPage->xlp_rem_len = 0; */ /* done by memset */
2158 :
2159 : /*
2160 : * If online backup is not in progress, mark the header to indicate
2161 : * that WAL records beginning in this page have removable backup
2162 : * blocks. This allows the WAL archiver to know whether it is safe to
2163 : * compress archived WAL data by transforming full-block records into
2164 : * the non-full-block format. It is sufficient to record this at the
2165 : * page level because we force a page switch (in fact a segment
2166 : * switch) when starting a backup, so the flag will be off before any
2167 : * records can be written during the backup. At the end of a backup,
2168 : * the last page will be marked as all unsafe when perhaps only part
2169 : * is unsafe, but at worst the archiver would miss the opportunity to
2170 : * compress a few records.
2171 : */
2172 5006586 : if (Insert->runningBackups == 0)
2173 4748088 : NewPage->xlp_info |= XLP_BKP_REMOVABLE;
2174 :
2175 : /*
2176 : * If first page of an XLOG segment file, make it a long header.
2177 : */
2178 5006586 : if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
2179 : {
2180 3590 : XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
2181 :
2182 3590 : NewLongPage->xlp_sysid = ControlFile->system_identifier;
2183 3590 : NewLongPage->xlp_seg_size = wal_segment_size;
2184 3590 : NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
2185 3590 : NewPage->xlp_info |= XLP_LONG_HEADER;
2186 : }
2187 :
2188 : /*
2189 : * Make sure the initialization of the page becomes visible to others
2190 : * before the xlblocks update. GetXLogBuffer() reads xlblocks without
2191 : * holding a lock.
2192 : */
2193 5006586 : pg_write_barrier();
2194 :
2195 : /*-----
2196 : * Update the value of XLogCtl->xlblocks[nextidx] and try to advance
2197 : * XLogCtl->InitializedUpTo in a lock-less manner.
2198 : *
2199 : * First, let's provide a formal proof of the algorithm. Let it be 'n'
2200 : * process with the following variables in shared memory:
2201 : * f - an array of 'n' boolean flags,
2202 : * v - atomic integer variable.
2203 : *
2204 : * Also, let
2205 : * i - a number of a process,
2206 : * j - local integer variable,
2207 : * CAS(var, oldval, newval) - compare-and-swap atomic operation
2208 : * returning true on success,
2209 : * write_barrier()/read_barrier() - memory barriers.
2210 : *
2211 : * The pseudocode for each process is the following.
2212 : *
2213 : * j := i
2214 : * f[i] := true
2215 : * write_barrier()
2216 : * while CAS(v, j, j + 1):
2217 : * j := j + 1
2218 : * read_barrier()
2219 : * if not f[j]:
2220 : * break
2221 : *
2222 : * Let's prove that v eventually reaches the value of n.
2223 : * 1. Prove by contradiction. Assume v doesn't reach n and gets stuck
2224 : * at k, where k < n.
2225 : * 2. Process k attempts CAS(v, k, k + 1). If, as we assumed, v
2226 : * gets stuck at k, then this CAS operation must fail. Therefore,
2227 : * v < k when process k attempts CAS(v, k, k + 1).
2228 : * 3. If, as we assumed, v gets stuck at k, then the value k of v
2229 : * must be achieved by some process m, where m < k. The process
2230 : * m must observe f[k] == false. Otherwise, it will later attempt
2231 : * CAS(v, k, k + 1) with success.
2232 : * 4. Therefore, the corresponding read_barrier() (with j == k) on
2233 : * process m was reached before the write_barrier() of process k. But then
2234 : * process k attempts CAS(v, k, k + 1) after process m successfully
2235 : * incremented v to k, and that CAS operation must succeed.
2236 : * That leads to a contradiction. So, there is no such k (k < n)
2237 : * where v gets stuck. Q.E.D.
2238 : *
2239 : * To apply this proof to the code below, we assume
2240 : * XLogCtl->InitializedUpTo will play the role of v with XLOG_BLCKSZ
2241 : * granularity. We also let setting XLogCtl->xlblocks[nextidx] to
2242 : * NewPageEndPtr play the role of setting f[i] to true. Also, note
2243 : * that processes can't concurrently map different xlog locations to
2244 : * the same nextidx because we previously requested that
2245 : * XLogCtl->InitializedUpTo >= OldPageRqstPtr. So, an xlog buffer can
2246 : * be taken for initialization only once the previous initialization
2247 : * takes effect on XLogCtl->InitializedUpTo.
2248 : */
2249 :
2250 5006586 : pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], NewPageEndPtr);
2251 :
2252 5006586 : pg_write_barrier();
2253 :
2254 5076130 : while (pg_atomic_compare_exchange_u64(&XLogCtl->InitializedUpTo, &NewPageBeginPtr, NewPageEndPtr))
2255 : {
2256 5015976 : NewPageBeginPtr = NewPageEndPtr;
2257 5015976 : NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
2258 5015976 : nextidx = XLogRecPtrToBufIdx(NewPageBeginPtr);
2259 :
2260 5015976 : pg_read_barrier();
2261 :
2262 5015976 : if (pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]) != NewPageEndPtr)
2263 : {
2264 : /*
2265 : * Page at nextidx wasn't initialized yet, so we can't move
2266 : * InitializedUpto further. It will be moved by backend which
2267 : * will initialize nextidx.
2268 : */
2269 4946432 : ConditionVariableBroadcast(&XLogCtl->InitializedUpToCondVar);
2270 4946432 : break;
2271 : }
2272 : }
2273 :
2274 5006586 : npages++;
2275 : }
2276 :
2277 5269316 : END_CRIT_SECTION();
2278 :
2279 : /*
2280 : * All the pages in WAL buffer before 'upto' were reserved for
2281 : * initialization. However, some pages might have been reserved by
2282 : * concurrent processes. Wait till they finish initialization.
2283 : */
2284 6690440 : while (upto >= pg_atomic_read_u64(&XLogCtl->InitializedUpTo))
2285 1421124 : ConditionVariableSleep(&XLogCtl->InitializedUpToCondVar, WAIT_EVENT_WAL_BUFFER_INIT);
2286 5269316 : ConditionVariableCancelSleep();
2287 :
2288 5269316 : pg_read_barrier();
2289 :
2290 : #ifdef WAL_DEBUG
2291 : if (XLOG_DEBUG && npages > 0)
2292 : {
2293 : elog(DEBUG1, "initialized %d pages, up to %X/%08X",
2294 : npages, LSN_FORMAT_ARGS(NewPageEndPtr));
2295 : }
2296 : #endif
2297 5269316 : }
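             :
             : /*
             :  * Illustrative sketch (editor's note, not part of xlog.c): the proof's
             :  * pseudocode above rendered as compilable C11. Worker i initializes its
             :  * own slot, then tries to advance the shared counter v past every slot
             :  * it can see initialized; v eventually reaches NPROC once all workers
             :  * have run. The release store and acquire load play the roles of the
             :  * write_barrier()/read_barrier() pair in the proof.
             :  */
             : #include <stdatomic.h>
             : #include <stdbool.h>
             :
             : #define NPROC 4
             :
             : static atomic_uint v;           /* plays XLogCtl->InitializedUpTo */
             : static atomic_bool f[NPROC];    /* f[i] plays xlblocks[i] being set */
             :
             : static void
             : worker(unsigned i)
             : {
             :     unsigned j = i;
             :
             :     atomic_store_explicit(&f[i], true, memory_order_release);
             :     while (atomic_compare_exchange_strong(&v, &j, j + 1))
             :     {
             :         j = j + 1;
             :         if (j >= NPROC)
             :             break;              /* advanced past the last slot */
             :         if (!atomic_load_explicit(&f[j], memory_order_acquire))
             :             break;              /* slot j's owner will advance v */
             :     }
             :     /* on CAS failure, some other worker owns the advance of v */
             : }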
2298 :
2299 : /*
2300 : * Calculate CheckPointSegments based on max_wal_size_mb and
2301 : * checkpoint_completion_target.
2302 : */
2303 : static void
2304 15180 : CalculateCheckpointSegments(void)
2305 : {
2306 : double target;
2307 :
2308 : /*-------
2309 : * Calculate the distance at which to trigger a checkpoint, to avoid
2310 : * exceeding max_wal_size_mb. This is based on two assumptions:
2311 : *
2312 : * a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
2313 : * WAL for two checkpoint cycles to allow us to recover from the
2314 : * secondary checkpoint if the first checkpoint failed, though we
2315 : * only did this on the primary anyway, not on standby. Keeping just
2316 : * one checkpoint simplifies processing and reduces disk space in
2317 : * many smaller databases.)
2318 : * b) during checkpoint, we consume checkpoint_completion_target *
2319 : * number of segments consumed between checkpoints.
2320 : *-------
2321 : */
2322 15180 : target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
2323 15180 : (1.0 + CheckPointCompletionTarget);
2324 :
2325 : /* round down */
2326 15180 : CheckPointSegments = (int) target;
2327 :
2328 15180 : if (CheckPointSegments < 1)
2329 20 : CheckPointSegments = 1;
2330 15180 : }
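             :
             : /*
             :  * Worked example (editor's note): with the assumed defaults
             :  * max_wal_size = 1GB, wal_segment_size = 16MB and
             :  * checkpoint_completion_target = 0.9:
             :  *
             :  *   ConvertToXSegs(1024, 16MB) = 64 segments
             :  *   target = 64 / (1.0 + 0.9) = 33.68...
             :  *   CheckPointSegments = 33
             :  *
             :  * so a checkpoint is triggered after ~33 segments, leaving headroom for
             :  * the further WAL generated while the checkpoint itself completes.
             :  */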
2331 :
2332 : void
2333 11032 : assign_max_wal_size(int newval, void *extra)
2334 : {
2335 11032 : max_wal_size_mb = newval;
2336 11032 : CalculateCheckpointSegments();
2337 11032 : }
2338 :
2339 : void
2340 2204 : assign_checkpoint_completion_target(double newval, void *extra)
2341 : {
2342 2204 : CheckPointCompletionTarget = newval;
2343 2204 : CalculateCheckpointSegments();
2344 2204 : }
2345 :
2346 : bool
2347 4250 : check_wal_segment_size(int *newval, void **extra, GucSource source)
2348 : {
2349 4250 : if (!IsValidWalSegSize(*newval))
2350 : {
2351 0 : GUC_check_errdetail("The WAL segment size must be a power of two between 1 MB and 1 GB.");
2352 0 : return false;
2353 : }
2354 :
2355 4250 : return true;
2356 : }
2357 :
2358 : /*
2359 : * At a checkpoint, how many WAL segments to recycle as preallocated future
2360 : * XLOG segments? Returns the highest segment that should be preallocated.
2361 : */
2362 : static XLogSegNo
2363 3362 : XLOGfileslop(XLogRecPtr lastredoptr)
2364 : {
2365 : XLogSegNo minSegNo;
2366 : XLogSegNo maxSegNo;
2367 : double distance;
2368 : XLogSegNo recycleSegNo;
2369 :
2370 : /*
2371 : * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
2372 : * correspond to. Always recycle enough segments to meet the minimum, and
2373 : * remove enough segments to stay below the maximum.
2374 : */
2375 3362 : minSegNo = lastredoptr / wal_segment_size +
2376 3362 : ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1;
2377 3362 : maxSegNo = lastredoptr / wal_segment_size +
2378 3362 : ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1;
2379 :
2380 : /*
2381 : * Between those limits, recycle enough segments to get us through to the
2382 : * estimated end of next checkpoint.
2383 : *
2384 : * To estimate where the next checkpoint will finish, assume that the
2385 : * system runs steadily consuming CheckPointDistanceEstimate bytes between
2386 : * every checkpoint.
2387 : */
2388 3362 : distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
2389 : /* add 10% for good measure. */
2390 3362 : distance *= 1.10;
2391 :
2392 3362 : recycleSegNo = (XLogSegNo) ceil(((double) lastredoptr + distance) /
2393 : wal_segment_size);
2394 :
2395 3362 : if (recycleSegNo < minSegNo)
2396 2384 : recycleSegNo = minSegNo;
2397 3362 : if (recycleSegNo > maxSegNo)
2398 760 : recycleSegNo = maxSegNo;
2399 :
2400 3362 : return recycleSegNo;
2401 : }
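             :
             : /*
             :  * Worked example (editor's note): assume 16MB segments, the defaults
             :  * min_wal_size = 80MB (5 segments) and max_wal_size = 1GB (64 segments),
             :  * checkpoint_completion_target = 0.9, a redo pointer at the start of
             :  * segment 100, and CheckPointDistanceEstimate = 160MB. Then:
             :  *
             :  *   minSegNo = 100 + 5 - 1  = 104
             :  *   maxSegNo = 100 + 64 - 1 = 163
             :  *   distance = 1.9 * 160MB * 1.10 = 334.4MB  (~20.9 segments)
             :  *   recycleSegNo = ceil(100 + 20.9) = 121
             :  *
             :  * 121 lies within [104, 163], so segments are recycled up to number 121.
             :  */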
2402 :
2403 : /*
2404 : * Check whether we've consumed enough xlog space that a checkpoint is needed.
2405 : *
2406 : * new_segno indicates a log file that has just been filled up (or read
2407 : * during recovery). We measure the distance from RedoRecPtr to new_segno
2408 : * and see if that exceeds CheckPointSegments.
2409 : *
2410 : * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
2411 : */
2412 : bool
2413 9928 : XLogCheckpointNeeded(XLogSegNo new_segno)
2414 : {
2415 : XLogSegNo old_segno;
2416 :
2417 9928 : XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size);
2418 :
2419 9928 : if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
2420 6360 : return true;
2421 3568 : return false;
2422 : }
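             :
             : /*
             :  * Worked example (editor's note): if RedoRecPtr falls in segment 100 and
             :  * CheckPointSegments = 33 (per the example above), the test
             :  * new_segno >= 100 + (33 - 1) first fires when segment 132 fills, i.e.
             :  * after roughly 32 segments of WAL since the last checkpoint's redo point.
             :  */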
2423 :
2424 : /*
2425 : * Write and/or fsync the log at least as far as WriteRqst indicates.
2426 : *
2427 : * If flexible == true, we don't have to write as far as WriteRqst, but
2428 : * may stop at any convenient boundary (such as a cache or logfile boundary).
2429 : * This option allows us to avoid uselessly issuing multiple writes when a
2430 : * single one would do.
2431 : *
2432 : * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
2433 : * must be called before grabbing the lock, to make sure the data is ready to
2434 : * write.
2435 : */
2436 : static void
2437 4134472 : XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
2438 : {
2439 : bool ispartialpage;
2440 : bool last_iteration;
2441 : bool finishing_seg;
2442 : int curridx;
2443 : int npages;
2444 : int startidx;
2445 : uint32 startoffset;
2446 :
2447 : /* We should always be inside a critical section here */
2448 : Assert(CritSectionCount > 0);
2449 :
2450 : /*
2451 : * Update local LogwrtResult (caller probably did this already, but...)
2452 : */
2453 4134472 : RefreshXLogWriteResult(LogwrtResult);
2454 :
2455 : /*
2456 : * Since successive pages in the xlog cache are consecutively allocated,
2457 : * we can usually gather multiple pages together and issue just one
2458 : * write() call. npages is the number of pages we have determined can be
2459 : * written together; startidx is the cache block index of the first one,
2460 : * and startoffset is the file offset at which it should go. The latter
2461 : * two variables are only valid when npages > 0, but we must initialize
2462 : * all of them to keep the compiler quiet.
2463 : */
2464 4134472 : npages = 0;
2465 4134472 : startidx = 0;
2466 4134472 : startoffset = 0;
2467 :
2468 : /*
2469 : * Within the loop, curridx is the cache block index of the page to
2470 : * consider writing. Begin at the buffer containing the next unwritten
2471 : * page, or last partially written page.
2472 : */
2473 4134472 : curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
2474 :
2475 9036778 : while (LogwrtResult.Write < WriteRqst.Write)
2476 : {
2477 : /*
2478 : * Make sure we're not ahead of the insert process. This could happen
2479 : * if we're passed a bogus WriteRqst.Write that is past the end of the
2480 : * last page that's been initialized by AdvanceXLInsertBuffer.
2481 : */
2482 5152064 : XLogRecPtr EndPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[curridx]);
2483 :
2484 5152064 : if (LogwrtResult.Write >= EndPtr)
2485 0 : elog(PANIC, "xlog write request %X/%08X is past end of log %X/%08X",
2486 : LSN_FORMAT_ARGS(LogwrtResult.Write),
2487 : LSN_FORMAT_ARGS(EndPtr));
2488 :
2489 : /* Advance LogwrtResult.Write to end of current buffer page */
2490 5152064 : LogwrtResult.Write = EndPtr;
2491 5152064 : ispartialpage = WriteRqst.Write < LogwrtResult.Write;
2492 :
2493 5152064 : if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
2494 : wal_segment_size))
2495 : {
2496 : /*
2497 : * Switch to new logfile segment. We cannot have any pending
2498 : * pages here (since we dump what we have at segment end).
2499 : */
2500 : Assert(npages == 0);
2501 26198 : if (openLogFile >= 0)
2502 11900 : XLogFileClose();
2503 26198 : XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2504 : wal_segment_size);
2505 26198 : openLogTLI = tli;
2506 :
2507 : /* create/use new log file */
2508 26198 : openLogFile = XLogFileInit(openLogSegNo, tli);
2509 26198 : ReserveExternalFD();
2510 : }
2511 :
2512 : /* Make sure we have the current logfile open */
2513 5152064 : if (openLogFile < 0)
2514 : {
2515 0 : XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2516 : wal_segment_size);
2517 0 : openLogTLI = tli;
2518 0 : openLogFile = XLogFileOpen(openLogSegNo, tli);
2519 0 : ReserveExternalFD();
2520 : }
2521 :
2522 : /* Add current page to the set of pending pages-to-dump */
2523 5152064 : if (npages == 0)
2524 : {
2525 : /* first of group */
2526 4170222 : startidx = curridx;
2527 4170222 : startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ,
2528 : wal_segment_size);
2529 : }
2530 5152064 : npages++;
2531 :
2532 : /*
2533 : * Dump the set if this will be the last loop iteration, or if we are
2534 : * at the last page of the cache area (since the next page won't be
2535 : * contiguous in memory), or if we are at the end of the logfile
2536 : * segment.
2537 : */
2538 5152064 : last_iteration = WriteRqst.Write <= LogwrtResult.Write;
2539 :
2540 10061546 : finishing_seg = !ispartialpage &&
2541 4909482 : (startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;
2542 :
2543 5152064 : if (last_iteration ||
2544 1018984 : curridx == XLogCtl->XLogCacheBlck ||
2545 : finishing_seg)
2546 : {
2547 : char *from;
2548 : Size nbytes;
2549 : Size nleft;
2550 : ssize_t written;
2551 : instr_time start;
2552 :
2553 : /* OK to write the page(s) */
2554 4170222 : from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
2555 4170222 : nbytes = npages * (Size) XLOG_BLCKSZ;
2556 4170222 : nleft = nbytes;
2557 : do
2558 : {
2559 4170222 : errno = 0;
2560 :
2561 : /*
2562 : * Measure I/O timing to write WAL data, for pg_stat_io.
2563 : */
2564 4170222 : start = pgstat_prepare_io_time(track_wal_io_timing);
2565 :
2566 4170222 : pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
2567 4170222 : written = pg_pwrite(openLogFile, from, nleft, startoffset);
2568 4170222 : pgstat_report_wait_end();
2569 :
2570 4170222 : pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL,
2571 : IOOP_WRITE, start, 1, written);
2572 :
2573 4170222 : if (written <= 0)
2574 : {
2575 : char xlogfname[MAXFNAMELEN];
2576 : int save_errno;
2577 :
2578 0 : if (errno == EINTR)
2579 0 : continue;
2580 :
2581 0 : save_errno = errno;
2582 0 : XLogFileName(xlogfname, tli, openLogSegNo,
2583 : wal_segment_size);
2584 0 : errno = save_errno;
2585 0 : ereport(PANIC,
2586 : (errcode_for_file_access(),
2587 : errmsg("could not write to log file \"%s\" at offset %u, length %zu: %m",
2588 : xlogfname, startoffset, nleft)));
2589 : }
2590 4170222 : nleft -= written;
2591 4170222 : from += written;
2592 4170222 : startoffset += written;
2593 4170222 : } while (nleft > 0);
2594 :
2595 4170222 : npages = 0;
2596 :
2597 : /*
2598 : * If we just wrote the whole last page of a logfile segment,
2599 : * fsync the segment immediately. This avoids having to go back
2600 : * and re-open prior segments when an fsync request comes along
2601 : * later. Doing it here ensures that one and only one backend will
2602 : * perform this fsync.
2603 : *
2604 : * This is also the right place to notify the Archiver that the
2605 : * segment is ready to copy to archival storage, and to update the
2606 : * timer for archive_timeout, and to signal for a checkpoint if
2607 : * too many logfile segments have been used since the last
2608 : * checkpoint.
2609 : */
2610 4170222 : if (finishing_seg)
2611 : {
2612 3814 : issue_xlog_fsync(openLogFile, openLogSegNo, tli);
2613 :
2614 : /* signal that we need to wakeup walsenders later */
2615 3814 : WalSndWakeupRequest();
2616 :
2617 3814 : LogwrtResult.Flush = LogwrtResult.Write; /* end of page */
2618 :
2619 3814 : if (XLogArchivingActive())
2620 810 : XLogArchiveNotifySeg(openLogSegNo, tli);
2621 :
2622 3814 : XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
2623 3814 : XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;
2624 :
2625 : /*
2626 : * Request a checkpoint if we've consumed too much xlog since
2627 : * the last one. For speed, we first check using the local
2628 : * copy of RedoRecPtr, which might be out of date; if it looks
2629 : * like a checkpoint is needed, forcibly update RedoRecPtr and
2630 : * recheck.
2631 : */
2632 3814 : if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
2633 : {
2634 476 : (void) GetRedoRecPtr();
2635 476 : if (XLogCheckpointNeeded(openLogSegNo))
2636 384 : RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
2637 : }
2638 : }
2639 : }
2640 :
2641 5152064 : if (ispartialpage)
2642 : {
2643 : /* Only asked to write a partial page */
2644 242582 : LogwrtResult.Write = WriteRqst.Write;
2645 242582 : break;
2646 : }
2647 4909482 : curridx = NextBufIdx(curridx);
2648 :
2649 : /* If flexible, break out of loop as soon as we wrote something */
2650 4909482 : if (flexible && npages == 0)
2651 7176 : break;
2652 : }
2653 :
2654 : Assert(npages == 0);
2655 :
2656 : /*
2657 : * If asked to flush, do so
2658 : */
2659 4134472 : if (LogwrtResult.Flush < WriteRqst.Flush &&
2660 257834 : LogwrtResult.Flush < LogwrtResult.Write)
2661 : {
2662 : /*
2663 : * Could get here without iterating the above loop, in which case we might
2664 : * have no open file or the wrong one. However, we do not need to
2665 : * fsync more than one file.
2666 : */
2667 257692 : if (wal_sync_method != WAL_SYNC_METHOD_OPEN &&
2668 257692 : wal_sync_method != WAL_SYNC_METHOD_OPEN_DSYNC)
2669 : {
2670 257692 : if (openLogFile >= 0 &&
2671 257676 : !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
2672 : wal_segment_size))
2673 48 : XLogFileClose();
2674 257692 : if (openLogFile < 0)
2675 : {
2676 64 : XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2677 : wal_segment_size);
2678 64 : openLogTLI = tli;
2679 64 : openLogFile = XLogFileOpen(openLogSegNo, tli);
2680 64 : ReserveExternalFD();
2681 : }
2682 :
2683 257692 : issue_xlog_fsync(openLogFile, openLogSegNo, tli);
2684 : }
2685 :
2686 : /* signal that we need to wakeup walsenders later */
2687 257692 : WalSndWakeupRequest();
2688 :
2689 257692 : LogwrtResult.Flush = LogwrtResult.Write;
2690 : }
2691 :
2692 : /*
2693 : * Update shared-memory status
2694 : *
2695 : * We make sure that the shared 'request' values do not fall behind the
2696 : * 'result' values. This is not absolutely essential, but it saves some
2697 : * code in a couple of places.
2698 : */
2699 4134472 : SpinLockAcquire(&XLogCtl->info_lck);
2700 4134472 : if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
2701 225148 : XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
2702 4134472 : if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
2703 260706 : XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
2704 4134472 : SpinLockRelease(&XLogCtl->info_lck);
2705 :
2706 : /*
2707 : * We write Write first, then a write barrier, then Flush. When reading,
2708 : * the opposite order must be used (with a matching barrier in between), so
2709 : * that we always see a Flush value that trails behind the Write value seen.
2710 : */
2711 4134472 : pg_atomic_write_u64(&XLogCtl->logWriteResult, LogwrtResult.Write);
2712 4134472 : pg_write_barrier();
2713 4134472 : pg_atomic_write_u64(&XLogCtl->logFlushResult, LogwrtResult.Flush);
2714 :
2715 : #ifdef USE_ASSERT_CHECKING
2716 : {
2717 : XLogRecPtr Flush;
2718 : XLogRecPtr Write;
2719 : XLogRecPtr Insert;
2720 :
2721 : Flush = pg_atomic_read_u64(&XLogCtl->logFlushResult);
2722 : pg_read_barrier();
2723 : Write = pg_atomic_read_u64(&XLogCtl->logWriteResult);
2724 : pg_read_barrier();
2725 : Insert = pg_atomic_read_u64(&XLogCtl->logInsertResult);
2726 :
2727 : /* WAL written to disk is always ahead of WAL flushed */
2728 : Assert(Write >= Flush);
2729 :
2730 : /* WAL inserted to buffers is always ahead of WAL written */
2731 : Assert(Insert >= Write);
2732 : }
2733 : #endif
2734 4134472 : }
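             :
             : /*
             :  * Illustrative sketch (editor's note, not part of xlog.c): the
             :  * Write/Flush publication order above, in C11 atomics. Writing Write
             :  * before Flush (with a barrier) and reading them in the opposite order
             :  * (with a matching barrier) guarantees a reader never observes Flush
             :  * ahead of Write. The fences stand in approximately for
             :  * pg_write_barrier()/pg_read_barrier(); names are hypothetical.
             :  */
             : #include <stdatomic.h>
             :
             : static _Atomic unsigned long long writeResultDemo, flushResultDemo;
             :
             : static void
             : publish_results(unsigned long long write_lsn, unsigned long long flush_lsn)
             : {
             :     atomic_store_explicit(&writeResultDemo, write_lsn, memory_order_relaxed);
             :     atomic_thread_fence(memory_order_release);   /* ~ pg_write_barrier() */
             :     atomic_store_explicit(&flushResultDemo, flush_lsn, memory_order_relaxed);
             : }
             :
             : static void
             : read_results(unsigned long long *write_lsn, unsigned long long *flush_lsn)
             : {
             :     *flush_lsn = atomic_load_explicit(&flushResultDemo, memory_order_relaxed);
             :     atomic_thread_fence(memory_order_acquire);   /* ~ pg_read_barrier() */
             :     *write_lsn = atomic_load_explicit(&writeResultDemo, memory_order_relaxed);
             :     /* invariant on return: *flush_lsn <= *write_lsn */
             : }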
2735 :
2736 : /*
2737 : * Record the LSN for an asynchronous transaction commit/abort
2738 : * and nudge the WALWriter if there is work for it to do.
2739 : * (This should not be called for synchronous commits.)
2740 : */
2741 : void
2742 94694 : XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2743 : {
2744 94694 : XLogRecPtr WriteRqstPtr = asyncXactLSN;
2745 : bool sleeping;
2746 94694 : bool wakeup = false;
2747 : XLogRecPtr prevAsyncXactLSN;
2748 :
2749 94694 : SpinLockAcquire(&XLogCtl->info_lck);
2750 94694 : sleeping = XLogCtl->WalWriterSleeping;
2751 94694 : prevAsyncXactLSN = XLogCtl->asyncXactLSN;
2752 94694 : if (XLogCtl->asyncXactLSN < asyncXactLSN)
2753 93782 : XLogCtl->asyncXactLSN = asyncXactLSN;
2754 94694 : SpinLockRelease(&XLogCtl->info_lck);
2755 :
2756 : /*
2757 : * If somebody else already called this function with a more aggressive
2758 : * LSN, they will have done what we needed (and perhaps more).
2759 : */
2760 94694 : if (asyncXactLSN <= prevAsyncXactLSN)
2761 912 : return;
2762 :
2763 : /*
2764 : * If the WALWriter is sleeping, kick it to make it come out of low-power
2765 : * mode, so that this async commit will reach disk within the expected
2766 : * amount of time. Otherwise, determine whether it has enough WAL
2767 : * available to flush, the same way that XLogBackgroundFlush() does.
2768 : */
2769 93782 : if (sleeping)
2770 40 : wakeup = true;
2771 : else
2772 : {
2773 : int flushblocks;
2774 :
2775 93742 : RefreshXLogWriteResult(LogwrtResult);
2776 :
2777 93742 : flushblocks =
2778 93742 : WriteRqstPtr / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
2779 :
2780 93742 : if (WalWriterFlushAfter == 0 || flushblocks >= WalWriterFlushAfter)
2781 7380 : wakeup = true;
2782 : }
2783 :
2784 93782 : if (wakeup)
2785 : {
2786 7420 : volatile PROC_HDR *procglobal = ProcGlobal;
2787 7420 : ProcNumber walwriterProc = procglobal->walwriterProc;
2788 :
2789 7420 : if (walwriterProc != INVALID_PROC_NUMBER)
2790 376 : SetLatch(&GetPGProcByNumber(walwriterProc)->procLatch);
2791 : }
2792 : }
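             :
             : /*
             :  * Worked example (editor's note): with XLOG_BLCKSZ = 8192 and the assumed
             :  * default wal_writer_flush_after = 1MB (WalWriterFlushAfter = 128 pages),
             :  * an async commit wakes a sleeping WAL writer immediately, but nudges a
             :  * busy one only when
             :  *
             :  *   flushblocks = WriteRqstPtr/8192 - LogwrtResult.Flush/8192 >= 128
             :  *
             :  * i.e. once at least ~1MB of WAL is pending beyond the last flush.
             :  */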
2793 :
2794 : /*
2795 : * Record the LSN up to which we can remove WAL because it's not required by
2796 : * any replication slot.
2797 : */
2798 : void
2799 47816 : XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
2800 : {
2801 47816 : SpinLockAcquire(&XLogCtl->info_lck);
2802 47816 : XLogCtl->replicationSlotMinLSN = lsn;
2803 47816 : SpinLockRelease(&XLogCtl->info_lck);
2804 47816 : }
2805 :
2806 :
2807 : /*
2808 : * Return the oldest LSN we must retain to satisfy the needs of some
2809 : * replication slot.
2810 : */
2811 : static XLogRecPtr
2812 4280 : XLogGetReplicationSlotMinimumLSN(void)
2813 : {
2814 : XLogRecPtr retval;
2815 :
2816 4280 : SpinLockAcquire(&XLogCtl->info_lck);
2817 4280 : retval = XLogCtl->replicationSlotMinLSN;
2818 4280 : SpinLockRelease(&XLogCtl->info_lck);
2819 :
2820 4280 : return retval;
2821 : }
2822 :
2823 : /*
2824 : * Advance minRecoveryPoint in control file.
2825 : *
2826 : * If we crash during recovery, we must reach this point again before the
2827 : * database is consistent.
2828 : *
2829 : * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
2830 : * is only updated if it's not already greater than or equal to 'lsn'.
2831 : */
2832 : static void
2833 208226 : UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
2834 : {
2835 : /* Quick check using our local copy of the variable */
2836 208226 : if (!updateMinRecoveryPoint || (!force && lsn <= LocalMinRecoveryPoint))
2837 195300 : return;
2838 :
2839 : /*
2840 : * An invalid minRecoveryPoint means that we need to recover all the WAL,
2841 : * i.e., we're doing crash recovery. We never modify the control file's
2842 : * value in that case, so we can short-circuit future checks here too. The
2843 : * local values of minRecoveryPoint and minRecoveryPointTLI should not be
2844 : * updated until crash recovery finishes. We only do this for the startup
2845 : * process as it should not update its own reference of minRecoveryPoint
2846 : * until it has finished crash recovery to make sure that all WAL
2847 : * available is replayed in this case. This also spares the startup
2848 : * process some extra acquisitions of the control file lock.
2849 : */
2850 12926 : if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery)
2851 : {
2852 60 : updateMinRecoveryPoint = false;
2853 60 : return;
2854 : }
2855 :
2856 12866 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2857 :
2858 : /* update local copy */
2859 12866 : LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
2860 12866 : LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2861 :
2862 12866 : if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint))
2863 4 : updateMinRecoveryPoint = false;
2864 12862 : else if (force || LocalMinRecoveryPoint < lsn)
2865 : {
2866 : XLogRecPtr newMinRecoveryPoint;
2867 : TimeLineID newMinRecoveryPointTLI;
2868 :
2869 : /*
2870 : * To avoid having to update the control file too often, we update it
2871 : * all the way to the last record being replayed, even though 'lsn'
2872 : * would suffice for correctness. This also allows the 'force' case
2873 : * to not need a valid 'lsn' value.
2874 : *
2875 : * Another important reason for doing it this way is that the passed
2876 : * 'lsn' value could be bogus, i.e., past the end of available WAL, if
2877 : * the caller got it from a corrupted heap page. Accepting such a
2878 : * value as the min recovery point would prevent us from coming up at
2879 : * all. Instead, we just log a warning and continue with recovery.
2880 : * (See also the comments about corrupt LSNs in XLogFlush.)
2881 : */
2882 10630 : newMinRecoveryPoint = GetCurrentReplayRecPtr(&newMinRecoveryPointTLI);
2883 10630 : if (!force && newMinRecoveryPoint < lsn)
2884 0 : elog(WARNING,
2885 : "xlog min recovery request %X/%08X is past current point %X/%08X",
2886 : LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(newMinRecoveryPoint));
2887 :
2888 : /* update control file */
2889 10630 : if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
2890 : {
2891 9936 : ControlFile->minRecoveryPoint = newMinRecoveryPoint;
2892 9936 : ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
2893 9936 : UpdateControlFile();
2894 9936 : LocalMinRecoveryPoint = newMinRecoveryPoint;
2895 9936 : LocalMinRecoveryPointTLI = newMinRecoveryPointTLI;
2896 :
2897 9936 : ereport(DEBUG2,
2898 : errmsg_internal("updated min recovery point to %X/%08X on timeline %u",
2899 : LSN_FORMAT_ARGS(newMinRecoveryPoint),
2900 : newMinRecoveryPointTLI));
2901 : }
2902 : }
2903 12866 : LWLockRelease(ControlFileLock);
2904 : }
2905 :
2906 : /*
2907 : * Ensure that all XLOG data through the given position is flushed to disk.
2908 : *
2909 : * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
2910 : * already held, and we try to avoid acquiring it if possible.
2911 : */
2912 : void
2913 1351384 : XLogFlush(XLogRecPtr record)
2914 : {
2915 : XLogRecPtr WriteRqstPtr;
2916 : XLogwrtRqst WriteRqst;
2917 1351384 : TimeLineID insertTLI = XLogCtl->InsertTimeLineID;
2918 :
2919 : /*
2920             : * During REDO, we are reading, not writing, WAL. Therefore, instead of
2921             : * trying to flush the WAL, we update minRecoveryPoint. We test
2922             : * XLogInsertAllowed(), not InRecovery, because we need the checkpointer
2923             : * to act this way too, and because when it tries to write the
2924 : * end-of-recovery checkpoint, it should indeed flush.
2925 : */
2926 1351384 : if (!XLogInsertAllowed())
2927 : {
2928 207334 : UpdateMinRecoveryPoint(record, false);
2929 1075846 : return;
2930 : }
2931 :
2932 : /* Quick exit if already known flushed */
2933 1144050 : if (record <= LogwrtResult.Flush)
2934 868512 : return;
2935 :
2936 : #ifdef WAL_DEBUG
2937 : if (XLOG_DEBUG)
2938 : elog(LOG, "xlog flush request %X/%08X; write %X/%08X; flush %X/%08X",
2939 : LSN_FORMAT_ARGS(record),
2940 : LSN_FORMAT_ARGS(LogwrtResult.Write),
2941 : LSN_FORMAT_ARGS(LogwrtResult.Flush));
2942 : #endif
2943 :
2944 275538 : START_CRIT_SECTION();
2945 :
2946 : /*
2947 : * Since fsync is usually a horribly expensive operation, we try to
2948 : * piggyback as much data as we can on each fsync: if we see any more data
2949 : * entered into the xlog buffer, we'll write and fsync that too, so that
2950 : * the final value of LogwrtResult.Flush is as large as possible. This
2951 : * gives us some chance of avoiding another fsync immediately after.
2952 : */
2953 :
2954 : /* initialize to given target; may increase below */
2955 275538 : WriteRqstPtr = record;
2956 :
2957 : /*
2958 : * Now wait until we get the write lock, or someone else does the flush
2959 : * for us.
2960 : */
2961 : for (;;)
2962 10350 : {
2963 : XLogRecPtr insertpos;
2964 :
2965 : /* done already? */
2966 285888 : RefreshXLogWriteResult(LogwrtResult);
2967 285888 : if (record <= LogwrtResult.Flush)
2968 21886 : break;
2969 :
2970 : /*
2971 : * Before actually performing the write, wait for all in-flight
2972 : * insertions to the pages we're about to write to finish.
2973 : */
2974 264002 : SpinLockAcquire(&XLogCtl->info_lck);
2975 264002 : if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
2976 18428 : WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
2977 264002 : SpinLockRelease(&XLogCtl->info_lck);
2978 264002 : insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
2979 :
2980 : /*
2981 : * Try to get the write lock. If we can't get it immediately, wait
2982 : * until it's released, and recheck if we still need to do the flush
2983 : * or if the backend that held the lock did it for us already. This
2984 : * helps to maintain a good rate of group committing when the system
2985 : * is bottlenecked by the speed of fsyncing.
2986 : */
2987 264002 : if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
2988 : {
2989 : /*
2990 : * The lock is now free, but we didn't acquire it yet. Before we
2991 : * do, loop back to check if someone else flushed the record for
2992 : * us already.
2993 : */
2994 10350 : continue;
2995 : }
2996 :
2997 : /* Got the lock; recheck whether request is satisfied */
2998 253652 : RefreshXLogWriteResult(LogwrtResult);
2999 253652 : if (record <= LogwrtResult.Flush)
3000 : {
3001 2844 : LWLockRelease(WALWriteLock);
3002 2844 : break;
3003 : }
3004 :
3005 : /*
3006 : * Sleep before flush! By adding a delay here, we may give further
3007 : * backends the opportunity to join the backlog of group commit
3008 : * followers; this can significantly improve transaction throughput,
3009 : * at the risk of increasing transaction latency.
3010 : *
3011 : * We do not sleep if enableFsync is not turned on, nor if there are
3012 : * fewer than CommitSiblings other backends with active transactions.
3013 : */
3014 250808 : if (CommitDelay > 0 && enableFsync &&
3015 0 : MinimumActiveBackends(CommitSiblings))
3016 : {
3017 0 : pg_usleep(CommitDelay);
3018 :
3019 : /*
3020 : * Re-check how far we can now flush the WAL. It's generally not
3021 : * safe to call WaitXLogInsertionsToFinish while holding
3022 : * WALWriteLock, because an in-progress insertion might need to
3023 : * also grab WALWriteLock to make progress. But we know that all
3024 : * the insertions up to insertpos have already finished, because
3025 : * that's what the earlier WaitXLogInsertionsToFinish() returned.
3026 : * We're only calling it again to allow insertpos to be moved
3027 : * further forward, not to actually wait for anyone.
3028 : */
3029 0 : insertpos = WaitXLogInsertionsToFinish(insertpos);
3030 : }
3031 :
3032 : /* try to write/flush later additions to XLOG as well */
3033 250808 : WriteRqst.Write = insertpos;
3034 250808 : WriteRqst.Flush = insertpos;
3035 :
3036 250808 : XLogWrite(WriteRqst, insertTLI, false);
3037 :
3038 250808 : LWLockRelease(WALWriteLock);
3039 : /* done */
3040 250808 : break;
3041 : }
3042 :
3043 275538 : END_CRIT_SECTION();
3044 :
3045 : /* wake up walsenders now that we've released heavily contended locks */
3046 275538 : WalSndWakeupProcessRequests(true, !RecoveryInProgress());
3047 :
3048 : /*
3049 : * If we still haven't flushed to the request point then we have a
3050 : * problem; most likely, the requested flush point is past end of XLOG.
3051 : * This has been seen to occur when a disk page has a corrupted LSN.
3052 : *
3053 : * Formerly we treated this as a PANIC condition, but that hurts the
3054 : * system's robustness rather than helping it: we do not want to take down
3055 : * the whole system due to corruption on one data page. In particular, if
3056 : * the bad page is encountered again during recovery then we would be
3057 : * unable to restart the database at all! (This scenario actually
3058 : * happened in the field several times with 7.1 releases.) As of 8.4, bad
3059 : * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
3060 : * the only time we can reach here during recovery is while flushing the
3061 : * end-of-recovery checkpoint record, and we don't expect that to have a
3062 : * bad LSN.
3063 : *
3064 : * Note that for calls from xact.c, the ERROR will be promoted to PANIC
3065 : * since xact.c calls this routine inside a critical section. However,
3066 : * calls from bufmgr.c are not within critical sections and so we will not
3067 : * force a restart for a bad LSN on a data page.
3068 : */
3069 275538 : if (LogwrtResult.Flush < record)
3070 0 : elog(ERROR,
3071 : "xlog flush request %X/%08X is not satisfied --- flushed only to %X/%08X",
3072 : LSN_FORMAT_ARGS(record),
3073 : LSN_FORMAT_ARGS(LogwrtResult.Flush));
3074 : }
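/*
 * Illustrative sketch (not part of xlog.c): the typical calling pattern
 * around XLogFlush. A caller constructs a record with the xloginsert.c
 * facilities and then forces it to disk before acknowledging a commit.
 * RM_MYMODULE_ID, the zero info byte, and 'payload' are assumptions for
 * illustration only; a real record would also register buffers as needed.
 */
static void
example_durable_insert(void)
{
	char		payload[] = "payload";
	XLogRecPtr	lsn;

	XLogBeginInsert();
	XLogRegisterData(payload, sizeof(payload));

	/* XLogInsert returns the end LSN of the new record */
	lsn = XLogInsert(RM_MYMODULE_ID, 0);

	/* make the record durable through that LSN */
	XLogFlush(lsn);
}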
3075 :
3076 : /*
3077 : * Write & flush xlog, but without specifying exactly where to.
3078 : *
3079 : * We normally write only completed blocks; but if there is nothing to do on
3080 : * that basis, we check for unwritten async commits in the current incomplete
3081 : * block, and write through the latest one of those. Thus, if async commits
3082 : * are not being used, we will write complete blocks only.
3083 : *
3084 : * If, based on the above, there's anything to write we do so immediately. But
3085             : * If, based on the above, there's anything to write, we do so immediately. But
3086             : * to avoid calling fsync, fdatasync, et al. at a rate that'd impact
3087 : * more than wal_writer_flush_after unflushed blocks.
3088 : *
3089 : * We can guarantee that async commits reach disk after at most three
3090 : * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
3091 : * to write "flexibly", meaning it can stop at the end of the buffer ring;
3092 : * this makes a difference only with very high load or long wal_writer_delay,
3093             : * but imposes one extra cycle in the worst case for async commits.)
3094 : *
3095 : * This routine is invoked periodically by the background walwriter process.
3096 : *
3097 : * Returns true if there was any work to do, even if we skipped flushing due
3098 : * to wal_writer_delay/wal_writer_flush_after.
3099 : */
3100 : bool
3101 36370 : XLogBackgroundFlush(void)
3102 : {
3103 : XLogwrtRqst WriteRqst;
3104 36370 : bool flexible = true;
3105 : static TimestampTz lastflush;
3106 : TimestampTz now;
3107 : int flushblocks;
3108 : TimeLineID insertTLI;
3109 :
3110 : /* XLOG doesn't need flushing during recovery */
3111 36370 : if (RecoveryInProgress())
3112 16 : return false;
3113 :
3114 : /*
3115 : * Since we're not in recovery, InsertTimeLineID is set and can't change,
3116 : * so we can read it without a lock.
3117 : */
3118 36354 : insertTLI = XLogCtl->InsertTimeLineID;
3119 :
3120 : /* read updated LogwrtRqst */
3121 36354 : SpinLockAcquire(&XLogCtl->info_lck);
3122 36354 : WriteRqst = XLogCtl->LogwrtRqst;
3123 36354 : SpinLockRelease(&XLogCtl->info_lck);
3124 :
3125 : /* back off to last completed page boundary */
3126 36354 : WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
3127 :
3128 : /* if we have already flushed that far, consider async commit records */
3129 36354 : RefreshXLogWriteResult(LogwrtResult);
3130 36354 : if (WriteRqst.Write <= LogwrtResult.Flush)
3131 : {
3132 28832 : SpinLockAcquire(&XLogCtl->info_lck);
3133 28832 : WriteRqst.Write = XLogCtl->asyncXactLSN;
3134 28832 : SpinLockRelease(&XLogCtl->info_lck);
3135 28832 : flexible = false; /* ensure it all gets written */
3136 : }
3137 :
3138 : /*
3139 : * If already known flushed, we're done. Just need to check if we are
3140 : * holding an open file handle to a logfile that's no longer in use,
3141 : * preventing the file from being deleted.
3142 : */
3143 36354 : if (WriteRqst.Write <= LogwrtResult.Flush)
3144 : {
3145 27498 : if (openLogFile >= 0)
3146 : {
3147 13866 : if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
3148 : wal_segment_size))
3149 : {
3150 268 : XLogFileClose();
3151 : }
3152 : }
3153 27498 : return false;
3154 : }
3155 :
3156 : /*
3157 : * Determine how far to flush WAL, based on the wal_writer_delay and
3158 : * wal_writer_flush_after GUCs.
3159 : *
3160 : * Note that XLogSetAsyncXactLSN() performs similar calculation based on
3161 : * wal_writer_flush_after, to decide when to wake us up. Make sure the
3162 : * logic is the same in both places if you change this.
3163 : */
3164 8856 : now = GetCurrentTimestamp();
3165 8856 : flushblocks =
3166 8856 : WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
3167 :
3168 8856 : if (WalWriterFlushAfter == 0 || lastflush == 0)
3169 : {
3170 : /* first call, or block based limits disabled */
3171 460 : WriteRqst.Flush = WriteRqst.Write;
3172 460 : lastflush = now;
3173 : }
3174 8396 : else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
3175 : {
3176 : /*
3177 : * Flush the writes at least every WalWriterDelay ms. This is
3178 : * important to bound the amount of time it takes for an asynchronous
3179 : * commit to hit disk.
3180 : */
3181 8120 : WriteRqst.Flush = WriteRqst.Write;
3182 8120 : lastflush = now;
3183 : }
3184 276 : else if (flushblocks >= WalWriterFlushAfter)
3185 : {
3186 : /* exceeded wal_writer_flush_after blocks, flush */
3187 258 : WriteRqst.Flush = WriteRqst.Write;
3188 258 : lastflush = now;
3189 : }
3190 : else
3191 : {
3192 : /* no flushing, this time round */
3193 18 : WriteRqst.Flush = 0;
3194 : }
3195 :
3196 : #ifdef WAL_DEBUG
3197 : if (XLOG_DEBUG)
3198 : elog(LOG, "xlog bg flush request write %X/%08X; flush: %X/%08X, current is write %X/%08X; flush %X/%08X",
3199 : LSN_FORMAT_ARGS(WriteRqst.Write),
3200 : LSN_FORMAT_ARGS(WriteRqst.Flush),
3201 : LSN_FORMAT_ARGS(LogwrtResult.Write),
3202 : LSN_FORMAT_ARGS(LogwrtResult.Flush));
3203 : #endif
3204 :
3205 8856 : START_CRIT_SECTION();
3206 :
3207 : /* now wait for any in-progress insertions to finish and get write lock */
3208 8856 : WaitXLogInsertionsToFinish(WriteRqst.Write);
3209 8856 : LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
3210 8856 : RefreshXLogWriteResult(LogwrtResult);
3211 8856 : if (WriteRqst.Write > LogwrtResult.Write ||
3212 420 : WriteRqst.Flush > LogwrtResult.Flush)
3213 : {
3214 8492 : XLogWrite(WriteRqst, insertTLI, flexible);
3215 : }
3216 8856 : LWLockRelease(WALWriteLock);
3217 :
3218 8856 : END_CRIT_SECTION();
3219 :
3220 : /* wake up walsenders now that we've released heavily contended locks */
3221 8856 : WalSndWakeupProcessRequests(true, !RecoveryInProgress());
3222 :
3223 : /*
3224 : * Great, done. To take some work off the critical path, try to initialize
3225 : * as many of the no-longer-needed WAL buffers for future use as we can.
3226 : */
3227 8856 : AdvanceXLInsertBuffer(InvalidXLogRecPtr, insertTLI, true);
3228 :
3229 : /*
3230             : * Even if we determined that data needed writing but somebody else
3231             : * wrote/flushed it already, report this cycle as active, to avoid
3232             : * hibernating too early.
3233 : */
3234 8856 : return true;
3235 : }
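/*
 * Illustrative sketch (not part of xlog.c): how the walwriter can drive
 * XLogBackgroundFlush. The real loop lives in walwriter.c; this reduced
 * version only shows the boolean result feeding a hibernation counter.
 * The threshold of 50 idle cycles is an assumption for illustration.
 */
static void
example_walwriter_loop(void)
{
	int			left_till_hibernate = 50;

	for (;;)
	{
		if (XLogBackgroundFlush())
			left_till_hibernate = 50;	/* there was work: stay awake */
		else if (left_till_hibernate > 0)
			left_till_hibernate--;		/* idle cycle */

		/* pg_usleep takes microseconds; WalWriterDelay is in ms */
		pg_usleep(WalWriterDelay * 1000L);
	}
}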
3236 :
3237 : /*
3238 : * Test whether XLOG data has been flushed up to (at least) the given position.
3239 : *
3240 : * Returns true if a flush is still needed. (It may be that someone else
3241             : * is already in the process of flushing that far, however.)
3242 : */
3243 : bool
3244 17402854 : XLogNeedsFlush(XLogRecPtr record)
3245 : {
3246 : /*
3247 : * During recovery, we don't flush WAL but update minRecoveryPoint
3248 : * instead. So "needs flush" is taken to mean whether minRecoveryPoint
3249 : * would need to be updated.
3250 : */
3251 17402854 : if (RecoveryInProgress())
3252 : {
3253 : /*
3254 : * An invalid minRecoveryPoint means that we need to recover all the
3255 : * WAL, i.e., we're doing crash recovery. We never modify the control
3256 : * file's value in that case, so we can short-circuit future checks
3257 : * here too. This triggers a quick exit path for the startup process,
3258             : * which cannot update its local copy of minRecoveryPoint until
3259             : * it has replayed all available WAL during crash recovery.
3260 : */
3261 1273858 : if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery)
3262 0 : updateMinRecoveryPoint = false;
3263 :
3264 : /* Quick exit if already known to be updated or cannot be updated */
3265 1273858 : if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
3266 1250976 : return false;
3267 :
3268 : /*
3269 : * Update local copy of minRecoveryPoint. But if the lock is busy,
3270 : * just return a conservative guess.
3271 : */
3272 22882 : if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
3273 0 : return true;
3274 22882 : LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
3275 22882 : LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3276 22882 : LWLockRelease(ControlFileLock);
3277 :
3278 : /*
3279 : * Check minRecoveryPoint for any other process than the startup
3280 : * process doing crash recovery, which should not update the control
3281 : * file value if crash recovery is still running.
3282 : */
3283 22882 : if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint))
3284 0 : updateMinRecoveryPoint = false;
3285 :
3286 : /* check again */
3287 22882 : if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
3288 160 : return false;
3289 : else
3290 22722 : return true;
3291 : }
3292 :
3293 : /* Quick exit if already known flushed */
3294 16128996 : if (record <= LogwrtResult.Flush)
3295 15689822 : return false;
3296 :
3297 : /* read LogwrtResult and update local state */
3298 439174 : RefreshXLogWriteResult(LogwrtResult);
3299 :
3300 : /* check again */
3301 439174 : if (record <= LogwrtResult.Flush)
3302 6320 : return false;
3303 :
3304 432854 : return true;
3305 : }
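/*
 * Illustrative sketch (not part of xlog.c): a background writer can use
 * XLogNeedsFlush to prefer evicting dirty buffers that will not force a
 * WAL flush. 'page_lsn' stands in for a candidate buffer's page LSN (an
 * assumption for illustration).
 */
static bool
example_cheap_to_evict(XLogRecPtr page_lsn)
{
	/*
	 * If WAL up to the page LSN is already flushed (or, in recovery,
	 * minRecoveryPoint already covers it), writing the buffer does not
	 * have to wait for a WAL fsync.
	 */
	return !XLogNeedsFlush(page_lsn);
}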
3306 :
3307 : /*
3308 : * Try to make a given XLOG file segment exist.
3309 : *
3310 : * logsegno: identify segment.
3311 : *
3312 : * *added: on return, true if this call raised the number of extant segments.
3313 : *
3314 : * path: on return, this char[MAXPGPATH] has the path to the logsegno file.
3315 : *
3316 : * Returns -1 or FD of opened file. A -1 here is not an error; a caller
3317 : * wanting an open segment should attempt to open "path", which usually will
3318 : * succeed. (This is weird, but it's efficient for the callers.)
3319 : */
3320 : static int
3321 28432 : XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
3322 : bool *added, char *path)
3323 : {
3324 : char tmppath[MAXPGPATH];
3325 : XLogSegNo installed_segno;
3326 : XLogSegNo max_segno;
3327 : int fd;
3328 : int save_errno;
3329 28432 : int open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY;
3330 : instr_time io_start;
3331 :
3332 : Assert(logtli != 0);
3333 :
3334 28432 : XLogFilePath(path, logtli, logsegno, wal_segment_size);
3335 :
3336 : /*
3337 : * Try to use existent file (checkpoint maker may have created it already)
3338 : */
3339 28432 : *added = false;
3340 28432 : fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
3341 28432 : get_sync_bit(wal_sync_method));
3342 28432 : if (fd < 0)
3343 : {
3344 2956 : if (errno != ENOENT)
3345 0 : ereport(ERROR,
3346 : (errcode_for_file_access(),
3347 : errmsg("could not open file \"%s\": %m", path)));
3348 : }
3349 : else
3350 25476 : return fd;
3351 :
3352 : /*
3353 : * Initialize an empty (all zeroes) segment. NOTE: it is possible that
3354 : * another process is doing the same thing. If so, we will end up
3355 : * pre-creating an extra log segment. That seems OK, and better than
3356 : * holding the lock throughout this lengthy process.
3357 : */
3358 2956 : elog(DEBUG2, "creating and filling new WAL file");
3359 :
3360 2956 : snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3361 :
3362 2956 : unlink(tmppath);
3363 :
3364 2956 : if (io_direct_flags & IO_DIRECT_WAL_INIT)
3365 0 : open_flags |= PG_O_DIRECT;
3366 :
3367 : /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3368 2956 : fd = BasicOpenFile(tmppath, open_flags);
3369 2956 : if (fd < 0)
3370 0 : ereport(ERROR,
3371 : (errcode_for_file_access(),
3372 : errmsg("could not create file \"%s\": %m", tmppath)));
3373 :
3374 : /* Measure I/O timing when initializing segment */
3375 2956 : io_start = pgstat_prepare_io_time(track_wal_io_timing);
3376 :
3377 2956 : pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
3378 2956 : save_errno = 0;
3379 2956 : if (wal_init_zero)
3380 : {
3381 : ssize_t rc;
3382 :
3383 : /*
3384 : * Zero-fill the file. With this setting, we do this the hard way to
3385 : * ensure that all the file space has really been allocated. On
3386 : * platforms that allow "holes" in files, just seeking to the end
3387 : * doesn't allocate intermediate space. This way, we know that we
3388 : * have all the space and (after the fsync below) that all the
3389 : * indirect blocks are down on disk. Therefore, fdatasync(2) or
3390 : * O_DSYNC will be sufficient to sync future writes to the log file.
3391 : */
3392 2956 : rc = pg_pwrite_zeros(fd, wal_segment_size, 0);
3393 :
3394 2956 : if (rc < 0)
3395 0 : save_errno = errno;
3396 : }
3397 : else
3398 : {
3399 : /*
3400 : * Otherwise, seeking to the end and writing a solitary byte is
3401 : * enough.
3402 : */
3403 0 : errno = 0;
3404 0 : if (pg_pwrite(fd, "\0", 1, wal_segment_size - 1) != 1)
3405 : {
3406 : /* if write didn't set errno, assume no disk space */
3407 0 : save_errno = errno ? errno : ENOSPC;
3408 : }
3409 : }
3410 2956 : pgstat_report_wait_end();
3411 :
3412 : /*
3413 : * A full segment worth of data is written when using wal_init_zero. One
3414 : * byte is written when not using it.
3415 : */
3416 2956 : pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT, IOOP_WRITE,
3417 : io_start, 1,
3418 2956 : wal_init_zero ? wal_segment_size : 1);
3419 :
3420 2956 : if (save_errno)
3421 : {
3422 : /*
3423 : * If we fail to make the file, delete it to release disk space
3424 : */
3425 0 : unlink(tmppath);
3426 :
3427 0 : close(fd);
3428 :
3429 0 : errno = save_errno;
3430 :
3431 0 : ereport(ERROR,
3432 : (errcode_for_file_access(),
3433 : errmsg("could not write to file \"%s\": %m", tmppath)));
3434 : }
3435 :
3436 : /* Measure I/O timing when flushing segment */
3437 2956 : io_start = pgstat_prepare_io_time(track_wal_io_timing);
3438 :
3439 2956 : pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
3440 2956 : if (pg_fsync(fd) != 0)
3441 : {
3442 0 : save_errno = errno;
3443 0 : close(fd);
3444 0 : errno = save_errno;
3445 0 : ereport(ERROR,
3446 : (errcode_for_file_access(),
3447 : errmsg("could not fsync file \"%s\": %m", tmppath)));
3448 : }
3449 2956 : pgstat_report_wait_end();
3450 :
3451 2956 : pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT,
3452 : IOOP_FSYNC, io_start, 1, 0);
3453 :
3454 2956 : if (close(fd) != 0)
3455 0 : ereport(ERROR,
3456 : (errcode_for_file_access(),
3457 : errmsg("could not close file \"%s\": %m", tmppath)));
3458 :
3459 : /*
3460 : * Now move the segment into place with its final name. Cope with
3461             : * the possibility that someone else has created the file while we were
3462 : * filling ours: if so, use ours to pre-create a future log segment.
3463 : */
3464 2956 : installed_segno = logsegno;
3465 :
3466 : /*
3467 : * XXX: What should we use as max_segno? We used to use XLOGfileslop when
3468 : * that was a constant, but that was always a bit dubious: normally, at a
3469 : * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
3470 : * here, it was the offset from the insert location. We can't do the
3471 : * normal XLOGfileslop calculation here because we don't have access to
3472 : * the prior checkpoint's redo location. So somewhat arbitrarily, just use
3473 : * CheckPointSegments.
3474 : */
3475 2956 : max_segno = logsegno + CheckPointSegments;
3476 2956 : if (InstallXLogFileSegment(&installed_segno, tmppath, true, max_segno,
3477 : logtli))
3478 : {
3479 2956 : *added = true;
3480 2956 : elog(DEBUG2, "done creating and filling new WAL file");
3481 : }
3482 : else
3483 : {
3484 : /*
3485 : * No need for any more future segments, or InstallXLogFileSegment()
3486 : * failed to rename the file into place. If the rename failed, a
3487 : * caller opening the file may fail.
3488 : */
3489 0 : unlink(tmppath);
3490 0 : elog(DEBUG2, "abandoned new WAL file");
3491 : }
3492 :
3493 2956 : return -1;
3494 : }
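/*
 * Illustrative sketch (not part of xlog.c): the two allocation strategies
 * above, reduced to plain POSIX calls. Zero-filling physically allocates
 * every block, so a later fdatasync() need not write file metadata; the
 * write-one-trailing-byte variant may leave the file sparse. 'segsize' is
 * assumed to be the WAL segment size; segment sizes are powers of two, so
 * the 8 kB chunks divide evenly.
 */
static int
example_allocate_segment(int fd, size_t segsize, bool init_zero)
{
	if (init_zero)
	{
		/* write zeros over the whole file, as pg_pwrite_zeros() does */
		static const char zeros[8192];
		size_t		written = 0;

		while (written < segsize)
		{
			ssize_t		rc = pwrite(fd, zeros, sizeof(zeros), written);

			if (rc < 0)
				return -1;
			written += rc;
		}
	}
	else
	{
		/* extend to full length by writing a single trailing byte */
		if (pwrite(fd, "\0", 1, segsize - 1) != 1)
			return -1;
	}
	return fsync(fd);
}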
3495 :
3496 : /*
3497 : * Create a new XLOG file segment, or open a pre-existing one.
3498 : *
3499 : * logsegno: identify segment to be created/opened.
3500 : *
3501 : * Returns FD of opened file.
3502 : *
3503 : * Note: errors here are ERROR not PANIC because we might or might not be
3504 : * inside a critical section (eg, during checkpoint there is no reason to
3505 : * take down the system on failure). They will promote to PANIC if we are
3506 : * in a critical section.
3507 : */
3508 : int
3509 27958 : XLogFileInit(XLogSegNo logsegno, TimeLineID logtli)
3510 : {
3511 : bool ignore_added;
3512 : char path[MAXPGPATH];
3513 : int fd;
3514 :
3515 : Assert(logtli != 0);
3516 :
3517 27958 : fd = XLogFileInitInternal(logsegno, logtli, &ignore_added, path);
3518 27958 : if (fd >= 0)
3519 25262 : return fd;
3520 :
3521 : /* Now open original target segment (might not be file I just made) */
3522 2696 : fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
3523 2696 : get_sync_bit(wal_sync_method));
3524 2696 : if (fd < 0)
3525 0 : ereport(ERROR,
3526 : (errcode_for_file_access(),
3527 : errmsg("could not open file \"%s\": %m", path)));
3528 2696 : return fd;
3529 : }
3530 :
3531 : /*
3532 : * Create a new XLOG file segment by copying a pre-existing one.
3533 : *
3534 : * destsegno: identify segment to be created.
3535 : *
3536 : * srcTLI, srcsegno: identify segment to be copied (could be from
3537 : * a different timeline)
3538 : *
3539 : * upto: how much of the source file to copy (the rest is filled with
3540 : * zeros)
3541 : *
3542 : * Currently this is only used during recovery, and so there are no locking
3543             : * considerations. But we should be just as careful as XLogFileInit to
3544             : * avoid installing a bogus file.
3545 : */
3546 : static void
3547 80 : XLogFileCopy(TimeLineID destTLI, XLogSegNo destsegno,
3548 : TimeLineID srcTLI, XLogSegNo srcsegno,
3549 : int upto)
3550 : {
3551 : char path[MAXPGPATH];
3552 : char tmppath[MAXPGPATH];
3553 : PGAlignedXLogBlock buffer;
3554 : int srcfd;
3555 : int fd;
3556 : int nbytes;
3557 :
3558 : /*
3559 : * Open the source file
3560 : */
3561 80 : XLogFilePath(path, srcTLI, srcsegno, wal_segment_size);
3562 80 : srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
3563 80 : if (srcfd < 0)
3564 0 : ereport(ERROR,
3565 : (errcode_for_file_access(),
3566 : errmsg("could not open file \"%s\": %m", path)));
3567 :
3568 : /*
3569 : * Copy into a temp file name.
3570 : */
3571 80 : snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3572 :
3573 80 : unlink(tmppath);
3574 :
3575 : /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3576 80 : fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
3577 80 : if (fd < 0)
3578 0 : ereport(ERROR,
3579 : (errcode_for_file_access(),
3580 : errmsg("could not create file \"%s\": %m", tmppath)));
3581 :
3582 : /*
3583 : * Do the data copying.
3584 : */
3585 163920 : for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer))
3586 : {
3587 : int nread;
3588 :
3589 163840 : nread = upto - nbytes;
3590 :
3591 : /*
3592 : * The part that is not read from the source file is filled with
3593 : * zeros.
3594 : */
3595 163840 : if (nread < sizeof(buffer))
3596 80 : memset(buffer.data, 0, sizeof(buffer));
3597 :
3598 163840 : if (nread > 0)
3599 : {
3600 : int r;
3601 :
3602 5386 : if (nread > sizeof(buffer))
3603 5306 : nread = sizeof(buffer);
3604 5386 : pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
3605 5386 : r = read(srcfd, buffer.data, nread);
3606 5386 : if (r != nread)
3607 : {
3608 0 : if (r < 0)
3609 0 : ereport(ERROR,
3610 : (errcode_for_file_access(),
3611 : errmsg("could not read file \"%s\": %m",
3612 : path)));
3613 : else
3614 0 : ereport(ERROR,
3615 : (errcode(ERRCODE_DATA_CORRUPTED),
3616 : errmsg("could not read file \"%s\": read %d of %zu",
3617 : path, r, (Size) nread)));
3618 : }
3619 5386 : pgstat_report_wait_end();
3620 : }
3621 163840 : errno = 0;
3622 163840 : pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
3623 163840 : if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer))
3624 : {
3625 0 : int save_errno = errno;
3626 :
3627 : /*
3628 : * If we fail to make the file, delete it to release disk space
3629 : */
3630 0 : unlink(tmppath);
3631 : /* if write didn't set errno, assume problem is no disk space */
3632 0 : errno = save_errno ? save_errno : ENOSPC;
3633 :
3634 0 : ereport(ERROR,
3635 : (errcode_for_file_access(),
3636 : errmsg("could not write to file \"%s\": %m", tmppath)));
3637 : }
3638 163840 : pgstat_report_wait_end();
3639 : }
3640 :
3641 80 : pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
3642 80 : if (pg_fsync(fd) != 0)
3643 0 : ereport(data_sync_elevel(ERROR),
3644 : (errcode_for_file_access(),
3645 : errmsg("could not fsync file \"%s\": %m", tmppath)));
3646 80 : pgstat_report_wait_end();
3647 :
3648 80 : if (CloseTransientFile(fd) != 0)
3649 0 : ereport(ERROR,
3650 : (errcode_for_file_access(),
3651 : errmsg("could not close file \"%s\": %m", tmppath)));
3652 :
3653 80 : if (CloseTransientFile(srcfd) != 0)
3654 0 : ereport(ERROR,
3655 : (errcode_for_file_access(),
3656 : errmsg("could not close file \"%s\": %m", path)));
3657 :
3658 : /*
3659 : * Now move the segment into place with its final name.
3660 : */
3661 80 : if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, destTLI))
3662 0 : elog(ERROR, "InstallXLogFileSegment should not have failed");
3663 80 : }
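/*
 * Worked example (not part of xlog.c): with the default 8 kB XLOG block
 * size and upto = 12 kB, the copy loop above reads two chunks from the
 * source (a full 8 kB chunk, then 4 kB zero-padded to a full buffer) and
 * then writes all-zero buffers for every remaining chunk, always emitting
 * full 8 kB writes until the segment length is reached.
 */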
3664 :
3665 : /*
3666 : * Install a new XLOG segment file as a current or future log segment.
3667 : *
3668 : * This is used both to install a newly-created segment (which has a temp
3669 : * filename while it's being created) and to recycle an old segment.
3670 : *
3671 : * *segno: identify segment to install as (or first possible target).
3672 : * When find_free is true, this is modified on return to indicate the
3673 : * actual installation location or last segment searched.
3674 : *
3675 : * tmppath: initial name of file to install. It will be renamed into place.
3676 : *
3677 : * find_free: if true, install the new segment at the first empty segno
3678 : * number at or after the passed numbers. If false, install the new segment
3679 : * exactly where specified, deleting any existing segment file there.
3680 : *
3681 : * max_segno: maximum segment number to install the new file as. Fail if no
3682 : * free slot is found between *segno and max_segno. (Ignored when find_free
3683 : * is false.)
3684 : *
3685 : * tli: The timeline on which the new segment should be installed.
3686 : *
3687 : * Returns true if the file was installed successfully. false indicates that
3688 : * max_segno limit was exceeded, the startup process has disabled this
3689 : * function for now, or an error occurred while renaming the file into place.
3690 : */
3691 : static bool
3692 5750 : InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
3693 : bool find_free, XLogSegNo max_segno, TimeLineID tli)
3694 : {
3695 : char path[MAXPGPATH];
3696 : struct stat stat_buf;
3697 :
3698 : Assert(tli != 0);
3699 :
3700 5750 : XLogFilePath(path, tli, *segno, wal_segment_size);
3701 :
3702 5750 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3703 5750 : if (!XLogCtl->InstallXLogFileSegmentActive)
3704 : {
3705 0 : LWLockRelease(ControlFileLock);
3706 0 : return false;
3707 : }
3708 :
3709 5750 : if (!find_free)
3710 : {
3711 : /* Force installation: get rid of any pre-existing segment file */
3712 80 : durable_unlink(path, DEBUG1);
3713 : }
3714 : else
3715 : {
3716 : /* Find a free slot to put it in */
3717 8198 : while (stat(path, &stat_buf) == 0)
3718 : {
3719 2626 : if ((*segno) >= max_segno)
3720 : {
3721 : /* Failed to find a free slot within specified range */
3722 98 : LWLockRelease(ControlFileLock);
3723 98 : return false;
3724 : }
3725 2528 : (*segno)++;
3726 2528 : XLogFilePath(path, tli, *segno, wal_segment_size);
3727 : }
3728 : }
3729 :
3730 : Assert(access(path, F_OK) != 0 && errno == ENOENT);
3731 5652 : if (durable_rename(tmppath, path, LOG) != 0)
3732 : {
3733 0 : LWLockRelease(ControlFileLock);
3734 : /* durable_rename already emitted log message */
3735 0 : return false;
3736 : }
3737 :
3738 5652 : LWLockRelease(ControlFileLock);
3739 :
3740 5652 : return true;
3741 : }
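/*
 * Worked example (not part of xlog.c): suppose *segno = 5, max_segno = 8,
 * and segments 5 and 6 already exist. With find_free = true, the loop
 * stats 5, then 6 (both exist), then installs the file as segment 7 and
 * returns with *segno = 7. Had segments 5 through 8 all existed, the
 * function would instead release the lock and return false.
 */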
3742 :
3743 : /*
3744 : * Open a pre-existing logfile segment for writing.
3745 : */
3746 : int
3747 64 : XLogFileOpen(XLogSegNo segno, TimeLineID tli)
3748 : {
3749 : char path[MAXPGPATH];
3750 : int fd;
3751 :
3752 64 : XLogFilePath(path, tli, segno, wal_segment_size);
3753 :
3754 64 : fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
3755 64 : get_sync_bit(wal_sync_method));
3756 64 : if (fd < 0)
3757 0 : ereport(PANIC,
3758 : (errcode_for_file_access(),
3759 : errmsg("could not open file \"%s\": %m", path)));
3760 :
3761 64 : return fd;
3762 : }
3763 :
3764 : /*
3765 : * Close the current logfile segment for writing.
3766 : */
3767 : static void
3768 12216 : XLogFileClose(void)
3769 : {
3770 : Assert(openLogFile >= 0);
3771 :
3772 : /*
3773 : * WAL segment files will not be re-read in normal operation, so we advise
3774 : * the OS to release any cached pages. But do not do so if WAL archiving
3775             : * or streaming is active, because the archiver and walsender processes could
3776 : * use the cache to read the WAL segment.
3777 : */
3778 : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
3779 12216 : if (!XLogIsNeeded() && (io_direct_flags & IO_DIRECT_WAL) == 0)
3780 2778 : (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
3781 : #endif
3782 :
3783 12216 : if (close(openLogFile) != 0)
3784 : {
3785 : char xlogfname[MAXFNAMELEN];
3786 0 : int save_errno = errno;
3787 :
3788 0 : XLogFileName(xlogfname, openLogTLI, openLogSegNo, wal_segment_size);
3789 0 : errno = save_errno;
3790 0 : ereport(PANIC,
3791 : (errcode_for_file_access(),
3792 : errmsg("could not close file \"%s\": %m", xlogfname)));
3793 : }
3794 :
3795 12216 : openLogFile = -1;
3796 12216 : ReleaseExternalFD();
3797 12216 : }
3798 :
3799 : /*
3800 : * Preallocate log files beyond the specified log endpoint.
3801 : *
3802 : * XXX this is currently extremely conservative, since it forces only one
3803 : * future log segment to exist, and even that only if we are 75% done with
3804 : * the current one. This is only appropriate for very low-WAL-volume systems.
3805 : * High-volume systems will be OK once they've built up a sufficient set of
3806 : * recycled log segments, but the startup transient is likely to include
3807 : * a lot of segment creations by foreground processes, which is not so good.
3808 : *
3809 : * XLogFileInitInternal() can ereport(ERROR). All known causes indicate big
3810 : * trouble; for example, a full filesystem is one cause. The checkpoint WAL
3811 : * and/or ControlFile updates already completed. If a RequestCheckpoint()
3812 : * initiated the present checkpoint and an ERROR ends this function, the
3813 : * command that called RequestCheckpoint() fails. That's not ideal, but it's
3814 : * not worth contorting more functions to use caller-specified elevel values.
3815 : * (With or without RequestCheckpoint(), an ERROR forestalls some inessential
3816 : * reporting and resource reclamation.)
3817 : */
3818 : static void
3819 3892 : PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli)
3820 : {
3821 : XLogSegNo _logSegNo;
3822 : int lf;
3823 : bool added;
3824 : char path[MAXPGPATH];
3825 : uint64 offset;
3826 :
3827 3892 : if (!XLogCtl->InstallXLogFileSegmentActive)
3828 20 : return; /* unlocked check says no */
3829 :
3830 3872 : XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
3831 3872 : offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
3832 3872 : if (offset >= (uint32) (0.75 * wal_segment_size))
3833 : {
3834 474 : _logSegNo++;
3835 474 : lf = XLogFileInitInternal(_logSegNo, tli, &added, path);
3836 474 : if (lf >= 0)
3837 214 : close(lf);
3838 474 : if (added)
3839 260 : CheckpointStats.ckpt_segs_added++;
3840 : }
3841 : }
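/*
 * Worked example (not part of xlog.c): with the default 16 MB segment
 * size, the 75% threshold above is 12 MB. A checkpoint whose end pointer
 * sits 13 MB into its segment therefore pre-creates the next segment,
 * while one sitting at 10 MB does not.
 */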
3842 :
3843 : /*
3844 : * Throws an error if the given log segment has already been removed or
3845 : * recycled. The caller should only pass a segment that it knows to have
3846 : * existed while the server has been running, as this function always
3847 : * succeeds if no WAL segments have been removed since startup.
3848 : * 'tli' is only used in the error message.
3849 : *
3850 : * Note: this function guarantees to keep errno unchanged on return.
3851 : * This supports callers that use this to possibly deliver a better
3852 : * error message about a missing file, while still being able to throw
3853 : * a normal file-access error afterwards, if this does return.
3854 : */
3855 : void
3856 242002 : CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3857 : {
3858 242002 : int save_errno = errno;
3859 : XLogSegNo lastRemovedSegNo;
3860 :
3861 242002 : SpinLockAcquire(&XLogCtl->info_lck);
3862 242002 : lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3863 242002 : SpinLockRelease(&XLogCtl->info_lck);
3864 :
3865 242002 : if (segno <= lastRemovedSegNo)
3866 : {
3867 : char filename[MAXFNAMELEN];
3868 :
3869 0 : XLogFileName(filename, tli, segno, wal_segment_size);
3870 0 : errno = save_errno;
3871 0 : ereport(ERROR,
3872 : (errcode_for_file_access(),
3873 : errmsg("requested WAL segment %s has already been removed",
3874 : filename)));
3875 : }
3876 242002 : errno = save_errno;
3877 242002 : }
3878 :
3879 : /*
3880 : * Return the last WAL segment removed, or 0 if no segment has been removed
3881 : * since startup.
3882 : *
3883             : * NB: the result can be out of date arbitrarily fast; the caller has to deal
3884 : * with that.
3885 : */
3886 : XLogSegNo
3887 2106 : XLogGetLastRemovedSegno(void)
3888 : {
3889 : XLogSegNo lastRemovedSegNo;
3890 :
3891 2106 : SpinLockAcquire(&XLogCtl->info_lck);
3892 2106 : lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3893 2106 : SpinLockRelease(&XLogCtl->info_lck);
3894 :
3895 2106 : return lastRemovedSegNo;
3896 : }
3897 :
3898 : /*
3899 : * Return the oldest WAL segment on the given TLI that still exists in
3900 : * XLOGDIR, or 0 if none.
3901 : */
3902 : XLogSegNo
3903 10 : XLogGetOldestSegno(TimeLineID tli)
3904 : {
3905 : DIR *xldir;
3906 : struct dirent *xlde;
3907 10 : XLogSegNo oldest_segno = 0;
3908 :
3909 10 : xldir = AllocateDir(XLOGDIR);
3910 68 : while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3911 : {
3912 : TimeLineID file_tli;
3913 : XLogSegNo file_segno;
3914 :
3915 : /* Ignore files that are not XLOG segments. */
3916 58 : if (!IsXLogFileName(xlde->d_name))
3917 40 : continue;
3918 :
3919 : /* Parse filename to get TLI and segno. */
3920 18 : XLogFromFileName(xlde->d_name, &file_tli, &file_segno,
3921 : wal_segment_size);
3922 :
3923 : /* Ignore anything that's not from the TLI of interest. */
3924 18 : if (tli != file_tli)
3925 0 : continue;
3926 :
3927 : /* If it's the oldest so far, update oldest_segno. */
3928 18 : if (oldest_segno == 0 || file_segno < oldest_segno)
3929 16 : oldest_segno = file_segno;
3930 : }
3931 :
3932 10 : FreeDir(xldir);
3933 10 : return oldest_segno;
3934 : }
3935 :
3936 : /*
3937 : * Update the last removed segno pointer in shared memory, to reflect that the
3938 : * given XLOG file has been removed.
3939 : */
3940 : static void
3941 5056 : UpdateLastRemovedPtr(char *filename)
3942 : {
3943 : uint32 tli;
3944 : XLogSegNo segno;
3945 :
3946 5056 : XLogFromFileName(filename, &tli, &segno, wal_segment_size);
3947 :
3948 5056 : SpinLockAcquire(&XLogCtl->info_lck);
3949 5056 : if (segno > XLogCtl->lastRemovedSegNo)
3950 2104 : XLogCtl->lastRemovedSegNo = segno;
3951 5056 : SpinLockRelease(&XLogCtl->info_lck);
3952 5056 : }
3953 :
3954 : /*
3955 : * Remove all temporary log files in pg_wal
3956 : *
3957 : * This is called at the beginning of recovery after a previous crash,
3958 : * at a point where no other processes write fresh WAL data.
3959 : */
3960 : static void
3961 350 : RemoveTempXlogFiles(void)
3962 : {
3963 : DIR *xldir;
3964 : struct dirent *xlde;
3965 :
3966 350 : elog(DEBUG2, "removing all temporary WAL segments");
3967 :
3968 350 : xldir = AllocateDir(XLOGDIR);
3969 2382 : while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3970 : {
3971 : char path[MAXPGPATH];
3972 :
3973 2032 : if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0)
3974 2032 : continue;
3975 :
3976 0 : snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3977 0 : unlink(path);
3978 0 : elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
3979 : }
3980 350 : FreeDir(xldir);
3981 350 : }
3982 :
3983 : /*
3984 : * Recycle or remove all log files older or equal to passed segno.
3985 : *
3986 : * endptr is current (or recent) end of xlog, and lastredoptr is the
3987 : * redo pointer of the last checkpoint. These are used to determine
3988 : * whether we want to recycle rather than delete no-longer-wanted log files.
3989 : *
3990 : * insertTLI is the current timeline for XLOG insertion. Any recycled
3991 : * segments should be reused for this timeline.
3992 : */
3993 : static void
3994 3362 : RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr,
3995 : TimeLineID insertTLI)
3996 : {
3997 : DIR *xldir;
3998 : struct dirent *xlde;
3999 : char lastoff[MAXFNAMELEN];
4000 : XLogSegNo endlogSegNo;
4001 : XLogSegNo recycleSegNo;
4002 :
4003 : /* Initialize info about where to try to recycle to */
4004 3362 : XLByteToSeg(endptr, endlogSegNo, wal_segment_size);
4005 3362 : recycleSegNo = XLOGfileslop(lastredoptr);
4006 :
4007 : /*
4008 : * Construct a filename of the last segment to be kept. The timeline ID
4009 : * doesn't matter, we ignore that in the comparison. (During recovery,
4010 : * InsertTimeLineID isn't set, so we can't use that.)
4011 : */
4012 3362 : XLogFileName(lastoff, 0, segno, wal_segment_size);
4013 :
4014 3362 : elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
4015 : lastoff);
4016 :
4017 3362 : xldir = AllocateDir(XLOGDIR);
4018 :
4019 81276 : while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4020 : {
4021 : /* Ignore files that are not XLOG segments */
4022 77914 : if (!IsXLogFileName(xlde->d_name) &&
4023 14528 : !IsPartialXLogFileName(xlde->d_name))
4024 14520 : continue;
4025 :
4026 : /*
4027 : * We ignore the timeline part of the XLOG segment identifiers in
4028 : * deciding whether a segment is still needed. This ensures that we
4029 : * won't prematurely remove a segment from a parent timeline. We could
4030 : * probably be a little more proactive about removing segments of
4031 : * non-parent timelines, but that would be a whole lot more
4032 : * complicated.
4033 : *
4034 : * We use the alphanumeric sorting property of the filenames to decide
4035 : * which ones are earlier than the lastoff segment.
4036 : */
4037 63394 : if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
4038 : {
4039 40408 : if (XLogArchiveCheckDone(xlde->d_name))
4040 : {
4041 : /* Update the last removed location in shared memory first */
4042 5056 : UpdateLastRemovedPtr(xlde->d_name);
4043 :
4044 5056 : RemoveXlogFile(xlde, recycleSegNo, &endlogSegNo, insertTLI);
4045 : }
4046 : }
4047 : }
4048 :
4049 3362 : FreeDir(xldir);
4050 3362 : }
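/*
 * Note on the comparison above (illustrative, not part of xlog.c): WAL
 * segment names are 24 hex digits, TTTTTTTTXXXXXXXXYYYYYYYY, where the
 * first 8 digits are the timeline ID and the last 16 encode the segment
 * number. Comparing from d_name + 8 therefore orders files purely by
 * segment number across all timelines: if lastoff ends in
 * 0000000000000005, any file whose last 16 digits are at most
 * 0000000000000005 is a removal candidate, regardless of its first 8
 * (timeline) digits.
 */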
4051 :
4052 : /*
4053 : * Recycle or remove WAL files that are not part of the given timeline's
4054 : * history.
4055 : *
4056 : * This is called during recovery, whenever we switch to follow a new
4057 : * timeline, and at the end of recovery when we create a new timeline. We
4058 : * wouldn't otherwise care about extra WAL files lying in pg_wal, but they
4059 : * might be leftover pre-allocated or recycled WAL segments on the old timeline
4060 : * that we haven't used yet, and contain garbage. If we just leave them in
4061 : * pg_wal, they will eventually be archived, and we can't let that happen.
4062 : * Files that belong to our timeline history are valid, because we have
4063 : * successfully replayed them, but from others we can't be sure.
4064 : *
4065 : * 'switchpoint' is the current point in WAL where we switch to new timeline,
4066 : * and 'newTLI' is the new timeline we switch to.
4067 : */
4068 : void
4069 120 : RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
4070 : {
4071 : DIR *xldir;
4072 : struct dirent *xlde;
4073 : char switchseg[MAXFNAMELEN];
4074 : XLogSegNo endLogSegNo;
4075 : XLogSegNo switchLogSegNo;
4076 : XLogSegNo recycleSegNo;
4077 :
4078 : /*
4079 : * Initialize info about where to begin the work. This will recycle,
4080 : * somewhat arbitrarily, 10 future segments.
4081 : */
4082 120 : XLByteToPrevSeg(switchpoint, switchLogSegNo, wal_segment_size);
4083 120 : XLByteToSeg(switchpoint, endLogSegNo, wal_segment_size);
4084 120 : recycleSegNo = endLogSegNo + 10;
4085 :
4086 : /*
4087 : * Construct a filename of the last segment to be kept.
4088 : */
4089 120 : XLogFileName(switchseg, newTLI, switchLogSegNo, wal_segment_size);
4090 :
4091 120 : elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
4092 : switchseg);
4093 :
4094 120 : xldir = AllocateDir(XLOGDIR);
4095 :
4096 1146 : while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4097 : {
4098 : /* Ignore files that are not XLOG segments */
4099 1026 : if (!IsXLogFileName(xlde->d_name))
4100 636 : continue;
4101 :
4102 : /*
4103 : * Remove files that are on a timeline older than the new one we're
4104 : * switching to, but with a segment number >= the first segment on the
4105 : * new timeline.
4106 : */
4107 390 : if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
4108 254 : strcmp(xlde->d_name + 8, switchseg + 8) > 0)
4109 : {
4110 : /*
4111 : * If the file has already been marked as .ready, however, don't
4112 : * remove it yet. It should be OK to remove it - files that are
4113 : * not part of our timeline history are not required for recovery
4114             : * - but it seems safer to let them be archived and removed later.
4115 : */
4116 34 : if (!XLogArchiveIsReady(xlde->d_name))
4117 34 : RemoveXlogFile(xlde, recycleSegNo, &endLogSegNo, newTLI);
4118 : }
4119 : }
4120 :
4121 120 : FreeDir(xldir);
4122 120 : }
4123 :
4124 : /*
4125 : * Recycle or remove a log file that's no longer needed.
4126 : *
4127 : * segment_de is the dirent structure of the segment to recycle or remove.
4128 : * recycleSegNo is the segment number to recycle up to. endlogSegNo is
4129 : * the segment number of the current (or recent) end of WAL.
4130 : *
4131             : * endlogSegNo gets incremented if the segment is recycled, so that it is
4132             : * not checked again by future callers of this function.
4133 : *
4134 : * insertTLI is the current timeline for XLOG insertion. Any recycled segments
4135 : * should be used for this timeline.
4136 : */
4137 : static void
4138 5090 : RemoveXlogFile(const struct dirent *segment_de,
4139 : XLogSegNo recycleSegNo, XLogSegNo *endlogSegNo,
4140 : TimeLineID insertTLI)
4141 : {
4142 : char path[MAXPGPATH];
4143 : #ifdef WIN32
4144 : char newpath[MAXPGPATH];
4145 : #endif
4146 5090 : const char *segname = segment_de->d_name;
4147 :
4148 5090 : snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);
4149 :
4150 : /*
4151 : * Before deleting the file, see if it can be recycled as a future log
4152 : * segment. Only recycle normal files, because we don't want to recycle
4153 : * symbolic links pointing to a separate archive directory.
4154 : */
4155 5090 : if (wal_recycle &&
4156 5090 : *endlogSegNo <= recycleSegNo &&
4157 6080 : XLogCtl->InstallXLogFileSegmentActive && /* callee rechecks this */
4158 5428 : get_dirent_type(path, segment_de, false, DEBUG2) == PGFILETYPE_REG &&
4159 2714 : InstallXLogFileSegment(endlogSegNo, path,
4160 : true, recycleSegNo, insertTLI))
4161 : {
4162 2616 : ereport(DEBUG2,
4163 : (errmsg_internal("recycled write-ahead log file \"%s\"",
4164 : segname)));
4165 2616 : CheckpointStats.ckpt_segs_recycled++;
4166 : /* Needn't recheck that slot on future iterations */
4167 2616 : (*endlogSegNo)++;
4168 : }
4169 : else
4170 : {
4171 : /* No need for any more future segments, or recycling failed ... */
4172 : int rc;
4173 :
4174 2474 : ereport(DEBUG2,
4175 : (errmsg_internal("removing write-ahead log file \"%s\"",
4176 : segname)));
4177 :
4178 : #ifdef WIN32
4179 :
4180 : /*
4181 : * On Windows, if another process (e.g another backend) holds the file
4182 : * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
4183 : * will still show up in directory listing until the last handle is
4184 : * closed. To avoid confusing the lingering deleted file for a live
4185 : * WAL file that needs to be archived, rename it before deleting it.
4186 : *
4187 : * If another process holds the file open without FILE_SHARE_DELETE
4188 : * flag, rename will fail. We'll try again at the next checkpoint.
4189 : */
4190 : snprintf(newpath, MAXPGPATH, "%s.deleted", path);
4191 : if (rename(path, newpath) != 0)
4192 : {
4193 : ereport(LOG,
4194 : (errcode_for_file_access(),
4195 : errmsg("could not rename file \"%s\": %m",
4196 : path)));
4197 : return;
4198 : }
4199 : rc = durable_unlink(newpath, LOG);
4200 : #else
4201 2474 : rc = durable_unlink(path, LOG);
4202 : #endif
4203 2474 : if (rc != 0)
4204 : {
4205 : /* Message already logged by durable_unlink() */
4206 0 : return;
4207 : }
4208 2474 : CheckpointStats.ckpt_segs_removed++;
4209 : }
4210 :
4211 5090 : XLogArchiveCleanup(segname);
4212 : }
4213 :
4214 : /*
4215 : * Verify whether pg_wal, pg_wal/archive_status, and pg_wal/summaries exist.
4216 : * If the latter do not exist, recreate them.
4217 : *
4218 : * It is not the goal of this function to verify the contents of these
4219 : * directories, but to help in cases where someone has performed a cluster
4220 : * copy for PITR purposes but omitted pg_wal from the copy.
4221 : *
4222 : * We could also recreate pg_wal if it doesn't exist, but a deliberate
4223 : * policy decision was made not to. It is fairly common for pg_wal to be
4224 : * a symlink, and if that was the DBA's intent then automatically making a
4225 : * plain directory would result in degraded performance with no notice.
4226 : */
4227 : static void
4228 1844 : ValidateXLOGDirectoryStructure(void)
4229 : {
4230 : char path[MAXPGPATH];
4231 : struct stat stat_buf;
4232 :
4233 : /* Check for pg_wal; if it doesn't exist, error out */
4234 1844 : if (stat(XLOGDIR, &stat_buf) != 0 ||
4235 1844 : !S_ISDIR(stat_buf.st_mode))
4236 0 : ereport(FATAL,
4237 : (errcode_for_file_access(),
4238 : errmsg("required WAL directory \"%s\" does not exist",
4239 : XLOGDIR)));
4240 :
4241 : /* Check for archive_status */
4242 1844 : snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
4243 1844 : if (stat(path, &stat_buf) == 0)
4244 : {
4245 : /* Check for weird cases where it exists but isn't a directory */
4246 1842 : if (!S_ISDIR(stat_buf.st_mode))
4247 0 : ereport(FATAL,
4248 : (errcode_for_file_access(),
4249 : errmsg("required WAL directory \"%s\" does not exist",
4250 : path)));
4251 : }
4252 : else
4253 : {
4254 2 : ereport(LOG,
4255 : (errmsg("creating missing WAL directory \"%s\"", path)));
4256 2 : if (MakePGDirectory(path) < 0)
4257 0 : ereport(FATAL,
4258 : (errcode_for_file_access(),
4259 : errmsg("could not create missing directory \"%s\": %m",
4260 : path)));
4261 : }
4262 :
4263 : /* Check for summaries */
4264 1844 : snprintf(path, MAXPGPATH, XLOGDIR "/summaries");
4265 1844 : if (stat(path, &stat_buf) == 0)
4266 : {
4267 : /* Check for weird cases where it exists but isn't a directory */
4268 1842 : if (!S_ISDIR(stat_buf.st_mode))
4269 0 : ereport(FATAL,
4270 : (errmsg("required WAL directory \"%s\" does not exist",
4271 : path)));
4272 : }
4273 : else
4274 : {
4275 2 : ereport(LOG,
4276 : (errmsg("creating missing WAL directory \"%s\"", path)));
4277 2 : if (MakePGDirectory(path) < 0)
4278 0 : ereport(FATAL,
4279 : (errmsg("could not create missing directory \"%s\": %m",
4280 : path)));
4281 : }
4282 1844 : }
4283 :
4284 : /*
4285 : * Remove previous backup history files. This also retries creation of
4286 : * .ready files for any backup history files for which XLogArchiveNotify
4287 : * failed earlier.
4288 : */
4289 : static void
4290 294 : CleanupBackupHistory(void)
4291 : {
4292 : DIR *xldir;
4293 : struct dirent *xlde;
4294 : char path[MAXPGPATH + sizeof(XLOGDIR)];
4295 :
4296 294 : xldir = AllocateDir(XLOGDIR);
4297 :
4298 2956 : while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4299 : {
4300 2368 : if (IsBackupHistoryFileName(xlde->d_name))
4301 : {
4302 310 : if (XLogArchiveCheckDone(xlde->d_name))
4303 : {
4304 244 : elog(DEBUG2, "removing WAL backup history file \"%s\"",
4305 : xlde->d_name);
4306 244 : snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
4307 244 : unlink(path);
4308 244 : XLogArchiveCleanup(xlde->d_name);
4309 : }
4310 : }
4311 : }
4312 :
4313 294 : FreeDir(xldir);
4314 294 : }
4315 :
4316 : /*
4317 : * I/O routines for pg_control
4318 : *
4319 : * *ControlFile is a buffer in shared memory that holds an image of the
4320 : * contents of pg_control. WriteControlFile() initializes pg_control
4321 : * given a preloaded buffer, ReadControlFile() loads the buffer from
4322 : * the pg_control file (during postmaster or standalone-backend startup),
4323 : * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4324 : * InitControlFile() fills the buffer with initial values.
4325 : *
4326 : * For simplicity, WriteControlFile() initializes the fields of pg_control
4327 : * that are related to checking backend/database compatibility, and
4328 : * ReadControlFile() verifies they are correct. We could split out the
4329 : * I/O and compatibility-check functions, but there seems no need currently.
4330 : */
4331 :
4332 : static void
4333 100 : InitControlFile(uint64 sysidentifier, uint32 data_checksum_version)
4334 : {
4335 : char mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
4336 :
4337 : /*
4338 : * Generate a random nonce. This is used for authentication requests that
4339 : * will fail because the user does not exist. The nonce is used to create
4340 : * a genuine-looking password challenge for the non-existent user, in lieu
4341 : * of an actual stored password.
4342 : */
4343 100 : if (!pg_strong_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
4344 0 : ereport(PANIC,
4345 : (errcode(ERRCODE_INTERNAL_ERROR),
4346 : errmsg("could not generate secret authorization token")));
4347 :
4348 100 : memset(ControlFile, 0, sizeof(ControlFileData));
4349 : /* Initialize pg_control status fields */
4350 100 : ControlFile->system_identifier = sysidentifier;
4351 100 : memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
4352 100 : ControlFile->state = DB_SHUTDOWNED;
4353 100 : ControlFile->unloggedLSN = FirstNormalUnloggedLSN;
4354 :
4355 : /* Set important parameter values for use when replaying WAL */
4356 100 : ControlFile->MaxConnections = MaxConnections;
4357 100 : ControlFile->max_worker_processes = max_worker_processes;
4358 100 : ControlFile->max_wal_senders = max_wal_senders;
4359 100 : ControlFile->max_prepared_xacts = max_prepared_xacts;
4360 100 : ControlFile->max_locks_per_xact = max_locks_per_xact;
4361 100 : ControlFile->wal_level = wal_level;
4362 100 : ControlFile->wal_log_hints = wal_log_hints;
4363 100 : ControlFile->track_commit_timestamp = track_commit_timestamp;
4364 100 : ControlFile->data_checksum_version = data_checksum_version;
4365 100 : }
4366 :
4367 : static void
4368 100 : WriteControlFile(void)
4369 : {
4370 : int fd;
4371 : char buffer[PG_CONTROL_FILE_SIZE]; /* need not be aligned */
4372 :
4373 : /*
4374 : * Initialize version and compatibility-check fields
4375 : */
4376 100 : ControlFile->pg_control_version = PG_CONTROL_VERSION;
4377 100 : ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4378 :
4379 100 : ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4380 100 : ControlFile->floatFormat = FLOATFORMAT_VALUE;
4381 :
4382 100 : ControlFile->blcksz = BLCKSZ;
4383 100 : ControlFile->relseg_size = RELSEG_SIZE;
4384 100 : ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4385 100 : ControlFile->xlog_seg_size = wal_segment_size;
4386 :
4387 100 : ControlFile->nameDataLen = NAMEDATALEN;
4388 100 : ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4389 :
4390 100 : ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
4391 100 : ControlFile->loblksize = LOBLKSIZE;
4392 :
4393 100 : ControlFile->float8ByVal = true; /* vestigial */
4394 :
4395 : /*
4396 : * Initialize the default 'char' signedness.
4397 : *
4398 : * The signedness of the char type is implementation-defined. For instance
4399 : * on x86 architecture CPUs, the char data type is typically treated as
4400             : * signed by default, whereas on ARM architecture CPUs, it is typically
4401 : * treated as unsigned by default. In v17 or earlier, we accidentally let
4402 : * C implementation signedness affect persistent data. This led to
4403 : * inconsistent results when comparing char data across different
4404 : * platforms.
4405 : *
4406 : * This flag can be used as a hint to ensure consistent behavior for
4407 : * pre-v18 data files that store data sorted by the 'char' type on disk,
4408 : * especially in cross-platform replication scenarios.
4409 : *
4410 : * Newly created database clusters unconditionally set the default char
4411 : * signedness to true. pg_upgrade changes this flag for clusters that were
4412 : * initialized on signedness=false platforms. As a result,
4413 : * signedness=false setting will become rare over time. If we had known
4414 : * about this problem during the last development cycle that forced initdb
4415 : * (v8.3), we would have made all clusters signed or all clusters
4416 : * unsigned. Making pg_upgrade the only source of signedness=false will
4417 : * cause the population of database clusters to converge toward that
4418 : * retrospective ideal.
4419 : */
4420 100 : ControlFile->default_char_signedness = true;
4421 :
4422 : /* Contents are protected with a CRC */
4423 100 : INIT_CRC32C(ControlFile->crc);
4424 100 : COMP_CRC32C(ControlFile->crc,
4425 : ControlFile,
4426 : offsetof(ControlFileData, crc));
4427 100 : FIN_CRC32C(ControlFile->crc);
4428 :
4429 : /*
4430 : * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
4431 : * the excess over sizeof(ControlFileData). This reduces the odds of
4432 : * premature-EOF errors when reading pg_control. We'll still fail when we
4433 : * check the contents of the file, but hopefully with a more specific
4434 : * error than "couldn't read pg_control".
4435 : */
4436 100 : memset(buffer, 0, PG_CONTROL_FILE_SIZE);
4437 100 : memcpy(buffer, ControlFile, sizeof(ControlFileData));
4438 :
4439 100 : fd = BasicOpenFile(XLOG_CONTROL_FILE,
4440 : O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
4441 100 : if (fd < 0)
4442 0 : ereport(PANIC,
4443 : (errcode_for_file_access(),
4444 : errmsg("could not create file \"%s\": %m",
4445 : XLOG_CONTROL_FILE)));
4446 :
4447 100 : errno = 0;
4448 100 : pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
4449 100 : if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
4450 : {
4451 : /* if write didn't set errno, assume problem is no disk space */
4452 0 : if (errno == 0)
4453 0 : errno = ENOSPC;
4454 0 : ereport(PANIC,
4455 : (errcode_for_file_access(),
4456 : errmsg("could not write to file \"%s\": %m",
4457 : XLOG_CONTROL_FILE)));
4458 : }
4459 100 : pgstat_report_wait_end();
4460 :
4461 100 : pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
4462 100 : if (pg_fsync(fd) != 0)
4463 0 : ereport(PANIC,
4464 : (errcode_for_file_access(),
4465 : errmsg("could not fsync file \"%s\": %m",
4466 : XLOG_CONTROL_FILE)));
4467 100 : pgstat_report_wait_end();
4468 :
4469 100 : if (close(fd) != 0)
4470 0 : ereport(PANIC,
4471 : (errcode_for_file_access(),
4472 : errmsg("could not close file \"%s\": %m",
4473 : XLOG_CONTROL_FILE)));
4474 100 : }
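
/*
 * Illustrative sketch (not part of the server): how an external tool might
 * re-verify the checksum computed in WriteControlFile() above, assuming a
 * buffer holding at least sizeof(ControlFileData) bytes of pg_control and
 * the CRC-32C macros from port/pg_crc32c.h. Note that the CRC covers only
 * the bytes up to, but not including, the crc field itself.
 */
static bool
control_file_crc_matches(const ControlFileData *cf)
{
	pg_crc32c	crc;

	/* recompute over everything preceding the stored crc field */
	INIT_CRC32C(crc);
	COMP_CRC32C(crc, cf, offsetof(ControlFileData, crc));
	FIN_CRC32C(crc);

	return EQ_CRC32C(crc, cf->crc);
}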
4475 :
4476 : static void
4477 1944 : ReadControlFile(void)
4478 : {
4479 : pg_crc32c crc;
4480 : int fd;
4481 : char wal_segsz_str[20];
4482 : int r;
4483 :
4484 : /*
4485 : * Read data...
4486 : */
4487 1944 : fd = BasicOpenFile(XLOG_CONTROL_FILE,
4488 : O_RDWR | PG_BINARY);
4489 1944 : if (fd < 0)
4490 0 : ereport(PANIC,
4491 : (errcode_for_file_access(),
4492 : errmsg("could not open file \"%s\": %m",
4493 : XLOG_CONTROL_FILE)));
4494 :
4495 1944 : pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
4496 1944 : r = read(fd, ControlFile, sizeof(ControlFileData));
4497 1944 : if (r != sizeof(ControlFileData))
4498 : {
4499 0 : if (r < 0)
4500 0 : ereport(PANIC,
4501 : (errcode_for_file_access(),
4502 : errmsg("could not read file \"%s\": %m",
4503 : XLOG_CONTROL_FILE)));
4504 : else
4505 0 : ereport(PANIC,
4506 : (errcode(ERRCODE_DATA_CORRUPTED),
4507 : errmsg("could not read file \"%s\": read %d of %zu",
4508 : XLOG_CONTROL_FILE, r, sizeof(ControlFileData))));
4509 : }
4510 1944 : pgstat_report_wait_end();
4511 :
4512 1944 : close(fd);
4513 :
4514 : /*
4515 : * Check for expected pg_control format version. If this is wrong, the
4516 : * CRC check will likely fail because we'll be checking the wrong number
4517 : * of bytes. Complaining about wrong version will probably be more
4518 : * enlightening than complaining about wrong CRC.
4519 : */
4520 :
 4521 1944 : 	if (ControlFile->pg_control_version != PG_CONTROL_VERSION &&
      : 		ControlFile->pg_control_version % 65536 == 0 &&
      : 		ControlFile->pg_control_version / 65536 != 0)
4522 0 : ereport(FATAL,
4523 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4524 : errmsg("database files are incompatible with server"),
4525 : errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4526 : " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4527 : ControlFile->pg_control_version, ControlFile->pg_control_version,
4528 : PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4529 : errhint("This could be a problem of mismatched byte ordering. It looks like you need to initdb.")));
4530 :
4531 1944 : if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4532 0 : ereport(FATAL,
4533 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4534 : errmsg("database files are incompatible with server"),
4535 : errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4536 : " but the server was compiled with PG_CONTROL_VERSION %d.",
4537 : ControlFile->pg_control_version, PG_CONTROL_VERSION),
4538 : errhint("It looks like you need to initdb.")));
4539 :
4540 : /* Now check the CRC. */
4541 1944 : INIT_CRC32C(crc);
4542 1944 : COMP_CRC32C(crc,
4543 : ControlFile,
4544 : offsetof(ControlFileData, crc));
4545 1944 : FIN_CRC32C(crc);
4546 :
4547 1944 : if (!EQ_CRC32C(crc, ControlFile->crc))
4548 0 : ereport(FATAL,
4549 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4550 : errmsg("incorrect checksum in control file")));
4551 :
4552 : /*
4553 : * Do compatibility checking immediately. If the database isn't
4554 : * compatible with the backend executable, we want to abort before we can
4555 : * possibly do any damage.
4556 : */
4557 1944 : if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4558 0 : ereport(FATAL,
4559 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4560 : errmsg("database files are incompatible with server"),
4561 : /* translator: %s is a variable name and %d is its value */
4562 : errdetail("The database cluster was initialized with %s %d,"
4563 : " but the server was compiled with %s %d.",
4564 : "CATALOG_VERSION_NO", ControlFile->catalog_version_no,
4565 : "CATALOG_VERSION_NO", CATALOG_VERSION_NO),
4566 : errhint("It looks like you need to initdb.")));
4567 1944 : if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4568 0 : ereport(FATAL,
4569 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4570 : errmsg("database files are incompatible with server"),
4571 : /* translator: %s is a variable name and %d is its value */
4572 : errdetail("The database cluster was initialized with %s %d,"
4573 : " but the server was compiled with %s %d.",
4574 : "MAXALIGN", ControlFile->maxAlign,
4575 : "MAXALIGN", MAXIMUM_ALIGNOF),
4576 : errhint("It looks like you need to initdb.")));
4577 1944 : if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4578 0 : ereport(FATAL,
4579 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4580 : errmsg("database files are incompatible with server"),
4581 : errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4582 : errhint("It looks like you need to initdb.")));
4583 1944 : if (ControlFile->blcksz != BLCKSZ)
4584 0 : ereport(FATAL,
4585 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4586 : errmsg("database files are incompatible with server"),
4587 : /* translator: %s is a variable name and %d is its value */
4588 : errdetail("The database cluster was initialized with %s %d,"
4589 : " but the server was compiled with %s %d.",
4590 : "BLCKSZ", ControlFile->blcksz,
4591 : "BLCKSZ", BLCKSZ),
4592 : errhint("It looks like you need to recompile or initdb.")));
4593 1944 : if (ControlFile->relseg_size != RELSEG_SIZE)
4594 0 : ereport(FATAL,
4595 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4596 : errmsg("database files are incompatible with server"),
4597 : /* translator: %s is a variable name and %d is its value */
4598 : errdetail("The database cluster was initialized with %s %d,"
4599 : " but the server was compiled with %s %d.",
4600 : "RELSEG_SIZE", ControlFile->relseg_size,
4601 : "RELSEG_SIZE", RELSEG_SIZE),
4602 : errhint("It looks like you need to recompile or initdb.")));
4603 1944 : if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4604 0 : ereport(FATAL,
4605 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4606 : errmsg("database files are incompatible with server"),
4607 : /* translator: %s is a variable name and %d is its value */
4608 : errdetail("The database cluster was initialized with %s %d,"
4609 : " but the server was compiled with %s %d.",
4610 : "XLOG_BLCKSZ", ControlFile->xlog_blcksz,
4611 : "XLOG_BLCKSZ", XLOG_BLCKSZ),
4612 : errhint("It looks like you need to recompile or initdb.")));
4613 1944 : if (ControlFile->nameDataLen != NAMEDATALEN)
4614 0 : ereport(FATAL,
4615 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4616 : errmsg("database files are incompatible with server"),
4617 : /* translator: %s is a variable name and %d is its value */
4618 : errdetail("The database cluster was initialized with %s %d,"
4619 : " but the server was compiled with %s %d.",
4620 : "NAMEDATALEN", ControlFile->nameDataLen,
4621 : "NAMEDATALEN", NAMEDATALEN),
4622 : errhint("It looks like you need to recompile or initdb.")));
4623 1944 : if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4624 0 : ereport(FATAL,
4625 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4626 : errmsg("database files are incompatible with server"),
4627 : /* translator: %s is a variable name and %d is its value */
4628 : errdetail("The database cluster was initialized with %s %d,"
4629 : " but the server was compiled with %s %d.",
4630 : "INDEX_MAX_KEYS", ControlFile->indexMaxKeys,
4631 : "INDEX_MAX_KEYS", INDEX_MAX_KEYS),
4632 : errhint("It looks like you need to recompile or initdb.")));
4633 1944 : if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4634 0 : ereport(FATAL,
4635 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4636 : errmsg("database files are incompatible with server"),
4637 : /* translator: %s is a variable name and %d is its value */
4638 : errdetail("The database cluster was initialized with %s %d,"
4639 : " but the server was compiled with %s %d.",
4640 : "TOAST_MAX_CHUNK_SIZE", ControlFile->toast_max_chunk_size,
4641 : "TOAST_MAX_CHUNK_SIZE", (int) TOAST_MAX_CHUNK_SIZE),
4642 : errhint("It looks like you need to recompile or initdb.")));
4643 1944 : if (ControlFile->loblksize != LOBLKSIZE)
4644 0 : ereport(FATAL,
4645 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4646 : errmsg("database files are incompatible with server"),
4647 : /* translator: %s is a variable name and %d is its value */
4648 : errdetail("The database cluster was initialized with %s %d,"
4649 : " but the server was compiled with %s %d.",
4650 : "LOBLKSIZE", ControlFile->loblksize,
4651 : "LOBLKSIZE", (int) LOBLKSIZE),
4652 : errhint("It looks like you need to recompile or initdb.")));
4653 :
4654 : Assert(ControlFile->float8ByVal); /* vestigial, not worth an error msg */
4655 :
4656 1944 : wal_segment_size = ControlFile->xlog_seg_size;
4657 :
4658 1944 : if (!IsValidWalSegSize(wal_segment_size))
4659 0 : ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4660 : errmsg_plural("invalid WAL segment size in control file (%d byte)",
4661 : "invalid WAL segment size in control file (%d bytes)",
4662 : wal_segment_size,
4663 : wal_segment_size),
4664 : errdetail("The WAL segment size must be a power of two between 1 MB and 1 GB.")));
4665 :
4666 1944 : snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
4667 1944 : SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
4668 : PGC_S_DYNAMIC_DEFAULT);
4669 :
4670 : /* check and update variables dependent on wal_segment_size */
4671 1944 : if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2)
4672 0 : ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4673 : /* translator: both %s are GUC names */
4674 : errmsg("\"%s\" must be at least twice \"%s\"",
4675 : "min_wal_size", "wal_segment_size")));
4676 :
4677 1944 : if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2)
4678 0 : ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4679 : /* translator: both %s are GUC names */
4680 : errmsg("\"%s\" must be at least twice \"%s\"",
4681 : "max_wal_size", "wal_segment_size")));
4682 :
4683 1944 : UsableBytesInSegment =
4684 1944 : (wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) -
4685 : (SizeOfXLogLongPHD - SizeOfXLogShortPHD);
4686 :
4687 1944 : CalculateCheckpointSegments();
4688 :
4689 : /* Make the initdb settings visible as GUC variables, too */
4690 1944 : SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
4691 : PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
4692 1944 : }
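
/*
 * Worked example of the UsableBytesInSegment computation above, assuming
 * the defaults wal_segment_size = 16MB and XLOG_BLCKSZ = 8192, with
 * UsableBytesInPage = XLOG_BLCKSZ - SizeOfXLogShortPHD as defined earlier
 * in this file: every one of the segment's 2048 pages loses a short page
 * header, and the first page additionally carries the larger long header,
 * hence the extra (SizeOfXLogLongPHD - SizeOfXLogShortPHD) subtraction:
 *
 *   UsableBytesInSegment = 2048 * (8192 - SizeOfXLogShortPHD)
 *                          - (SizeOfXLogLongPHD - SizeOfXLogShortPHD)
 */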
4693 :
4694 : /*
4695 : * Utility wrapper to update the control file. Note that the control
4696 : * file gets flushed.
4697 : */
4698 : static void
4699 17888 : UpdateControlFile(void)
4700 : {
4701 17888 : update_controlfile(DataDir, ControlFile, true);
4702 17888 : }
4703 :
4704 : /*
4705 : * Returns the unique system identifier from control file.
4706 : */
4707 : uint64
4708 2734 : GetSystemIdentifier(void)
4709 : {
4710 : Assert(ControlFile != NULL);
4711 2734 : return ControlFile->system_identifier;
4712 : }
4713 :
4714 : /*
4715 : * Returns the random nonce from control file.
4716 : */
4717 : char *
4718 2 : GetMockAuthenticationNonce(void)
4719 : {
4720 : Assert(ControlFile != NULL);
4721 2 : return ControlFile->mock_authentication_nonce;
4722 : }
4723 :
4724 : /*
4725 : * Are checksums enabled for data pages?
4726 : */
4727 : bool
4728 19841554 : DataChecksumsEnabled(void)
4729 : {
4730 : Assert(ControlFile != NULL);
4731 19841554 : return (ControlFile->data_checksum_version > 0);
4732 : }
4733 :
4734 : /*
4735 : * Return true if the cluster was initialized on a platform where the
4736 : * default signedness of char is "signed". This function exists for code
4737 : * that deals with pre-v18 data files that store data sorted by the 'char'
4738 : * type on disk (e.g., GIN and GiST indexes). See the comments in
4739 : * WriteControlFile() for details.
4740 : */
4741 : bool
4742 6 : GetDefaultCharSignedness(void)
4743 : {
4744 6 : return ControlFile->default_char_signedness;
4745 : }
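
/*
 * Hypothetical illustration of the intended use: code that compares raw
 * char data coming from pre-v18 files can normalize to the signedness
 * recorded for the cluster instead of the compiler's default, e.g.:
 *
 *   int	c = GetDefaultCharSignedness() ?
 *		(int) (signed char) b : (int) (unsigned char) b;
 */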
4746 :
4747 : /*
4748 : * Returns a fake LSN for unlogged relations.
4749 : *
4750 : * Each call generates an LSN that is greater than any previous value
4751 : * returned. The current counter value is saved and restored across clean
4752 : * shutdowns, but like unlogged relations, does not survive a crash. This can
4753 : * be used in lieu of real LSN values returned by XLogInsert, if you need an
4754 : * LSN-like increasing sequence of numbers without writing any WAL.
4755 : */
4756 : XLogRecPtr
4757 66 : GetFakeLSNForUnloggedRel(void)
4758 : {
4759 66 : return pg_atomic_fetch_add_u64(&XLogCtl->unloggedLSN, 1);
4760 : }
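
/*
 * Illustrative use (a sketch, not a required pattern): an access method
 * that needs a monotonically increasing page LSN for an unlogged relation,
 * without emitting any WAL, can stamp the page like this:
 *
 *   PageSetLSN(page, GetFakeLSNForUnloggedRel());
 *
 * Successive calls are strictly increasing, which is the only property
 * such callers may rely on.
 */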
4761 :
4762 : /*
4763 : * Auto-tune the number of XLOG buffers.
4764 : *
4765 : * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4766 : * a maximum of one XLOG segment (there is little reason to think that more
4767 : * is helpful, at least so long as we force an fsync when switching log files)
4768 : * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4769 : * 9.1, when auto-tuning was added).
4770 : *
4771 : * This should not be called until NBuffers has received its final value.
4772 : */
4773 : static int
4774 2124 : XLOGChooseNumBuffers(void)
4775 : {
4776 : int xbuffers;
4777 :
4778 2124 : xbuffers = NBuffers / 32;
4779 2124 : if (xbuffers > (wal_segment_size / XLOG_BLCKSZ))
4780 48 : xbuffers = (wal_segment_size / XLOG_BLCKSZ);
4781 2124 : if (xbuffers < 8)
4782 818 : xbuffers = 8;
4783 2124 : return xbuffers;
4784 : }
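
/*
 * For illustration: with shared_buffers = 128MB (NBuffers = 16384) and the
 * default 16MB wal_segment_size, the auto-tune above computes 16384 / 32 =
 * 512 pages, below the cap of wal_segment_size / XLOG_BLCKSZ = 2048, so
 * wal_buffers ends up as 512 * 8KB = 4MB. A tiny NBuffers = 128 would
 * compute 4 and be raised to the floor of 8 blocks.
 */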
4785 :
4786 : /*
4787 : * GUC check_hook for wal_buffers
4788 : */
4789 : bool
4790 4328 : check_wal_buffers(int *newval, void **extra, GucSource source)
4791 : {
4792 : /*
4793 : * -1 indicates a request for auto-tune.
4794 : */
4795 4328 : if (*newval == -1)
4796 : {
4797 : /*
4798 : * If we haven't yet changed the boot_val default of -1, just let it
4799 : * be. We'll fix it when XLOGShmemSize is called.
4800 : */
4801 2204 : if (XLOGbuffers == -1)
4802 2204 : return true;
4803 :
4804 : /* Otherwise, substitute the auto-tune value */
4805 0 : *newval = XLOGChooseNumBuffers();
4806 : }
4807 :
4808 : /*
4809 : * We clamp manually-set values to at least 4 blocks. Prior to PostgreSQL
4810 : * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
4811 : * the case, we just silently treat such values as a request for the
4812 : * minimum. (We could throw an error instead, but that doesn't seem very
4813 : * helpful.)
4814 : */
4815 2124 : if (*newval < 4)
4816 0 : *newval = 4;
4817 :
4818 2124 : return true;
4819 : }
4820 :
4821 : /*
4822 : * GUC check_hook for wal_consistency_checking
4823 : */
4824 : bool
4825 3992 : check_wal_consistency_checking(char **newval, void **extra, GucSource source)
4826 : {
4827 : char *rawstring;
4828 : List *elemlist;
4829 : ListCell *l;
4830 : bool newwalconsistency[RM_MAX_ID + 1];
4831 :
4832 : /* Initialize the array */
4833 131736 : MemSet(newwalconsistency, 0, (RM_MAX_ID + 1) * sizeof(bool));
4834 :
4835 : /* Need a modifiable copy of string */
4836 3992 : rawstring = pstrdup(*newval);
4837 :
4838 : /* Parse string into list of identifiers */
4839 3992 : if (!SplitIdentifierString(rawstring, ',', &elemlist))
4840 : {
4841 : /* syntax error in list */
4842 0 : GUC_check_errdetail("List syntax is invalid.");
4843 0 : pfree(rawstring);
4844 0 : list_free(elemlist);
4845 0 : return false;
4846 : }
4847 :
4848 4892 : foreach(l, elemlist)
4849 : {
4850 900 : char *tok = (char *) lfirst(l);
4851 : int rmid;
4852 :
4853 : /* Check for 'all'. */
4854 900 : if (pg_strcasecmp(tok, "all") == 0)
4855 : {
4856 230272 : for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
4857 229376 : if (RmgrIdExists(rmid) && GetRmgr(rmid).rm_mask != NULL)
4858 8960 : newwalconsistency[rmid] = true;
4859 : }
4860 : else
4861 : {
4862 : /* Check if the token matches any known resource manager. */
4863 4 : bool found = false;
4864 :
4865 72 : for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
4866 : {
4867 108 : if (RmgrIdExists(rmid) && GetRmgr(rmid).rm_mask != NULL &&
4868 36 : pg_strcasecmp(tok, GetRmgr(rmid).rm_name) == 0)
4869 : {
4870 4 : newwalconsistency[rmid] = true;
4871 4 : found = true;
4872 4 : break;
4873 : }
4874 : }
4875 4 : if (!found)
4876 : {
4877 : /*
4878 : * During startup, it might be a not-yet-loaded custom
4879 : * resource manager. Defer checking until
4880 : * InitializeWalConsistencyChecking().
4881 : */
4882 0 : if (!process_shared_preload_libraries_done)
4883 : {
4884 0 : check_wal_consistency_checking_deferred = true;
4885 : }
4886 : else
4887 : {
4888 0 : GUC_check_errdetail("Unrecognized key word: \"%s\".", tok);
4889 0 : pfree(rawstring);
4890 0 : list_free(elemlist);
4891 0 : return false;
4892 : }
4893 : }
4894 : }
4895 : }
4896 :
4897 3992 : pfree(rawstring);
4898 3992 : list_free(elemlist);
4899 :
4900 : /* assign new value */
4901 3992 : *extra = guc_malloc(LOG, (RM_MAX_ID + 1) * sizeof(bool));
4902 3992 : if (!*extra)
4903 0 : return false;
4904 3992 : memcpy(*extra, newwalconsistency, (RM_MAX_ID + 1) * sizeof(bool));
4905 3992 : return true;
4906 : }
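
/*
 * Illustrative configuration (values hypothetical): in postgresql.conf,
 *
 *   wal_consistency_checking = 'heap,btree'  # check just two rmgrs
 *   wal_consistency_checking = 'all'         # every rmgr with an rm_mask
 *
 * Each comma-separated token is matched case-insensitively against the
 * registered resource-manager names by the parsing loop above.
 */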
4907 :
4908 : /*
4909 : * GUC assign_hook for wal_consistency_checking
4910 : */
4911 : void
4912 3990 : assign_wal_consistency_checking(const char *newval, void *extra)
4913 : {
4914 : /*
4915 : * If some checks were deferred, it's possible that the checks will fail
4916 : * later during InitializeWalConsistencyChecking(). But in that case, the
4917 : * postmaster will exit anyway, so it's safe to proceed with the
4918 : * assignment.
4919 : *
4920 : * Any built-in resource managers specified are assigned immediately,
4921 : * which affects WAL created before shared_preload_libraries are
4922 : * processed. Any custom resource managers specified won't be assigned
4923 : * until after shared_preload_libraries are processed, but that's OK
4924 : * because WAL for a custom resource manager can't be written before the
4925 : * module is loaded anyway.
4926 : */
4927 3990 : wal_consistency_checking = extra;
4928 3990 : }
4929 :
4930 : /*
4931 : * InitializeWalConsistencyChecking: run after loading custom resource managers
4932 : *
4933 : * If any unknown resource managers were specified in the
4934 : * wal_consistency_checking GUC, processing was deferred. Now that
4935 : * shared_preload_libraries have been loaded, process wal_consistency_checking
4936 : * again.
4937 : */
4938 : void
4939 1824 : InitializeWalConsistencyChecking(void)
4940 : {
4941 : Assert(process_shared_preload_libraries_done);
4942 :
4943 1824 : if (check_wal_consistency_checking_deferred)
4944 : {
4945 : struct config_generic *guc;
4946 :
4947 0 : guc = find_option("wal_consistency_checking", false, false, ERROR);
4948 :
4949 0 : check_wal_consistency_checking_deferred = false;
4950 :
4951 0 : set_config_option_ext("wal_consistency_checking",
4952 : wal_consistency_checking_string,
4953 : guc->scontext, guc->source, guc->srole,
4954 : GUC_ACTION_SET, true, ERROR, false);
4955 :
4956 : /* checking should not be deferred again */
4957 : Assert(!check_wal_consistency_checking_deferred);
4958 : }
4959 1824 : }
4960 :
4961 : /*
4962 : * GUC show_hook for archive_command
4963 : */
4964 : const char *
4965 3448 : show_archive_command(void)
4966 : {
4967 3448 : if (XLogArchivingActive())
4968 4 : return XLogArchiveCommand;
4969 : else
4970 3444 : return "(disabled)";
4971 : }
4972 :
4973 : /*
4974 : * GUC show_hook for in_hot_standby
4975 : */
4976 : const char *
4977 29578 : show_in_hot_standby(void)
4978 : {
4979 : /*
4980 : * We display the actual state based on shared memory, so that this GUC
4981 : * reports up-to-date state if examined intra-query. The underlying
4982 : * variable (in_hot_standby_guc) changes only when we transmit a new value
4983 : * to the client.
4984 : */
4985 29578 : return RecoveryInProgress() ? "on" : "off";
4986 : }
4987 :
4988 : /*
4989 : * Read the control file, set respective GUCs.
4990 : *
4991 : * This is to be called during startup, including a crash recovery cycle,
4992 : * unless in bootstrap mode, where no control file yet exists. As there's no
4993 : * usable shared memory yet (its sizing can depend on the contents of the
4994 : * control file!), first store the contents in local memory. XLOGShmemInit()
4995 : * will then copy it to shared memory later.
4996 : *
4997 : * reset just controls whether previous contents are to be expected (in the
4998 : * reset case, there's a dangling pointer into old shared memory), or not.
4999 : */
5000 : void
5001 1844 : LocalProcessControlFile(bool reset)
5002 : {
5003 : Assert(reset || ControlFile == NULL);
5004 1844 : ControlFile = palloc(sizeof(ControlFileData));
5005 1844 : ReadControlFile();
5006 1844 : }
5007 :
5008 : /*
 5009 : * Get the wal_level from the control file. For a standby, this value should
 5010 : * be considered its active wal_level, because it may differ from what was
 5011 : * originally configured on the standby.
5012 : */
5013 : WalLevel
5014 138 : GetActiveWalLevelOnStandby(void)
5015 : {
5016 138 : return ControlFile->wal_level;
5017 : }
5018 :
5019 : /*
5020 : * Initialization of shared memory for XLOG
5021 : */
5022 : Size
5023 6084 : XLOGShmemSize(void)
5024 : {
5025 : Size size;
5026 :
5027 : /*
5028 : * If the value of wal_buffers is -1, use the preferred auto-tune value.
5029 : * This isn't an amazingly clean place to do this, but we must wait till
5030 : * NBuffers has received its final value, and must do it before using the
5031 : * value of XLOGbuffers to do anything important.
5032 : *
5033 : * We prefer to report this value's source as PGC_S_DYNAMIC_DEFAULT.
5034 : * However, if the DBA explicitly set wal_buffers = -1 in the config file,
5035 : * then PGC_S_DYNAMIC_DEFAULT will fail to override that and we must force
5036 : * the matter with PGC_S_OVERRIDE.
5037 : */
5038 6084 : if (XLOGbuffers == -1)
5039 : {
5040 : char buf[32];
5041 :
5042 2124 : snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
5043 2124 : SetConfigOption("wal_buffers", buf, PGC_POSTMASTER,
5044 : PGC_S_DYNAMIC_DEFAULT);
5045 2124 : if (XLOGbuffers == -1) /* failed to apply it? */
5046 0 : SetConfigOption("wal_buffers", buf, PGC_POSTMASTER,
5047 : PGC_S_OVERRIDE);
5048 : }
5049 : Assert(XLOGbuffers > 0);
5050 :
5051 : /* XLogCtl */
5052 6084 : size = sizeof(XLogCtlData);
5053 :
5054 : /* WAL insertion locks, plus alignment */
5055 6084 : size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
5056 : /* xlblocks array */
5057 6084 : size = add_size(size, mul_size(sizeof(pg_atomic_uint64), XLOGbuffers));
5058 : /* extra alignment padding for XLOG I/O buffers */
5059 6084 : size = add_size(size, Max(XLOG_BLCKSZ, PG_IO_ALIGN_SIZE));
5060 : /* and the buffers themselves */
5061 6084 : size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
5062 :
5063 : /*
5064 : * Note: we don't count ControlFileData, it comes out of the "slop factor"
5065 : * added by CreateSharedMemoryAndSemaphores. This lets us use this
5066 : * routine again below to compute the actual allocation size.
5067 : */
5068 :
5069 6084 : return size;
5070 : }
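
/*
 * For a sense of scale, with the defaults XLOGbuffers = 512 (from the
 * auto-tune above), NUM_XLOGINSERT_LOCKS = 8, and XLOG_BLCKSZ = 8192, the
 * dominant terms are 512 xlblocks entries, nine padded insertion locks,
 * alignment slop of Max(8192, PG_IO_ALIGN_SIZE), and 512 * 8KB = 4MB of
 * page buffers: a shared allocation of a little over 4MB in total.
 */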
5071 :
5072 : void
5073 2128 : XLOGShmemInit(void)
5074 : {
5075 : bool foundCFile,
5076 : foundXLog;
5077 : char *allocptr;
5078 : int i;
5079 : ControlFileData *localControlFile;
5080 :
5081 : #ifdef WAL_DEBUG
5082 :
5083 : /*
5084 : * Create a memory context for WAL debugging that's exempt from the normal
5085 : * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
5086 : * an allocation fails, but wal_debug is not for production use anyway.
5087 : */
5088 : if (walDebugCxt == NULL)
5089 : {
5090 : walDebugCxt = AllocSetContextCreate(TopMemoryContext,
5091 : "WAL Debug",
5092 : ALLOCSET_DEFAULT_SIZES);
5093 : MemoryContextAllowInCriticalSection(walDebugCxt, true);
5094 : }
5095 : #endif
5096 :
5097 :
5098 2128 : XLogCtl = (XLogCtlData *)
5099 2128 : ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
5100 :
5101 2128 : localControlFile = ControlFile;
5102 2128 : ControlFile = (ControlFileData *)
5103 2128 : ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
5104 :
5105 2128 : if (foundCFile || foundXLog)
5106 : {
5107 : /* both should be present or neither */
5108 : Assert(foundCFile && foundXLog);
5109 :
5110 : /* Initialize local copy of WALInsertLocks */
5111 0 : WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
5112 :
5113 0 : if (localControlFile)
5114 0 : pfree(localControlFile);
5115 0 : return;
5116 : }
5117 2128 : memset(XLogCtl, 0, sizeof(XLogCtlData));
5118 :
5119 : /*
5120 : * Already have read control file locally, unless in bootstrap mode. Move
5121 : * contents into shared memory.
5122 : */
5123 2128 : if (localControlFile)
5124 : {
5125 1828 : memcpy(ControlFile, localControlFile, sizeof(ControlFileData));
5126 1828 : pfree(localControlFile);
5127 : }
5128 :
5129 : /*
5130 : * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
5131 : * multiple of the alignment for same, so no extra alignment padding is
5132 : * needed here.
5133 : */
5134 2128 : allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
5135 2128 : XLogCtl->xlblocks = (pg_atomic_uint64 *) allocptr;
5136 2128 : allocptr += sizeof(pg_atomic_uint64) * XLOGbuffers;
5137 :
5138 609836 : for (i = 0; i < XLOGbuffers; i++)
5139 : {
5140 607708 : pg_atomic_init_u64(&XLogCtl->xlblocks[i], InvalidXLogRecPtr);
5141 : }
5142 :
5143 : /* WAL insertion locks. Ensure they're aligned to the full padded size */
5144 2128 : allocptr += sizeof(WALInsertLockPadded) -
5145 2128 : ((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
5146 2128 : WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
5147 : (WALInsertLockPadded *) allocptr;
5148 2128 : allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;
5149 :
5150 19152 : for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
5151 : {
5152 17024 : LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
5153 17024 : pg_atomic_init_u64(&WALInsertLocks[i].l.insertingAt, InvalidXLogRecPtr);
5154 17024 : WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
5155 : }
5156 :
5157 : /*
5158 : * Align the start of the page buffers to a full xlog block size boundary.
5159 : * This simplifies some calculations in XLOG insertion. It is also
5160 : * required for O_DIRECT.
5161 : */
5162 2128 : allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
5163 2128 : XLogCtl->pages = allocptr;
5164 2128 : memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
5165 :
5166 : /*
5167 : * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
5168 : * in additional info.)
5169 : */
5170 2128 : XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
5171 2128 : XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
5172 2128 : XLogCtl->InstallXLogFileSegmentActive = false;
5173 2128 : XLogCtl->WalWriterSleeping = false;
5174 :
5175 2128 : SpinLockInit(&XLogCtl->Insert.insertpos_lck);
5176 2128 : SpinLockInit(&XLogCtl->info_lck);
5177 2128 : pg_atomic_init_u64(&XLogCtl->logInsertResult, InvalidXLogRecPtr);
5178 2128 : pg_atomic_init_u64(&XLogCtl->logWriteResult, InvalidXLogRecPtr);
5179 2128 : pg_atomic_init_u64(&XLogCtl->logFlushResult, InvalidXLogRecPtr);
5180 2128 : pg_atomic_init_u64(&XLogCtl->unloggedLSN, InvalidXLogRecPtr);
5181 :
5182 2128 : pg_atomic_init_u64(&XLogCtl->InitializeReserved, InvalidXLogRecPtr);
5183 2128 : pg_atomic_init_u64(&XLogCtl->InitializedUpTo, InvalidXLogRecPtr);
5184 2128 : ConditionVariableInit(&XLogCtl->InitializedUpToCondVar);
5185 : }
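
/*
 * A small sketch of the alignment idiom used above: TYPEALIGN rounds a
 * value up to the next multiple of a power-of-two boundary, so with
 * XLOG_BLCKSZ = 8192 an allocation pointer at offset 8200 is advanced to
 * 16384 before the page buffers start. The insertion-lock alignment a few
 * lines earlier achieves the same effect manually, by adding the distance
 * to the next sizeof(WALInsertLockPadded) boundary.
 */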
5186 :
5187 : /*
 5188 : * This function must be called ONCE at system install. It creates pg_control
5189 : * and the initial XLOG segment.
5190 : */
5191 : void
5192 100 : BootStrapXLOG(uint32 data_checksum_version)
5193 : {
5194 : CheckPoint checkPoint;
5195 : char *buffer;
5196 : XLogPageHeader page;
5197 : XLogLongPageHeader longpage;
5198 : XLogRecord *record;
5199 : char *recptr;
5200 : uint64 sysidentifier;
5201 : struct timeval tv;
5202 : pg_crc32c crc;
5203 :
5204 : /* allow ordinary WAL segment creation, like StartupXLOG() would */
5205 100 : SetInstallXLogFileSegmentActive();
5206 :
5207 : /*
5208 : * Select a hopefully-unique system identifier code for this installation.
5209 : * We use the result of gettimeofday(), including the fractional seconds
5210 : * field, as being about as unique as we can easily get. (Think not to
5211 : * use random(), since it hasn't been seeded and there's no portable way
5212 : * to seed it other than the system clock value...) The upper half of the
5213 : * uint64 value is just the tv_sec part, while the lower half contains the
5214 : * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
5215 : * PID for a little extra uniqueness. A person knowing this encoding can
5216 : * determine the initialization time of the installation, which could
5217 : * perhaps be useful sometimes.
5218 : */
5219 100 : gettimeofday(&tv, NULL);
5220 100 : sysidentifier = ((uint64) tv.tv_sec) << 32;
5221 100 : sysidentifier |= ((uint64) tv.tv_usec) << 12;
5222 100 : sysidentifier |= getpid() & 0xFFF;
5223 :
5224 : /* page buffer must be aligned suitably for O_DIRECT */
5225 100 : buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
5226 100 : page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
5227 100 : memset(page, 0, XLOG_BLCKSZ);
5228 :
5229 : /*
5230 : * Set up information for the initial checkpoint record
5231 : *
5232 : * The initial checkpoint record is written to the beginning of the WAL
5233 : * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
5234 : * used, so that we can use 0/0 to mean "before any valid WAL segment".
5235 : */
5236 100 : checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD;
5237 100 : checkPoint.ThisTimeLineID = BootstrapTimeLineID;
5238 100 : checkPoint.PrevTimeLineID = BootstrapTimeLineID;
5239 100 : checkPoint.fullPageWrites = fullPageWrites;
5240 100 : checkPoint.wal_level = wal_level;
5241 : checkPoint.nextXid =
5242 100 : FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
5243 100 : checkPoint.nextOid = FirstGenbkiObjectId;
5244 100 : checkPoint.nextMulti = FirstMultiXactId;
5245 100 : checkPoint.nextMultiOffset = 0;
5246 100 : checkPoint.oldestXid = FirstNormalTransactionId;
5247 100 : checkPoint.oldestXidDB = Template1DbOid;
5248 100 : checkPoint.oldestMulti = FirstMultiXactId;
5249 100 : checkPoint.oldestMultiDB = Template1DbOid;
5250 100 : checkPoint.oldestCommitTsXid = InvalidTransactionId;
5251 100 : checkPoint.newestCommitTsXid = InvalidTransactionId;
5252 100 : checkPoint.time = (pg_time_t) time(NULL);
5253 100 : checkPoint.oldestActiveXid = InvalidTransactionId;
5254 :
5255 100 : TransamVariables->nextXid = checkPoint.nextXid;
5256 100 : TransamVariables->nextOid = checkPoint.nextOid;
5257 100 : TransamVariables->oidCount = 0;
5258 100 : MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5259 100 : AdvanceOldestClogXid(checkPoint.oldestXid);
5260 100 : SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5261 100 : SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
5262 100 : SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
5263 :
5264 : /* Set up the XLOG page header */
5265 100 : page->xlp_magic = XLOG_PAGE_MAGIC;
5266 100 : page->xlp_info = XLP_LONG_HEADER;
5267 100 : page->xlp_tli = BootstrapTimeLineID;
5268 100 : page->xlp_pageaddr = wal_segment_size;
5269 100 : longpage = (XLogLongPageHeader) page;
5270 100 : longpage->xlp_sysid = sysidentifier;
5271 100 : longpage->xlp_seg_size = wal_segment_size;
5272 100 : longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
5273 :
5274 : /* Insert the initial checkpoint record */
5275 100 : recptr = ((char *) page + SizeOfXLogLongPHD);
5276 100 : record = (XLogRecord *) recptr;
5277 100 : record->xl_prev = 0;
5278 100 : record->xl_xid = InvalidTransactionId;
5279 100 : record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
5280 100 : record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
5281 100 : record->xl_rmid = RM_XLOG_ID;
5282 100 : recptr += SizeOfXLogRecord;
5283 : /* fill the XLogRecordDataHeaderShort struct */
5284 100 : *(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
5285 100 : *(recptr++) = sizeof(checkPoint);
5286 100 : memcpy(recptr, &checkPoint, sizeof(checkPoint));
5287 100 : recptr += sizeof(checkPoint);
5288 : Assert(recptr - (char *) record == record->xl_tot_len);
5289 :
5290 100 : INIT_CRC32C(crc);
5291 100 : COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
5292 100 : COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
5293 100 : FIN_CRC32C(crc);
5294 100 : record->xl_crc = crc;
5295 :
5296 : /* Create first XLOG segment file */
5297 100 : openLogTLI = BootstrapTimeLineID;
5298 100 : openLogFile = XLogFileInit(1, BootstrapTimeLineID);
5299 :
5300 : /*
5301 : * We needn't bother with Reserve/ReleaseExternalFD here, since we'll
5302 : * close the file again in a moment.
5303 : */
5304 :
5305 : /* Write the first page with the initial record */
5306 100 : errno = 0;
5307 100 : pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
5308 100 : if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
5309 : {
5310 : /* if write didn't set errno, assume problem is no disk space */
5311 0 : if (errno == 0)
5312 0 : errno = ENOSPC;
5313 0 : ereport(PANIC,
5314 : (errcode_for_file_access(),
5315 : errmsg("could not write bootstrap write-ahead log file: %m")));
5316 : }
5317 100 : pgstat_report_wait_end();
5318 :
5319 100 : pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
5320 100 : if (pg_fsync(openLogFile) != 0)
5321 0 : ereport(PANIC,
5322 : (errcode_for_file_access(),
5323 : errmsg("could not fsync bootstrap write-ahead log file: %m")));
5324 100 : pgstat_report_wait_end();
5325 :
5326 100 : if (close(openLogFile) != 0)
5327 0 : ereport(PANIC,
5328 : (errcode_for_file_access(),
5329 : errmsg("could not close bootstrap write-ahead log file: %m")));
5330 :
5331 100 : openLogFile = -1;
5332 :
5333 : /* Now create pg_control */
5334 100 : InitControlFile(sysidentifier, data_checksum_version);
5335 100 : ControlFile->time = checkPoint.time;
5336 100 : ControlFile->checkPoint = checkPoint.redo;
5337 100 : ControlFile->checkPointCopy = checkPoint;
5338 :
5339 : /* some additional ControlFile fields are set in WriteControlFile() */
5340 100 : WriteControlFile();
5341 :
5342 : /* Bootstrap the commit log, too */
5343 100 : BootStrapCLOG();
5344 100 : BootStrapCommitTs();
5345 100 : BootStrapSUBTRANS();
5346 100 : BootStrapMultiXact();
5347 :
5348 100 : pfree(buffer);
5349 :
5350 : /*
5351 : * Force control file to be read - in contrast to normal processing we'd
5352 : * otherwise never run the checks and GUC related initializations therein.
5353 : */
5354 100 : ReadControlFile();
5355 100 : }
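
/*
 * A hypothetical helper (illustration only) inverting the sysidentifier
 * encoding chosen above: the top 32 bits hold tv_sec, the next 20 bits
 * hold tv_usec (which always fits, as it is below 2^20), and the low 12
 * bits are taken from the bootstrapping process's PID.
 */
static void
decode_system_identifier(uint64 sysid, time_t *init_time,
						 uint32 *usec, uint32 *pid_low_bits)
{
	*init_time = (time_t) (sysid >> 32);	/* seconds since epoch */
	*usec = (uint32) ((sysid >> 12) & 0xFFFFF); /* fractional seconds */
	*pid_low_bits = (uint32) (sysid & 0xFFF);	/* low 12 bits of PID */
}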
5356 :
5357 : static char *
5358 1644 : str_time(pg_time_t tnow, char *buf, size_t bufsize)
5359 : {
5360 1644 : pg_strftime(buf, bufsize,
5361 : "%Y-%m-%d %H:%M:%S %Z",
5362 1644 : pg_localtime(&tnow, log_timezone));
5363 :
5364 1644 : return buf;
5365 : }
5366 :
5367 : /*
5368 : * Initialize the first WAL segment on new timeline.
5369 : */
5370 : static void
5371 98 : XLogInitNewTimeline(TimeLineID endTLI, XLogRecPtr endOfLog, TimeLineID newTLI)
5372 : {
5373 : char xlogfname[MAXFNAMELEN];
5374 : XLogSegNo endLogSegNo;
5375 : XLogSegNo startLogSegNo;
5376 :
5377 : /* we always switch to a new timeline after archive recovery */
5378 : Assert(endTLI != newTLI);
5379 :
5380 : /*
5381 : * Update min recovery point one last time.
5382 : */
5383 98 : UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
5384 :
5385 : /*
5386 : * Calculate the last segment on the old timeline, and the first segment
5387 : * on the new timeline. If the switch happens in the middle of a segment,
5388 : * they are the same, but if the switch happens exactly at a segment
5389 : * boundary, startLogSegNo will be endLogSegNo + 1.
5390 : */
5391 98 : XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size);
5392 98 : XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size);
5393 :
5394 : /*
5395 : * Initialize the starting WAL segment for the new timeline. If the switch
5396 : * happens in the middle of a segment, copy data from the last WAL segment
5397 : * of the old timeline up to the switch point, to the starting WAL segment
5398 : * on the new timeline.
5399 : */
5400 98 : if (endLogSegNo == startLogSegNo)
5401 : {
5402 : /*
5403 : * Make a copy of the file on the new timeline.
5404 : *
5405 : * Writing WAL isn't allowed yet, so there are no locking
 5406 : * considerations. But we should be just as careful as XLogFileInit to
5407 : * avoid emplacing a bogus file.
5408 : */
5409 80 : XLogFileCopy(newTLI, endLogSegNo, endTLI, endLogSegNo,
5410 80 : XLogSegmentOffset(endOfLog, wal_segment_size));
5411 : }
5412 : else
5413 : {
5414 : /*
5415 : * The switch happened at a segment boundary, so just create the next
5416 : * segment on the new timeline.
5417 : */
5418 : int fd;
5419 :
5420 18 : fd = XLogFileInit(startLogSegNo, newTLI);
5421 :
5422 18 : if (close(fd) != 0)
5423 : {
5424 0 : int save_errno = errno;
5425 :
5426 0 : XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size);
5427 0 : errno = save_errno;
5428 0 : ereport(ERROR,
5429 : (errcode_for_file_access(),
5430 : errmsg("could not close file \"%s\": %m", xlogfname)));
5431 : }
5432 : }
5433 :
5434 : /*
5435 : * Let's just make real sure there are not .ready or .done flags posted
5436 : * for the new segment.
5437 : */
5438 98 : XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size);
5439 98 : XLogArchiveCleanup(xlogfname);
5440 98 : }
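
/*
 * For illustration, with 16MB segments: if the timeline switch happens
 * mid-segment at endOfLog = 0/1700000, XLByteToPrevSeg and XLByteToSeg
 * both yield segment 1, so that segment is copied from the old timeline up
 * to the switch point. If endOfLog = 0/2000000, exactly on a segment
 * boundary, XLByteToPrevSeg yields 1 while XLByteToSeg yields 2, and a
 * fresh segment 2 is created on the new timeline instead.
 */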
5441 :
5442 : /*
5443 : * Perform cleanup actions at the conclusion of archive recovery.
5444 : */
5445 : static void
5446 98 : CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog,
5447 : TimeLineID newTLI)
5448 : {
5449 : /*
5450 : * Execute the recovery_end_command, if any.
5451 : */
5452 98 : if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0)
5453 4 : ExecuteRecoveryCommand(recoveryEndCommand,
5454 : "recovery_end_command",
5455 : true,
5456 : WAIT_EVENT_RECOVERY_END_COMMAND);
5457 :
5458 : /*
5459 : * We switched to a new timeline. Clean up segments on the old timeline.
5460 : *
5461 : * If there are any higher-numbered segments on the old timeline, remove
5462 : * them. They might contain valid WAL, but they might also be
5463 : * pre-allocated files containing garbage. In any case, they are not part
5464 : * of the new timeline's history so we don't need them.
5465 : */
5466 98 : RemoveNonParentXlogFiles(EndOfLog, newTLI);
5467 :
5468 : /*
5469 : * If the switch happened in the middle of a segment, what to do with the
5470 : * last, partial segment on the old timeline? If we don't archive it, and
5471 : * the server that created the WAL never archives it either (e.g. because
5472 : * it was hit by a meteor), it will never make it to the archive. That's
5473 : * OK from our point of view, because the new segment that we created with
5474 : * the new TLI contains all the WAL from the old timeline up to the switch
5475 : * point. But if you later try to do PITR to the "missing" WAL on the old
5476 : * timeline, recovery won't find it in the archive. It's physically
5477 : * present in the new file with new TLI, but recovery won't look there
5478 : * when it's recovering to the older timeline. On the other hand, if we
5479 : * archive the partial segment, and the original server on that timeline
5480 : * is still running and archives the completed version of the same segment
5481 : * later, it will fail. (We used to do that in 9.4 and below, and it
5482 : * caused such problems).
5483 : *
5484 : * As a compromise, we rename the last segment with the .partial suffix,
5485 : * and archive it. Archive recovery will never try to read .partial
5486 : * segments, so they will normally go unused. But in the odd PITR case,
5487 : * the administrator can copy them manually to the pg_wal directory
5488 : * (removing the suffix). They can be useful in debugging, too.
5489 : *
5490 : * If a .done or .ready file already exists for the old timeline, however,
5491 : * we had already determined that the segment is complete, so we can let
5492 : * it be archived normally. (In particular, if it was restored from the
5493 : * archive to begin with, it's expected to have a .done file).
5494 : */
5495 98 : if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
5496 : XLogArchivingActive())
5497 : {
5498 : char origfname[MAXFNAMELEN];
5499 : XLogSegNo endLogSegNo;
5500 :
5501 20 : XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
5502 20 : XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
5503 :
5504 20 : if (!XLogArchiveIsReadyOrDone(origfname))
5505 : {
5506 : char origpath[MAXPGPATH];
5507 : char partialfname[MAXFNAMELEN];
5508 : char partialpath[MAXPGPATH];
5509 :
5510 : /*
5511 : * If we're summarizing WAL, we can't rename the partial file
5512 : * until the summarizer finishes with it, else it will fail.
5513 : */
5514 12 : if (summarize_wal)
5515 2 : WaitForWalSummarization(EndOfLog);
5516 :
5517 12 : XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
5518 12 : snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
5519 12 : snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
5520 :
5521 : /*
5522 : * Make sure there's no .done or .ready file for the .partial
5523 : * file.
5524 : */
5525 12 : XLogArchiveCleanup(partialfname);
5526 :
5527 12 : durable_rename(origpath, partialpath, ERROR);
5528 12 : XLogArchiveNotify(partialfname);
5529 : }
5530 : }
5531 98 : }
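
/*
 * For example, if recovery ended mid-segment in 000000010000000000000003
 * and we switched to timeline 2, the old-timeline file is renamed to
 * 000000010000000000000003.partial and notified for archiving, while the
 * newly created 000000020000000000000003 carries the live WAL forward.
 */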
5532 :
5533 : /*
5534 : * Check to see if required parameters are set high enough on this server
5535 : * for various aspects of recovery operation.
5536 : *
5537 : * Note that all the parameters which this function tests need to be
5538 : * listed in Administrator's Overview section in high-availability.sgml.
5539 : * If you change them, don't forget to update the list.
5540 : */
5541 : static void
5542 492 : CheckRequiredParameterValues(void)
5543 : {
5544 : /*
5545 : * For archive recovery, the WAL must be generated with at least 'replica'
5546 : * wal_level.
5547 : */
5548 492 : if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
5549 : {
5550 4 : ereport(FATAL,
5551 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
5552 : errmsg("WAL was generated with \"wal_level=minimal\", cannot continue recovering"),
5553 : errdetail("This happens if you temporarily set \"wal_level=minimal\" on the server."),
5554 : errhint("Use a backup taken after setting \"wal_level\" to higher than \"minimal\".")));
5555 : }
5556 :
5557 : /*
 5558 : * For Hot Standby, the WAL must have been generated with at least
 5559 : * 'replica' wal_level, and we must have at least as many backend slots
 5560 : * as the primary.
5560 : */
5561 488 : if (ArchiveRecoveryRequested && EnableHotStandby)
5562 : {
5563 : /* We ignore autovacuum_worker_slots when we make this test. */
5564 246 : RecoveryRequiresIntParameter("max_connections",
5565 : MaxConnections,
5566 246 : ControlFile->MaxConnections);
5567 246 : RecoveryRequiresIntParameter("max_worker_processes",
5568 : max_worker_processes,
5569 246 : ControlFile->max_worker_processes);
5570 246 : RecoveryRequiresIntParameter("max_wal_senders",
5571 : max_wal_senders,
5572 246 : ControlFile->max_wal_senders);
5573 246 : RecoveryRequiresIntParameter("max_prepared_transactions",
5574 : max_prepared_xacts,
5575 246 : ControlFile->max_prepared_xacts);
5576 246 : RecoveryRequiresIntParameter("max_locks_per_transaction",
5577 : max_locks_per_xact,
5578 246 : ControlFile->max_locks_per_xact);
5579 : }
5580 488 : }
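
/*
 * For example, if the primary generated its WAL with max_connections =
 * 200, a hot-standby server configured with max_connections = 100 trips
 * the corresponding check above and cannot serve hot-standby queries
 * until its setting is raised to at least 200.
 */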
5581 :
5582 : /*
5583 : * This must be called ONCE during postmaster or standalone-backend startup
5584 : */
5585 : void
5586 1844 : StartupXLOG(void)
5587 : {
5588 : XLogCtlInsert *Insert;
5589 : CheckPoint checkPoint;
5590 : bool wasShutdown;
5591 : bool didCrash;
5592 : bool haveTblspcMap;
5593 : bool haveBackupLabel;
5594 : XLogRecPtr EndOfLog;
5595 : TimeLineID EndOfLogTLI;
5596 : TimeLineID newTLI;
5597 : bool performedWalRecovery;
5598 : EndOfWalRecoveryInfo *endOfRecoveryInfo;
5599 : XLogRecPtr abortedRecPtr;
5600 : XLogRecPtr missingContrecPtr;
5601 : TransactionId oldestActiveXID;
5602 1844 : bool promoted = false;
5603 : char timebuf[128];
5604 :
5605 : /*
5606 : * We should have an aux process resource owner to use, and we should not
5607 : * be in a transaction that's installed some other resowner.
5608 : */
5609 : Assert(AuxProcessResourceOwner != NULL);
5610 : Assert(CurrentResourceOwner == NULL ||
5611 : CurrentResourceOwner == AuxProcessResourceOwner);
5612 1844 : CurrentResourceOwner = AuxProcessResourceOwner;
5613 :
5614 : /*
5615 : * Check that contents look valid.
5616 : */
5617 1844 : if (!XRecOffIsValid(ControlFile->checkPoint))
5618 0 : ereport(FATAL,
5619 : (errcode(ERRCODE_DATA_CORRUPTED),
5620 : errmsg("control file contains invalid checkpoint location")));
5621 :
5622 1844 : switch (ControlFile->state)
5623 : {
5624 1430 : case DB_SHUTDOWNED:
5625 :
5626 : /*
5627 : * This is the expected case, so don't be chatty in standalone
5628 : * mode
5629 : */
5630 1430 : ereport(IsPostmasterEnvironment ? LOG : NOTICE,
5631 : (errmsg("database system was shut down at %s",
5632 : str_time(ControlFile->time,
5633 : timebuf, sizeof(timebuf)))));
5634 1430 : break;
5635 :
5636 64 : case DB_SHUTDOWNED_IN_RECOVERY:
5637 64 : ereport(LOG,
5638 : (errmsg("database system was shut down in recovery at %s",
5639 : str_time(ControlFile->time,
5640 : timebuf, sizeof(timebuf)))));
5641 64 : break;
5642 :
5643 0 : case DB_SHUTDOWNING:
5644 0 : ereport(LOG,
5645 : (errmsg("database system shutdown was interrupted; last known up at %s",
5646 : str_time(ControlFile->time,
5647 : timebuf, sizeof(timebuf)))));
5648 0 : break;
5649 :
5650 0 : case DB_IN_CRASH_RECOVERY:
5651 0 : ereport(LOG,
5652 : (errmsg("database system was interrupted while in recovery at %s",
5653 : str_time(ControlFile->time,
5654 : timebuf, sizeof(timebuf))),
5655 : errhint("This probably means that some data is corrupted and"
5656 : " you will have to use the last backup for recovery.")));
5657 0 : break;
5658 :
5659 12 : case DB_IN_ARCHIVE_RECOVERY:
5660 12 : ereport(LOG,
5661 : (errmsg("database system was interrupted while in recovery at log time %s",
5662 : str_time(ControlFile->checkPointCopy.time,
5663 : timebuf, sizeof(timebuf))),
5664 : errhint("If this has occurred more than once some data might be corrupted"
5665 : " and you might need to choose an earlier recovery target.")));
5666 12 : break;
5667 :
5668 338 : case DB_IN_PRODUCTION:
5669 338 : ereport(LOG,
5670 : (errmsg("database system was interrupted; last known up at %s",
5671 : str_time(ControlFile->time,
5672 : timebuf, sizeof(timebuf)))));
5673 338 : break;
5674 :
5675 0 : default:
5676 0 : ereport(FATAL,
5677 : (errcode(ERRCODE_DATA_CORRUPTED),
5678 : errmsg("control file contains invalid database cluster state")));
5679 : }
5680 :
5681 : /* This is just to allow attaching to startup process with a debugger */
5682 : #ifdef XLOG_REPLAY_DELAY
5683 : if (ControlFile->state != DB_SHUTDOWNED)
5684 : pg_usleep(60000000L);
5685 : #endif
5686 :
5687 : /*
5688 : * Verify that pg_wal, pg_wal/archive_status, and pg_wal/summaries exist.
5689 : * In cases where someone has performed a copy for PITR, these directories
5690 : * may have been excluded and need to be re-created.
5691 : */
5692 1844 : ValidateXLOGDirectoryStructure();
5693 :
5694 : /* Set up timeout handler needed to report startup progress. */
5695 1844 : if (!IsBootstrapProcessingMode())
5696 1744 : RegisterTimeout(STARTUP_PROGRESS_TIMEOUT,
5697 : startup_progress_timeout_handler);
5698 :
5699 : /*----------
5700 : * If we previously crashed, perform a couple of actions:
5701 : *
5702 : * - The pg_wal directory may still include some temporary WAL segments
5703 : * used when creating a new segment, so perform some clean up to not
5704 : * bloat this path. This is done first as there is no point to sync
5705 : * this temporary data.
5706 : *
5707 : * - There might be data which we had written, intending to fsync it, but
5708 : * which we had not actually fsync'd yet. Therefore, a power failure in
5709 : * the near future might cause earlier unflushed writes to be lost, even
5710 : * though more recent data written to disk from here on would be
5711 : * persisted. To avoid that, fsync the entire data directory.
5712 : */
5713 1844 : if (ControlFile->state != DB_SHUTDOWNED &&
5714 414 : ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
5715 : {
5716 350 : RemoveTempXlogFiles();
5717 350 : SyncDataDirectory();
5718 350 : didCrash = true;
5719 : }
5720 : else
5721 1494 : didCrash = false;
5722 :
5723 : /*
5724 : * Prepare for WAL recovery if needed.
5725 : *
5726 : * InitWalRecovery analyzes the control file and the backup label file, if
5727 : * any. It updates the in-memory ControlFile buffer according to the
5728 : * starting checkpoint, and sets InRecovery and ArchiveRecoveryRequested.
5729 : * It also applies the tablespace map file, if any.
5730 : */
5731 1844 : InitWalRecovery(ControlFile, &wasShutdown,
5732 : &haveBackupLabel, &haveTblspcMap);
5733 1844 : checkPoint = ControlFile->checkPointCopy;
5734 :
5735 : /* initialize shared memory variables from the checkpoint record */
5736 1844 : TransamVariables->nextXid = checkPoint.nextXid;
5737 1844 : TransamVariables->nextOid = checkPoint.nextOid;
5738 1844 : TransamVariables->oidCount = 0;
5739 1844 : MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5740 1844 : AdvanceOldestClogXid(checkPoint.oldestXid);
5741 1844 : SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5742 1844 : SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
5743 1844 : SetCommitTsLimit(checkPoint.oldestCommitTsXid,
5744 : checkPoint.newestCommitTsXid);
5745 :
5746 : /*
5747 : * Clear out any old relcache cache files. This is *necessary* if we do
5748 : * any WAL replay, since that would probably result in the cache files
5749 : * being out of sync with database reality. In theory we could leave them
5750 : * in place if the database had been cleanly shut down, but it seems
5751 : * safest to just remove them always and let them be rebuilt during the
5752 : * first backend startup. These files needs to be removed from all
5753 : * directories including pg_tblspc, however the symlinks are created only
5754 : * after reading tablespace_map file in case of archive recovery from
5755 : * backup, so needs to clear old relcache files here after creating
5756 : * symlinks.
5757 : */
5758 1844 : RelationCacheInitFileRemove();
5759 :
5760 : /*
5761 : * Initialize replication slots, before there's a chance to remove
5762 : * required resources.
5763 : */
5764 1844 : StartupReplicationSlots();
5765 :
5766 : /*
5767 : * Startup logical state, needs to be setup now so we have proper data
5768 : * during crash recovery.
5769 : */
5770 1842 : StartupReorderBuffer();
5771 :
5772 : /*
5773 : * Startup CLOG. This must be done after TransamVariables->nextXid has
5774 : * been initialized and before we accept connections or begin WAL replay.
5775 : */
5776 1842 : StartupCLOG();
5777 :
5778 : /*
5779 : * Startup MultiXact. We need to do this early to be able to replay
5780 : * truncations.
5781 : */
5782 1842 : StartupMultiXact();
5783 :
5784 : /*
5785 : * Ditto for commit timestamps. Activate the facility if the setting is
5786 : * enabled in the control file, as there should be no tracking of commit
5787 : * timestamps done when the setting was disabled. This facility can be
5788 : * started or stopped when replaying a XLOG_PARAMETER_CHANGE record.
5789 : */
5790 1842 : if (ControlFile->track_commit_timestamp)
5791 26 : StartupCommitTs();
5792 :
5793 : /*
5794 : * Recover knowledge about replay progress of known replication partners.
5795 : */
5796 1842 : StartupReplicationOrigin();
5797 :
5798 : /*
5799 : * Initialize unlogged LSN. On a clean shutdown, it's restored from the
5800 : * control file. On recovery, all unlogged relations are blown away, so
5801 : * the unlogged LSN counter can be reset too.
5802 : */
5803 1842 : if (ControlFile->state == DB_SHUTDOWNED)
5804 1416 : pg_atomic_write_membarrier_u64(&XLogCtl->unloggedLSN,
5805 1416 : ControlFile->unloggedLSN);
5806 : else
5807 426 : pg_atomic_write_membarrier_u64(&XLogCtl->unloggedLSN,
5808 : FirstNormalUnloggedLSN);
5809 :
5810 : /*
5811 : * Copy any missing timeline history files between 'now' and the recovery
5812 : * target timeline from archive to pg_wal. While we don't need those files
5813 : * ourselves - the history file of the recovery target timeline covers all
5814 : * the previous timelines in the history too - a cascading standby server
5815 : * might be interested in them. Or, if you archive the WAL from this
5816 : * server to a different archive than the primary, it'd be good for all
5817 : * the history files to get archived there after failover, so that you can
5818 : * use one of the old timelines as a PITR target. Timeline history files
5819 : * are small, so it's better to copy them unnecessarily than not copy them
5820 : * and regret later.
5821 : */
5822 1842 : restoreTimeLineHistoryFiles(checkPoint.ThisTimeLineID, recoveryTargetTLI);
5823 :
5824 : /*
5825 : * Before running in recovery, scan pg_twophase and fill in its status to
 5826 : * be able to work on entries generated by redo. Scanning before taking
 5827 : * any recovery action has the merit of discarding any 2PC files that
 5828 : * are newer than the first record to replay, saving us from conflicts
 5829 : * at replay. It also avoids any subsequent scans when recovering the
 5830 : * on-disk two-phase data.
5831 : */
5832 1842 : restoreTwoPhaseData();
5833 :
5834 : /*
5835 : * When starting with crash recovery, reset pgstat data - it might not be
5836 : * valid. Otherwise restore pgstat data. It's safe to do this here,
5837 : * because postmaster will not yet have started any other processes.
5838 : *
5839 : * NB: Restoring replication slot stats relies on slot state to have
5840 : * already been restored from disk.
5841 : *
5842 : * TODO: With a bit of extra work we could just start with a pgstat file
5843 : * associated with the checkpoint redo location we're starting from.
5844 : */
5845 1842 : if (didCrash)
5846 350 : pgstat_discard_stats();
5847 : else
5848 1492 : pgstat_restore_stats();
5849 :
5850 1842 : lastFullPageWrites = checkPoint.fullPageWrites;
5851 :
5852 1842 : RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
5853 1842 : doPageWrites = lastFullPageWrites;
5854 :
5855 : /* REDO */
5856 1842 : if (InRecovery)
5857 : {
5858 : /* Initialize state for RecoveryInProgress() */
5859 426 : SpinLockAcquire(&XLogCtl->info_lck);
5860 426 : if (InArchiveRecovery)
5861 222 : XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
5862 : else
5863 204 : XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
5864 426 : SpinLockRelease(&XLogCtl->info_lck);
5865 :
5866 : /*
5867 : * Update pg_control to show that we are recovering and to show the
5868 : * selected checkpoint as the place we are starting from. We also mark
5869 : * pg_control with any minimum recovery stop point obtained from a
5870 : * backup history file.
5871 : *
5872 : * No need to hold ControlFileLock yet, we aren't up far enough.
5873 : */
5874 426 : UpdateControlFile();
5875 :
5876 : /*
5877 : * If there was a backup label file, it's done its job and the info
5878 : * has now been propagated into pg_control. We must get rid of the
5879 : * label file so that if we crash during recovery, we'll pick up at
5880 : * the latest recovery restartpoint instead of going all the way back
5881 : * to the backup start point. It seems prudent though to just rename
5882 : * the file out of the way rather than delete it completely.
5883 : */
5884 426 : if (haveBackupLabel)
5885 : {
5886 142 : unlink(BACKUP_LABEL_OLD);
5887 142 : durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
5888 : }
5889 :
5890 : /*
5891 : * If there was a tablespace_map file, it's done its job and the
5892 : * symlinks have been created. We must get rid of the map file so
5893 : * that if we crash during recovery, we don't create symlinks again.
5894 : * It seems prudent though to just rename the file out of the way
5895 : * rather than delete it completely.
5896 : */
5897 426 : if (haveTblspcMap)
5898 : {
5899 4 : unlink(TABLESPACE_MAP_OLD);
5900 4 : durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
5901 : }
5902 :
5903 : /*
5904 : * Initialize our local copy of minRecoveryPoint. When doing crash
5905 : * recovery we want to replay up to the end of WAL.  In particular, in
5906 : * the case of a promoted standby, the minRecoveryPoint value in the
5907 : * control file is only updated after the first checkpoint.  If the
5908 : * instance crashes before the first post-recovery checkpoint completes,
5909 : * recovery would use that stale location, causing the startup process
5910 : * to think that there are still invalid page references when checking
5911 : * for data consistency.
5912 : */
5913 426 : if (InArchiveRecovery)
5914 : {
5915 222 : LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
5916 222 : LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
5917 : }
5918 : else
5919 : {
5920 204 : LocalMinRecoveryPoint = InvalidXLogRecPtr;
5921 204 : LocalMinRecoveryPointTLI = 0;
5922 : }
5923 :
5924 : /* Check that the GUCs used to generate the WAL allow recovery */
5925 426 : CheckRequiredParameterValues();
5926 :
5927 : /*
5928 : * We're in recovery, so unlogged relations may be trashed and must be
5929 : * reset. This should be done BEFORE allowing Hot Standby
5930 : * connections, so that read-only backends don't try to read whatever
5931 : * garbage is left over from before.
5932 : */
5933 426 : ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
5934 :
5935 : /*
5936 : * Likewise, delete any saved transaction snapshot files that got left
5937 : * behind by crashed backends.
5938 : */
5939 426 : DeleteAllExportedSnapshotFiles();
5940 :
5941 : /*
5942 : * Initialize for Hot Standby, if enabled. We won't let backends in
5943 : * yet, not until we've reached the min recovery point specified in
5944 : * control file and we've established a recovery snapshot from a
5945 : * running-xacts WAL record.
5946 : */
5947 426 : if (ArchiveRecoveryRequested && EnableHotStandby)
5948 : {
5949 : TransactionId *xids;
5950 : int nxids;
5951 :
5952 210 : ereport(DEBUG1,
5953 : (errmsg_internal("initializing for hot standby")));
5954 :
5955 210 : InitRecoveryTransactionEnvironment();
5956 :
5957 210 : if (wasShutdown)
5958 52 : oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
5959 : else
5960 158 : oldestActiveXID = checkPoint.oldestActiveXid;
5961 : Assert(TransactionIdIsValid(oldestActiveXID));
5962 :
5963 : /* Tell procarray about the range of xids it has to deal with */
5964 210 : ProcArrayInitRecovery(XidFromFullTransactionId(TransamVariables->nextXid));
5965 :
5966 : /*
5967 : * Start up subtrans only.  CLOG, MultiXact and commit timestamp
5968 : * have already been started up and other SLRUs are not maintained
5969 : * during recovery and need not be started yet.
5970 : */
5971 210 : StartupSUBTRANS(oldestActiveXID);
5972 :
5973 : /*
5974 : * If we're beginning at a shutdown checkpoint, we know that
5975 : * nothing was running on the primary at this point.  So fake up an
5976 : * empty running-xacts record and use that here and now. Recover
5977 : * additional standby state for prepared transactions.
5978 : */
5979 210 : if (wasShutdown)
5980 : {
5981 : RunningTransactionsData running;
5982 : TransactionId latestCompletedXid;
5983 :
5984 : /* Update pg_subtrans entries for any prepared transactions */
5985 52 : StandbyRecoverPreparedTransactions();
5986 :
5987 : /*
5988 : * Construct a RunningTransactions snapshot representing a
5989 : * shut down server, with only prepared transactions still
5990 : * alive. We're never overflowed at this point because all
5991 : * subxids are listed with their parent prepared transactions.
5992 : */
5993 52 : running.xcnt = nxids;
5994 52 : running.subxcnt = 0;
5995 52 : running.subxid_status = SUBXIDS_IN_SUBTRANS;
5996 52 : running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
5997 52 : running.oldestRunningXid = oldestActiveXID;
5998 52 : latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
5999 52 : TransactionIdRetreat(latestCompletedXid);
6000 : Assert(TransactionIdIsNormal(latestCompletedXid));
6001 52 : running.latestCompletedXid = latestCompletedXid;
6002 52 : running.xids = xids;
6003 :
6004 52 : ProcArrayApplyRecoveryInfo(&running);
6005 : }
6006 : }
6007 :
6008 : /*
6009 : * We're all set for replaying the WAL now. Do it.
6010 : */
6011 426 : PerformWalRecovery();
6012 310 : performedWalRecovery = true;
6013 : }
6014 : else
6015 1416 : performedWalRecovery = false;
6016 :
6017 : /*
6018 : * Finish WAL recovery.
6019 : */
6020 1726 : endOfRecoveryInfo = FinishWalRecovery();
6021 1726 : EndOfLog = endOfRecoveryInfo->endOfLog;
6022 1726 : EndOfLogTLI = endOfRecoveryInfo->endOfLogTLI;
6023 1726 : abortedRecPtr = endOfRecoveryInfo->abortedRecPtr;
6024 1726 : missingContrecPtr = endOfRecoveryInfo->missingContrecPtr;
6025 :
6026 : /*
6027 : * Reset ps status display, so as no information related to recovery shows
6028 : * up.
6029 : */
6030 1726 : set_ps_display("");
6031 :
6032 : /*
6033 : * When recovering from a backup (we are in recovery, and archive recovery
6034 : * was requested), complain if we did not roll forward far enough to reach
6035 : * the point where the database is consistent. For regular online
6036 : * backup-from-primary, that means reaching the end-of-backup WAL record
6037 : * (at which point we reset backupStartPoint to be Invalid), for
6038 : * backup-from-replica (which can't inject records into the WAL stream),
6039 : * that point is when we reach the minRecoveryPoint in pg_control (which
6040 : * we purposefully copy last when backing up from a replica). For
6041 : * pg_rewind (which creates a backup_label with a method of "pg_rewind")
6042 : * or snapshot-style backups (which don't), backupEndRequired will be set
6043 : * to false.
6044 : *
6045 : * Note: it is indeed okay to look at the local variable
6046 : * LocalMinRecoveryPoint here, even though ControlFile->minRecoveryPoint
6047 : * might be further ahead --- ControlFile->minRecoveryPoint cannot have
6048 : * been advanced beyond the WAL we processed.
6049 : */
6050 1726 : if (InRecovery &&
6051 310 : (EndOfLog < LocalMinRecoveryPoint ||
6052 310 : !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
6053 : {
6054 : /*
6055 : * Ran off end of WAL before reaching end-of-backup WAL record, or
6056 : * minRecoveryPoint. That's a bad sign, indicating that you tried to
6057 : * recover from an online backup but never called pg_backup_stop(), or
6058 : * you didn't archive all the WAL needed.
6059 : */
6060 0 : if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
6061 : {
6062 0 : if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) || ControlFile->backupEndRequired)
6063 0 : ereport(FATAL,
6064 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6065 : errmsg("WAL ends before end of online backup"),
6066 : errhint("All WAL generated while online backup was taken must be available at recovery.")));
6067 : else
6068 0 : ereport(FATAL,
6069 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6070 : errmsg("WAL ends before consistent recovery point")));
6071 : }
6072 : }
6073 :
6074 : /*
6075 : * Reset unlogged relations to the contents of their INIT fork. This is
6076 : * done AFTER recovery is complete so as to include any unlogged relations
6077 : * created during recovery, but BEFORE recovery is marked as having
6078 : * completed successfully. Otherwise we'd not retry if any of the post
6079 : * end-of-recovery steps fail.
6080 : */
6081 1726 : if (InRecovery)
6082 310 : ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
6083 :
6084 : /*
6085 : * Pre-scan prepared transactions to find out the range of XIDs present.
6086 : * This information is not quite needed yet, but it is positioned here so
6087 : * as potential problems are detected before any on-disk change is done.
6088 : */
6089 1726 : oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
6090 :
6091 : /*
6092 : * Allow ordinary WAL segment creation before possibly switching to a new
6093 : * timeline, which creates a new segment, and after the last ReadRecord().
6094 : */
6095 1726 : SetInstallXLogFileSegmentActive();
6096 :
6097 : /*
6098 : * Consider whether we need to assign a new timeline ID.
6099 : *
6100 : * If we did archive recovery, we always assign a new ID. This handles a
6101 : * couple of issues. If we stopped short of the end of WAL during
6102 : * recovery, then we are clearly generating a new timeline and must assign
6103 : * it a unique new ID. Even if we ran to the end, modifying the current
6104 : * last segment is problematic because it may result in trying to
6105 : * overwrite an already-archived copy of that segment, and we encourage
6106 : * DBAs to make their archive_commands reject that. We can dodge the
6107 : * problem by making the new active segment have a new timeline ID.
6108 : *
6109 : * In a normal crash recovery, we can just extend the timeline we were in.
6110 : */
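 : /*
 :  * Illustrative example (an annotation, not original source): if recovery
 :  * replayed along timeline 2 but archived history files already claim
 :  * timelines up to 4, findNewestTimeLine(recoveryTargetTLI) returns 4
 :  * below, and we select 5 as the new timeline ID.
 :  */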
6111 1726 : newTLI = endOfRecoveryInfo->lastRecTLI;
6112 1726 : if (ArchiveRecoveryRequested)
6113 : {
6114 98 : newTLI = findNewestTimeLine(recoveryTargetTLI) + 1;
6115 98 : ereport(LOG,
6116 : (errmsg("selected new timeline ID: %u", newTLI)));
6117 :
6118 : /*
6119 : * Make a writable copy of the last WAL segment. (Note that we also
6120 : * have a copy of the last block of the old WAL in
6121 : * endOfRecovery->lastPage; we will use that below.)
6122 : */
6123 98 : XLogInitNewTimeline(EndOfLogTLI, EndOfLog, newTLI);
6124 :
6125 : /*
6126 : * Remove the signal files out of the way, so that we don't
6127 : * accidentally re-enter archive recovery mode in a subsequent crash.
6128 : */
6129 98 : if (endOfRecoveryInfo->standby_signal_file_found)
6130 92 : durable_unlink(STANDBY_SIGNAL_FILE, FATAL);
6131 :
6132 98 : if (endOfRecoveryInfo->recovery_signal_file_found)
6133 6 : durable_unlink(RECOVERY_SIGNAL_FILE, FATAL);
6134 :
6135 : /*
6136 : * Write the timeline history file, and have it archived. After this
6137 : * point (or rather, as soon as the file is archived), the timeline
6138 : * will appear as "taken" in the WAL archive and to any standby
6139 : * servers. If we crash before actually switching to the new
6140 : * timeline, standby servers will nevertheless think that we switched
6141 : * to the new timeline, and will try to connect to it.
6142 : * To minimize the window for that, try to do as little as possible
6143 : * between here and writing the end-of-recovery record.
6144 : */
6145 98 : writeTimeLineHistory(newTLI, recoveryTargetTLI,
6146 : EndOfLog, endOfRecoveryInfo->recoveryStopReason);
6147 :
6148 98 : ereport(LOG,
6149 : (errmsg("archive recovery complete")));
6150 : }
6151 :
6152 : /* Save the selected TimeLineID in shared memory, too */
6153 1726 : SpinLockAcquire(&XLogCtl->info_lck);
6154 1726 : XLogCtl->InsertTimeLineID = newTLI;
6155 1726 : XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI;
6156 1726 : SpinLockRelease(&XLogCtl->info_lck);
6157 :
6158 : /*
6159 : * Actually, if WAL ended in an incomplete record, skip the parts that
6160 : * made it through and start writing after the portion that persisted.
6161 : * (It's critical to first write an OVERWRITE_CONTRECORD message, which
6162 : * we'll do as soon as we're open for writing new WAL.)
6163 : */
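 : /*
 :  * Annotation (a sketch of the state, not original source): abortedRecPtr
 :  * is the start LSN of the incomplete record, and missingContrecPtr is
 :  * where its first missing continuation would have gone.  We restart
 :  * writing at missingContrecPtr, and CreateOverwriteContrecordRecord()
 :  * below emits a marker telling replay to disregard the partial record.
 :  */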
6164 1726 : if (!XLogRecPtrIsInvalid(missingContrecPtr))
6165 : {
6166 : /*
6167 : * We should only have a missingContrecPtr if we're not switching to a
6168 : * new timeline. When a timeline switch occurs, WAL is copied from the
6169 : * old timeline to the new only up to the end of the last complete
6170 : * record, so there can't be an incomplete WAL record that we need to
6171 : * disregard.
6172 : */
6173 : Assert(newTLI == endOfRecoveryInfo->lastRecTLI);
6174 : Assert(!XLogRecPtrIsInvalid(abortedRecPtr));
6175 24 : EndOfLog = missingContrecPtr;
6176 : }
6177 :
6178 : /*
6179 : * Prepare to write WAL starting at EndOfLog location, and init xlog
6180 : * buffer cache using the block containing the last record from the
6181 : * previous incarnation.
6182 : */
6183 1726 : Insert = &XLogCtl->Insert;
6184 1726 : Insert->PrevBytePos = XLogRecPtrToBytePos(endOfRecoveryInfo->lastRec);
6185 1726 : Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
6186 :
6187 : /*
6188 : * Tricky point here: lastPage contains the *last* block that the LastRec
6189 : * record spans, not the one it starts in. The last block is indeed the
6190 : * one we want to use.
6191 : */
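 : /*
 :  * Worked example (an annotation, assuming the default XLOG_BLCKSZ of
 :  * 8192): if EndOfLog is 0/0301A2D0, lastPageBeginPtr is 0/0301A000, so
 :  * len is 0x2D0 (720) bytes; that much of the page is copied below and
 :  * the remaining 8192 - 720 bytes are zeroed.
 :  */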
6192 1726 : if (EndOfLog % XLOG_BLCKSZ != 0)
6193 : {
6194 : char *page;
6195 : int len;
6196 : int firstIdx;
6197 :
6198 1670 : firstIdx = XLogRecPtrToBufIdx(EndOfLog);
6199 1670 : len = EndOfLog - endOfRecoveryInfo->lastPageBeginPtr;
6200 : Assert(len < XLOG_BLCKSZ);
6201 :
6202 : /* Copy the valid part of the last block, and zero the rest */
6203 1670 : page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
6204 1670 : memcpy(page, endOfRecoveryInfo->lastPage, len);
6205 1670 : memset(page + len, 0, XLOG_BLCKSZ - len);
6206 :
6207 1670 : pg_atomic_write_u64(&XLogCtl->xlblocks[firstIdx], endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ);
6208 1670 : pg_atomic_write_u64(&XLogCtl->InitializedUpTo, endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ);
6209 1670 : XLogCtl->InitializedFrom = endOfRecoveryInfo->lastPageBeginPtr;
6210 : }
6211 : else
6212 : {
6213 : /*
6214 : * There is no partial block to copy. Just set InitializedUpTo, and
6215 : * let the first attempt to insert a log record initialize the next
6216 : * buffer.
6217 : */
6218 56 : pg_atomic_write_u64(&XLogCtl->InitializedUpTo, EndOfLog);
6219 56 : XLogCtl->InitializedFrom = EndOfLog;
6220 : }
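 : /*
 :  * Annotation: InitializeReserved tracks how far page initialization has
 :  * been claimed by inserters; nothing is in flight at this point, so it
 :  * simply matches InitializedUpTo.
 :  */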
6221 1726 : pg_atomic_write_u64(&XLogCtl->InitializeReserved, pg_atomic_read_u64(&XLogCtl->InitializedUpTo));
6222 :
6223 : /*
6224 : * Update local and shared status. This is OK to do without any locks
6225 : * because no other process can be reading or writing WAL yet.
6226 : */
6227 1726 : LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
6228 1726 : pg_atomic_write_u64(&XLogCtl->logInsertResult, EndOfLog);
6229 1726 : pg_atomic_write_u64(&XLogCtl->logWriteResult, EndOfLog);
6230 1726 : pg_atomic_write_u64(&XLogCtl->logFlushResult, EndOfLog);
6231 1726 : XLogCtl->LogwrtRqst.Write = EndOfLog;
6232 1726 : XLogCtl->LogwrtRqst.Flush = EndOfLog;
6233 :
6234 : /*
6235 : * Preallocate additional log files, if wanted.
6236 : */
6237 1726 : PreallocXlogFiles(EndOfLog, newTLI);
6238 :
6239 : /*
6240 : * Okay, we're officially UP.
6241 : */
6242 1726 : InRecovery = false;
6243 :
6244 : /* start the archive_timeout timer and LSN running */
6245 1726 : XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
6246 1726 : XLogCtl->lastSegSwitchLSN = EndOfLog;
6247 :
6248 : /* also initialize latestCompletedXid, to nextXid - 1 */
6249 1726 : LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
6250 1726 : TransamVariables->latestCompletedXid = TransamVariables->nextXid;
6251 1726 : FullTransactionIdRetreat(&TransamVariables->latestCompletedXid);
6252 1726 : LWLockRelease(ProcArrayLock);
6253 :
6254 : /*
6255 : * Start up subtrans, if not already done for hot standby. (commit
6256 : * timestamps are started below, if necessary.)
6257 : */
6258 1726 : if (standbyState == STANDBY_DISABLED)
6259 1628 : StartupSUBTRANS(oldestActiveXID);
6260 :
6261 : /*
6262 : * Perform end of recovery actions for any SLRUs that need it.
6263 : */
6264 1726 : TrimCLOG();
6265 1726 : TrimMultiXact();
6266 :
6267 : /*
6268 : * Reload shared-memory state for prepared transactions. This needs to
6269 : * happen before renaming the last partial segment of the old timeline,
6270 : * since we may need to recover some transactions from it.
6271 : */
6272 1726 : RecoverPreparedTransactions();
6273 :
6274 : /* Shut down xlogreader */
6275 1726 : ShutdownWalRecovery();
6276 :
6277 : /* Enable WAL writes for this backend only. */
6278 1726 : LocalSetXLogInsertAllowed();
6279 :
6280 : /* If necessary, write overwrite-contrecord before doing anything else */
6281 1726 : if (!XLogRecPtrIsInvalid(abortedRecPtr))
6282 : {
6283 : Assert(!XLogRecPtrIsInvalid(missingContrecPtr));
6284 24 : CreateOverwriteContrecordRecord(abortedRecPtr, missingContrecPtr, newTLI);
6285 : }
6286 :
6287 : /*
6288 : * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
6289 : * record before resource manager writes cleanup WAL records or checkpoint
6290 : * record is written.
6291 : */
6292 1726 : Insert->fullPageWrites = lastFullPageWrites;
6293 1726 : UpdateFullPageWrites();
6294 :
6295 : /*
6296 : * Emit checkpoint or end-of-recovery record in XLOG, if required.
6297 : */
6298 1726 : if (performedWalRecovery)
6299 310 : promoted = PerformRecoveryXLogAction();
6300 :
6301 : /*
6302 : * If any of the critical GUCs have changed, log them before we allow
6303 : * backends to write WAL.
6304 : */
6305 1726 : XLogReportParameters();
6306 :
6307 : /* If this is archive recovery, perform post-recovery cleanup actions. */
6308 1726 : if (ArchiveRecoveryRequested)
6309 98 : CleanupAfterArchiveRecovery(EndOfLogTLI, EndOfLog, newTLI);
6310 :
6311 : /*
6312 : * Local WAL inserts enabled, so it's time to finish initialization of
6313 : * commit timestamp.
6314 : */
6315 1726 : CompleteCommitTsInitialization();
6316 :
6317 : /* Clean up EndOfWalRecoveryInfo data to appease Valgrind leak checking */
6318 1726 : if (endOfRecoveryInfo->lastPage)
6319 1694 : pfree(endOfRecoveryInfo->lastPage);
6320 1726 : pfree(endOfRecoveryInfo->recoveryStopReason);
6321 1726 : pfree(endOfRecoveryInfo);
6322 :
6323 : /*
6324 : * All done with end-of-recovery actions.
6325 : *
6326 : * Now allow backends to write WAL and update the control file status
6327 : * accordingly.  SharedRecoveryState, which controls whether backends can
6328 : * write WAL, is updated while holding ControlFileLock so that other
6329 : * backends cannot see an inconsistent state of the control file in
6330 : * shared memory.  There is still a small window during which backends
6331 : * can write WAL while the on-disk control file still describes a system
6332 : * not yet in DB_IN_PRODUCTION state.
6333 : *
6334 : * Also, we use info_lck to update SharedRecoveryState to ensure that
6335 : * there are no race conditions concerning visibility of other recent
6336 : * updates to shared memory.
6337 : */
6338 1726 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6339 1726 : ControlFile->state = DB_IN_PRODUCTION;
6340 :
6341 1726 : SpinLockAcquire(&XLogCtl->info_lck);
6342 1726 : XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
6343 1726 : SpinLockRelease(&XLogCtl->info_lck);
6344 :
6345 1726 : UpdateControlFile();
6346 1726 : LWLockRelease(ControlFileLock);
6347 :
6348 : /*
6349 : * Shut down the recovery environment.  This must occur after
6350 : * RecoverPreparedTransactions() (see notes in lock_twophase_recover())
6351 : * and after switching SharedRecoveryState to RECOVERY_STATE_DONE, so
6352 : * that any session building a snapshot will not rely on
6353 : * KnownAssignedXids, as RecoveryInProgress() would return false at this
6354 : * stage.  This is particularly critical for prepared 2PC transactions,
6355 : * which would still need to be included in snapshots once recovery ends.
6356 : */
6357 1726 : if (standbyState != STANDBY_DISABLED)
6358 98 : ShutdownRecoveryTransactionEnvironment();
6359 :
6360 : /*
6361 : * If there were cascading standby servers connected to us, nudge any wal
6362 : * sender processes to notice that we've been promoted.
6363 : */
6364 1726 : WalSndWakeup(true, true);
6365 :
6366 : /*
6367 : * If this was a promotion, request an (online) checkpoint now. This isn't
6368 : * required for consistency, but the last restartpoint might be far back,
6369 : * and in case of a crash, recovering from it might take longer than is
6370 : * appropriate now that we're not in standby mode anymore.
6371 : */
6372 1726 : if (promoted)
6373 84 : RequestCheckpoint(CHECKPOINT_FORCE);
6374 1726 : }
6375 :
6376 : /*
6377 : * Callback from PerformWalRecovery(), called when we switch from crash
6378 : * recovery to archive recovery mode. Updates the control file accordingly.
6379 : */
6380 : void
6381 4 : SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI)
6382 : {
6383 : /* initialize minRecoveryPoint to this record */
6384 4 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6385 4 : ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6386 4 : if (ControlFile->minRecoveryPoint < EndRecPtr)
6387 : {
6388 4 : ControlFile->minRecoveryPoint = EndRecPtr;
6389 4 : ControlFile->minRecoveryPointTLI = replayTLI;
6390 : }
6391 : /* update local copy */
6392 4 : LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
6393 4 : LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
6394 :
6395 : /*
6396 : * The startup process can update its local copy of minRecoveryPoint from
6397 : * this point.
6398 : */
6399 4 : updateMinRecoveryPoint = true;
6400 :
6401 4 : UpdateControlFile();
6402 :
6403 : /*
6404 : * We update SharedRecoveryState while holding the lock on ControlFileLock
6405 : * so both states are consistent in shared memory.
6406 : */
6407 4 : SpinLockAcquire(&XLogCtl->info_lck);
6408 4 : XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
6409 4 : SpinLockRelease(&XLogCtl->info_lck);
6410 :
6411 4 : LWLockRelease(ControlFileLock);
6412 4 : }
6413 :
6414 : /*
6415 : * Callback from PerformWalRecovery(), called when we reach the end of backup.
6416 : * Updates the control file accordingly.
6417 : */
6418 : void
6419 142 : ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli)
6420 : {
6421 : /*
6422 : * We have reached the end of base backup, as indicated by pg_control. The
6423 : * data on disk is now consistent (unless minRecoveryPoint is further
6424 : * ahead, which can happen if we crashed during previous recovery). Reset
6425 : * backupStartPoint and backupEndPoint, and update minRecoveryPoint to
6426 : * make sure we don't allow starting up at an earlier point even if
6427 : * recovery is stopped and restarted soon after this.
6428 : */
6429 142 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6430 :
6431 142 : if (ControlFile->minRecoveryPoint < EndRecPtr)
6432 : {
6433 134 : ControlFile->minRecoveryPoint = EndRecPtr;
6434 134 : ControlFile->minRecoveryPointTLI = tli;
6435 : }
6436 :
6437 142 : ControlFile->backupStartPoint = InvalidXLogRecPtr;
6438 142 : ControlFile->backupEndPoint = InvalidXLogRecPtr;
6439 142 : ControlFile->backupEndRequired = false;
6440 142 : UpdateControlFile();
6441 :
6442 142 : LWLockRelease(ControlFileLock);
6443 142 : }
6444 :
6445 : /*
6446 : * Perform whatever XLOG actions are necessary at end of REDO.
6447 : *
6448 : * The goal here is to make sure that we'll be able to recover properly if
6449 : * we crash again. If we choose to write a checkpoint, we'll write a shutdown
6450 : * checkpoint rather than an on-line one. This is not particularly critical,
6451 : * but since we may be assigning a new TLI, using a shutdown checkpoint allows
6452 : * us to have the rule that TLI only changes in shutdown checkpoints, which
6453 : * allows some extra error checking in xlog_redo.
6454 : */
6455 : static bool
6456 310 : PerformRecoveryXLogAction(void)
6457 : {
6458 310 : bool promoted = false;
6459 :
6460 : /*
6461 : * Perform a checkpoint to update all our recovery activity to disk.
6462 : *
6463 : * Note that we write a shutdown checkpoint rather than an on-line one.
6464 : * This is not particularly critical, but since we may be assigning a new
6465 : * TLI, using a shutdown checkpoint allows us to have the rule that TLI
6466 : * only changes in shutdown checkpoints, which allows some extra error
6467 : * checking in xlog_redo.
6468 : *
6469 : * In promotion, only create a lightweight end-of-recovery record instead
6470 : * of a full checkpoint. A checkpoint is requested later, after we're
6471 : * fully out of recovery mode and already accepting queries.
6472 : */
6473 408 : if (ArchiveRecoveryRequested && IsUnderPostmaster &&
6474 98 : PromoteIsTriggered())
6475 : {
6476 84 : promoted = true;
6477 :
6478 : /*
6479 : * Insert a special WAL record to mark the end of recovery, since we
6480 : * aren't doing a checkpoint. That means that the checkpointer process
6481 : * may well be in the middle of a time-smoothed restartpoint and
6482 : * could continue to be for minutes after this. That sounds strange,
6483 : * but the effect is roughly the same and it would be stranger to try
6484 : * to come out of the restartpoint and then checkpoint. We request a
6485 : * checkpoint later anyway, just for safety.
6486 : */
6487 84 : CreateEndOfRecoveryRecord();
6488 : }
6489 : else
6490 : {
6491 226 : RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
6492 : CHECKPOINT_FAST |
6493 : CHECKPOINT_WAIT);
6494 : }
6495 :
6496 310 : return promoted;
6497 : }
6498 :
6499 : /*
6500 : * Is the system still in recovery?
6501 : *
6502 : * Unlike testing InRecovery, this works in any process that's connected to
6503 : * shared memory.
6504 : */
6505 : bool
6506 175702256 : RecoveryInProgress(void)
6507 : {
6508 : /*
6509 : * We check shared state each time only until we leave recovery mode. We
6510 : * can't re-enter recovery, so there's no need to keep checking after the
6511 : * shared variable has once been seen false.
6512 : */
6513 175702256 : if (!LocalRecoveryInProgress)
6514 171220010 : return false;
6515 : else
6516 : {
6517 : /*
6518 : * use volatile pointer to make sure we make a fresh read of the
6519 : * shared variable.
6520 : */
6521 4482246 : volatile XLogCtlData *xlogctl = XLogCtl;
6522 :
6523 4482246 : LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);
6524 :
6525 : /*
6526 : * Note: We don't need a memory barrier when we're still in recovery.
6527 : * We might exit recovery immediately after return, so the caller
6528 : * can't rely on 'true' meaning that we're still in recovery anyway.
6529 : */
6530 :
6531 4482246 : return LocalRecoveryInProgress;
6532 : }
6533 : }
6534 :
6535 : /*
6536 : * Returns current recovery state from shared memory.
6537 : *
6538 : * This returned state is kept consistent with the contents of the control
6539 : * file. See details about the possible values of RecoveryState in xlog.h.
6540 : */
6541 : RecoveryState
6542 36538 : GetRecoveryState(void)
6543 : {
6544 : RecoveryState retval;
6545 :
6546 36538 : SpinLockAcquire(&XLogCtl->info_lck);
6547 36538 : retval = XLogCtl->SharedRecoveryState;
6548 36538 : SpinLockRelease(&XLogCtl->info_lck);
6549 :
6550 36538 : return retval;
6551 : }
6552 :
6553 : /*
6554 : * Is this process allowed to insert new WAL records?
6555 : *
6556 : * Ordinarily this is essentially equivalent to !RecoveryInProgress().
6557 : * But we also have provisions for forcing the result "true" or "false"
6558 : * within specific processes regardless of the global state.
6559 : */
6560 : bool
6561 60752422 : XLogInsertAllowed(void)
6562 : {
6563 : /*
6564 : * If value is "unconditionally true" or "unconditionally false", just
6565 : * return it. This provides the normal fast path once recovery is known
6566 : * done.
6567 : */
6568 60752422 : if (LocalXLogInsertAllowed >= 0)
6569 60528472 : return (bool) LocalXLogInsertAllowed;
6570 :
6571 : /*
6572 : * Else, must check to see if we're still in recovery.
6573 : */
6574 223950 : if (RecoveryInProgress())
6575 207334 : return false;
6576 :
6577 : /*
6578 : * On exit from recovery, reset to "unconditionally true", since there is
6579 : * no need to keep checking.
6580 : */
6581 16616 : LocalXLogInsertAllowed = 1;
6582 16616 : return true;
6583 : }
6584 :
6585 : /*
6586 : * Make XLogInsertAllowed() return true in the current process only.
6587 : *
6588 : * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
6589 : * and even call LocalSetXLogInsertAllowed() again after that.
6590 : *
6591 : * Returns the previous value of LocalXLogInsertAllowed.
6592 : */
6593 : static int
6594 1784 : LocalSetXLogInsertAllowed(void)
6595 : {
6596 1784 : int oldXLogAllowed = LocalXLogInsertAllowed;
6597 :
6598 1784 : LocalXLogInsertAllowed = 1;
6599 :
6600 1784 : return oldXLogAllowed;
6601 : }
6602 :
6603 : /*
6604 : * Return the current Redo pointer from shared memory.
6605 : *
6606 : * As a side-effect, the local RedoRecPtr copy is updated.
6607 : */
6608 : XLogRecPtr
6609 587678 : GetRedoRecPtr(void)
6610 : {
6611 : XLogRecPtr ptr;
6612 :
6613 : /*
6614 : * The possibly not up-to-date copy in XlogCtl is enough. Even if we
6615 : * grabbed a WAL insertion lock to read the authoritative value in
6616 : * Insert->RedoRecPtr, someone might update it just after we've released
6617 : * the lock.
6618 : */
6619 587678 : SpinLockAcquire(&XLogCtl->info_lck);
6620 587678 : ptr = XLogCtl->RedoRecPtr;
6621 587678 : SpinLockRelease(&XLogCtl->info_lck);
6622 :
6623 587678 : if (RedoRecPtr < ptr)
6624 2946 : RedoRecPtr = ptr;
6625 :
6626 587678 : return RedoRecPtr;
6627 : }
6628 :
6629 : /*
6630 : * Return information needed to decide whether a modified block needs a
6631 : * full-page image to be included in the WAL record.
6632 : *
6633 : * The returned values are cached copies from backend-private memory, and
6634 : * possibly out-of-date or, indeed, uninitialized, in which case they will
6635 : * be InvalidXLogRecPtr and false, respectively. XLogInsertRecord will
6636 : * re-check them against up-to-date values, while holding the WAL insert lock.
6637 : */
6638 : void
6639 29364456 : GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
6640 : {
6641 29364456 : *RedoRecPtr_p = RedoRecPtr;
6642 29364456 : *doPageWrites_p = doPageWrites;
6643 29364456 : }
6644 :
6645 : /*
6646 : * GetInsertRecPtr -- Returns the current insert position.
6647 : *
6648 : * NOTE: The value *actually* returned is the position of the last full
6649 : * xlog page. It lags behind the real insert position by at most 1 page.
6650 : * This way, we don't need to scan through the WAL insertion locks, and an
6651 : * approximation is enough for the current usage of this function.
6652 : */
6653 : XLogRecPtr
6654 14054 : GetInsertRecPtr(void)
6655 : {
6656 : XLogRecPtr recptr;
6657 :
6658 14054 : SpinLockAcquire(&XLogCtl->info_lck);
6659 14054 : recptr = XLogCtl->LogwrtRqst.Write;
6660 14054 : SpinLockRelease(&XLogCtl->info_lck);
6661 :
6662 14054 : return recptr;
6663 : }
6664 :
6665 : /*
6666 : * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
6667 : * position known to be fsync'd to disk. This should only be used on a
6668 : * system that is known not to be in recovery.
6669 : */
6670 : XLogRecPtr
6671 434640 : GetFlushRecPtr(TimeLineID *insertTLI)
6672 : {
6673 : Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE);
6674 :
6675 434640 : RefreshXLogWriteResult(LogwrtResult);
6676 :
6677 : /*
6678 : * If we're writing and flushing WAL, the time line can't be changing, so
6679 : * no lock is required.
6680 : */
6681 434640 : if (insertTLI)
6682 51464 : *insertTLI = XLogCtl->InsertTimeLineID;
6683 :
6684 434640 : return LogwrtResult.Flush;
6685 : }
6686 :
6687 : /*
6688 : * GetWALInsertionTimeLine -- Returns the current timeline of a system that
6689 : * is not in recovery.
6690 : */
6691 : TimeLineID
6692 227104 : GetWALInsertionTimeLine(void)
6693 : {
6694 : Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE);
6695 :
6696 : /* Since the value can't be changing, no lock is required. */
6697 227104 : return XLogCtl->InsertTimeLineID;
6698 : }
6699 :
6700 : /*
6701 : * GetWALInsertionTimeLineIfSet -- If the system is not in recovery, returns
6702 : * the WAL insertion timeline; else, returns 0. Wherever possible, use
6703 : * GetWALInsertionTimeLine() instead, since it's cheaper. Note that this
6704 : * function decides recovery has ended as soon as the insert TLI is set, which
6705 : * happens before we set XLogCtl->SharedRecoveryState to RECOVERY_STATE_DONE.
6706 : */
6707 : TimeLineID
6708 0 : GetWALInsertionTimeLineIfSet(void)
6709 : {
6710 : TimeLineID insertTLI;
6711 :
6712 0 : SpinLockAcquire(&XLogCtl->info_lck);
6713 0 : insertTLI = XLogCtl->InsertTimeLineID;
6714 0 : SpinLockRelease(&XLogCtl->info_lck);
6715 :
6716 0 : return insertTLI;
6717 : }
6718 :
6719 : /*
6720 : * GetLastImportantRecPtr -- Returns the LSN of the last important record
6721 : * inserted. All records not explicitly marked as unimportant are considered
6722 : * important.
6723 : *
6724 : * The LSN is determined by computing the maximum of
6725 : * WALInsertLocks[i].lastImportantAt.
6726 : */
6727 : XLogRecPtr
6728 3038 : GetLastImportantRecPtr(void)
6729 : {
6730 3038 : XLogRecPtr res = InvalidXLogRecPtr;
6731 : int i;
6732 :
6733 27342 : for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
6734 : {
6735 : XLogRecPtr last_important;
6736 :
6737 : /*
6738 : * Need to take a lock to prevent torn reads of the LSN, which are
6739 : * possible on some of the supported platforms. WAL insert locks only
6740 : * support exclusive mode, so we have to use that.
6741 : */
6742 24304 : LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
6743 24304 : last_important = WALInsertLocks[i].l.lastImportantAt;
6744 24304 : LWLockRelease(&WALInsertLocks[i].l.lock);
6745 :
6746 24304 : if (res < last_important)
6747 5212 : res = last_important;
6748 : }
6749 :
6750 3038 : return res;
6751 : }
6752 :
6753 : /*
6754 : * Get the time and LSN of the last xlog segment switch
6755 : */
6756 : pg_time_t
6757 0 : GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
6758 : {
6759 : pg_time_t result;
6760 :
6761 : /* Need WALWriteLock, but shared lock is sufficient */
6762 0 : LWLockAcquire(WALWriteLock, LW_SHARED);
6763 0 : result = XLogCtl->lastSegSwitchTime;
6764 0 : *lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
6765 0 : LWLockRelease(WALWriteLock);
6766 :
6767 0 : return result;
6768 : }
6769 :
6770 : /*
6771 : * This must be called ONCE during postmaster or standalone-backend shutdown
6772 : */
6773 : void
6774 1248 : ShutdownXLOG(int code, Datum arg)
6775 : {
6776 : /*
6777 : * We should have an aux process resource owner to use, and we should not
6778 : * be in a transaction that's installed some other resowner.
6779 : */
6780 : Assert(AuxProcessResourceOwner != NULL);
6781 : Assert(CurrentResourceOwner == NULL ||
6782 : CurrentResourceOwner == AuxProcessResourceOwner);
6783 1248 : CurrentResourceOwner = AuxProcessResourceOwner;
6784 :
6785 : /* Don't be chatty in standalone mode */
6786 1248 : ereport(IsPostmasterEnvironment ? LOG : NOTICE,
6787 : (errmsg("shutting down")));
6788 :
6789 : /*
6790 : * Signal walsenders to move to stopping state.
6791 : */
6792 1248 : WalSndInitStopping();
6793 :
6794 : /*
6795 : * Wait for WAL senders to be in stopping state. This prevents commands
6796 : * from writing new WAL.
6797 : */
6798 1248 : WalSndWaitStopping();
6799 :
6800 1248 : if (RecoveryInProgress())
6801 110 : CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FAST);
6802 : else
6803 : {
6804 : /*
6805 : * If archiving is enabled, rotate the last XLOG file so that all the
6806 : * remaining records are archived (postmaster wakes up the archiver
6807 : * process one more time at the end of shutdown). The checkpoint
6808 : * record will go to the next XLOG file and won't be archived (yet).
6809 : */
6810 1138 : if (XLogArchivingActive())
6811 28 : RequestXLogSwitch(false);
6812 :
6813 1138 : CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FAST);
6814 : }
6815 1248 : }
6816 :
6817 : /*
6818 : * Log start of a checkpoint.
6819 : */
6820 : static void
6821 2778 : LogCheckpointStart(int flags, bool restartpoint)
6822 : {
6823 2778 : if (restartpoint)
6824 378 : ereport(LOG,
6825 : /* translator: the placeholders show checkpoint options */
6826 : (errmsg("restartpoint starting:%s%s%s%s%s%s%s%s",
6827 : (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
6828 : (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
6829 : (flags & CHECKPOINT_FAST) ? " fast" : "",
6830 : (flags & CHECKPOINT_FORCE) ? " force" : "",
6831 : (flags & CHECKPOINT_WAIT) ? " wait" : "",
6832 : (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
6833 : (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
6834 : (flags & CHECKPOINT_FLUSH_UNLOGGED) ? " flush-unlogged" : "")));
6835 : else
6836 2400 : ereport(LOG,
6837 : /* translator: the placeholders show checkpoint options */
6838 : (errmsg("checkpoint starting:%s%s%s%s%s%s%s%s",
6839 : (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
6840 : (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
6841 : (flags & CHECKPOINT_FAST) ? " fast" : "",
6842 : (flags & CHECKPOINT_FORCE) ? " force" : "",
6843 : (flags & CHECKPOINT_WAIT) ? " wait" : "",
6844 : (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
6845 : (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
6846 : (flags & CHECKPOINT_FLUSH_UNLOGGED) ? " flush-unlogged" : "")));
6847 2778 : }
6848 :
6849 : /*
6850 : * Log end of a checkpoint.
6851 : */
6852 : static void
6853 3362 : LogCheckpointEnd(bool restartpoint)
6854 : {
6855 : long write_msecs,
6856 : sync_msecs,
6857 : total_msecs,
6858 : longest_msecs,
6859 : average_msecs;
6860 : uint64 average_sync_time;
6861 :
6862 3362 : CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
6863 :
6864 3362 : write_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_write_t,
6865 : CheckpointStats.ckpt_sync_t);
6866 :
6867 3362 : sync_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_sync_t,
6868 : CheckpointStats.ckpt_sync_end_t);
6869 :
6870 : /* Accumulate checkpoint timing summary data, in milliseconds. */
6871 3362 : PendingCheckpointerStats.write_time += write_msecs;
6872 3362 : PendingCheckpointerStats.sync_time += sync_msecs;
6873 :
6874 : /*
6875 : * All of the published timing statistics are accounted for. Only
6876 : * continue if a log message is to be written.
6877 : */
6878 3362 : if (!log_checkpoints)
6879 584 : return;
6880 :
6881 2778 : total_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_start_t,
6882 : CheckpointStats.ckpt_end_t);
6883 :
6884 : /*
6885 : * Timing values returned from CheckpointStats are in microseconds.
6886 : * Convert to milliseconds for consistent printing.
6887 : */
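 : /*
 :  * Annotation: the "+ 999" rounds up, e.g. 1500 us becomes 2 ms, and a
 :  * 1 us sync still prints as 1 ms rather than 0.
 :  */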
6888 2778 : longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000);
6889 :
6890 2778 : average_sync_time = 0;
6891 2778 : if (CheckpointStats.ckpt_sync_rels > 0)
6892 0 : average_sync_time = CheckpointStats.ckpt_agg_sync_time /
6893 0 : CheckpointStats.ckpt_sync_rels;
6894 2778 : average_msecs = (long) ((average_sync_time + 999) / 1000);
6895 :
6896 : /*
6897 : * ControlFileLock is not required to see ControlFile->checkPoint and
6898 : * ->checkPointCopy here as we are the only updater of those variables at
6899 : * this moment.
6900 : */
6901 2778 : if (restartpoint)
6902 378 : ereport(LOG,
6903 : (errmsg("restartpoint complete: wrote %d buffers (%.1f%%), "
6904 : "wrote %d SLRU buffers; %d WAL file(s) added, "
6905 : "%d removed, %d recycled; write=%ld.%03d s, "
6906 : "sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, "
6907 : "longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, "
6908 : "estimate=%d kB; lsn=%X/%08X, redo lsn=%X/%08X",
6909 : CheckpointStats.ckpt_bufs_written,
6910 : (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
6911 : CheckpointStats.ckpt_slru_written,
6912 : CheckpointStats.ckpt_segs_added,
6913 : CheckpointStats.ckpt_segs_removed,
6914 : CheckpointStats.ckpt_segs_recycled,
6915 : write_msecs / 1000, (int) (write_msecs % 1000),
6916 : sync_msecs / 1000, (int) (sync_msecs % 1000),
6917 : total_msecs / 1000, (int) (total_msecs % 1000),
6918 : CheckpointStats.ckpt_sync_rels,
6919 : longest_msecs / 1000, (int) (longest_msecs % 1000),
6920 : average_msecs / 1000, (int) (average_msecs % 1000),
6921 : (int) (PrevCheckPointDistance / 1024.0),
6922 : (int) (CheckPointDistanceEstimate / 1024.0),
6923 : LSN_FORMAT_ARGS(ControlFile->checkPoint),
6924 : LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo))));
6925 : else
6926 2400 : ereport(LOG,
6927 : (errmsg("checkpoint complete: wrote %d buffers (%.1f%%), "
6928 : "wrote %d SLRU buffers; %d WAL file(s) added, "
6929 : "%d removed, %d recycled; write=%ld.%03d s, "
6930 : "sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, "
6931 : "longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, "
6932 : "estimate=%d kB; lsn=%X/%08X, redo lsn=%X/%08X",
6933 : CheckpointStats.ckpt_bufs_written,
6934 : (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
6935 : CheckpointStats.ckpt_slru_written,
6936 : CheckpointStats.ckpt_segs_added,
6937 : CheckpointStats.ckpt_segs_removed,
6938 : CheckpointStats.ckpt_segs_recycled,
6939 : write_msecs / 1000, (int) (write_msecs % 1000),
6940 : sync_msecs / 1000, (int) (sync_msecs % 1000),
6941 : total_msecs / 1000, (int) (total_msecs % 1000),
6942 : CheckpointStats.ckpt_sync_rels,
6943 : longest_msecs / 1000, (int) (longest_msecs % 1000),
6944 : average_msecs / 1000, (int) (average_msecs % 1000),
6945 : (int) (PrevCheckPointDistance / 1024.0),
6946 : (int) (CheckPointDistanceEstimate / 1024.0),
6947 : LSN_FORMAT_ARGS(ControlFile->checkPoint),
6948 : LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo))));
6949 : }
6950 :
6951 : /*
6952 : * Update the estimate of distance between checkpoints.
6953 : *
6954 : * The estimate is used to calculate the number of WAL segments to keep
6955 : * preallocated, see XLOGfileslop().
6956 : */
6957 : static void
6958 3362 : UpdateCheckPointDistanceEstimate(uint64 nbytes)
6959 : {
6960 : /*
6961 : * To estimate the number of segments consumed between checkpoints, keep a
6962 : * moving average of the amount of WAL generated in previous checkpoint
6963 : * cycles. However, if the load is bursty, with quiet periods and busy
6964 : * periods, we want to cater for the peak load. So instead of a plain
6965 : * moving average, let the average decline slowly if the previous cycle
6966 : * used less WAL than estimated, but bump it up immediately if it used
6967 : * more.
6968 : *
6969 : * When checkpoints are triggered by max_wal_size, this should converge to
6970 : * CheckpointSegments * wal_segment_size.
6971 : *
6972 : * Note: This doesn't pay any attention to what caused the checkpoint.
6973 : * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
6974 : * starting a base backup, are counted the same as those created
6975 : * automatically. The slow decline will largely mask them out, if they are
6976 : * not frequent. If they are frequent, it seems reasonable to count them
6977 : * in as any others; if you issue a manual checkpoint every 5 minutes and
6978 : * never let a timed checkpoint happen, it makes sense to base the
6979 : * preallocation on that 5 minute interval rather than whatever
6980 : * checkpoint_timeout is set to.
6981 : */
6982 3362 : PrevCheckPointDistance = nbytes;
6983 3362 : if (CheckPointDistanceEstimate < nbytes)
6984 1426 : CheckPointDistanceEstimate = nbytes;
6985 : else
6986 1936 : CheckPointDistanceEstimate =
6987 1936 : (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
6988 3362 : }
6989 :
6990 : /*
6991 : * Update the ps display for a process running a checkpoint. Note that
6992 : * this routine should not do any allocations so that it can be called
6993 : * from a critical section.
6994 : */
6995 : static void
6996 6724 : update_checkpoint_display(int flags, bool restartpoint, bool reset)
6997 : {
6998 : /*
6999 : * The status is reported only for end-of-recovery and shutdown
7000 : * checkpoints or shutdown restartpoints. Updating the ps display is
7001 : * useful in those situations as it may not be possible to rely on
7002 : * pg_stat_activity to see the status of the checkpointer or the startup
7003 : * process.
7004 : */
7005 6724 : if ((flags & (CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IS_SHUTDOWN)) == 0)
7006 4244 : return;
7007 :
7008 2480 : if (reset)
7009 1240 : set_ps_display("");
7010 : else
7011 : {
7012 : char activitymsg[128];
7013 :
7014 3720 : snprintf(activitymsg, sizeof(activitymsg), "performing %s%s%s",
7015 1240 : (flags & CHECKPOINT_END_OF_RECOVERY) ? "end-of-recovery " : "",
7016 1240 : (flags & CHECKPOINT_IS_SHUTDOWN) ? "shutdown " : "",
7017 : restartpoint ? "restartpoint" : "checkpoint");
7018 1240 : set_ps_display(activitymsg);
7019 : }
7020 : }
7021 :
7022 :
7023 : /*
7024 : * Perform a checkpoint --- either during shutdown, or on-the-fly
7025 : *
7026 : * flags is a bitwise OR of the following:
7027 : * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
7028 : * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
7029 : * CHECKPOINT_FAST: finish the checkpoint ASAP, ignoring
7030 : * checkpoint_completion_target parameter.
7031 : * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
7032 : * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
7033 : * CHECKPOINT_END_OF_RECOVERY).
7034 : * CHECKPOINT_FLUSH_UNLOGGED: also flush buffers of unlogged tables.
7035 : *
7036 : * Note: flags contains other bits, of interest here only for logging purposes.
7037 : * In particular note that this routine is synchronous and does not pay
7038 : * attention to CHECKPOINT_WAIT.
7039 : *
7040 : * If !shutdown then we are writing an online checkpoint. An XLOG_CHECKPOINT_REDO
7041 : * record is inserted into WAL at the logical location of the checkpoint, before
7042 : * anything is flushed to disk; when the checkpoint eventually completes, it is
7043 : * from this point that WAL replay will begin in the case of a recovery from
7044 : * this checkpoint. Once everything is written to disk, an
7045 : * XLOG_CHECKPOINT_ONLINE record is written to complete the checkpoint, and
7046 : * points back to the earlier XLOG_CHECKPOINT_REDO record. This mechanism allows
7047 : * other write-ahead log records to be written while the checkpoint is in
7048 : * progress, but we must be very careful about order of operations. This function
7049 : * may take many minutes to execute on a busy system.
7050 : *
7051 : * On the other hand, when shutdown is true, concurrent insertion into the
7052 : * write-ahead log is impossible, so there is no need for two separate records.
7053 : * In this case, we only insert an XLOG_CHECKPOINT_SHUTDOWN record, and it's
7054 : * both the record marking the completion of the checkpoint and the location
7055 : * from which WAL replay would begin if needed.
7056 : *
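 :  * A sketch of the online-checkpoint WAL layout (an annotation, not to
 :  * scale):
 :  *
 :  *   ...[XLOG_CHECKPOINT_REDO][other records ...][XLOG_CHECKPOINT_ONLINE]...
 :  *      ^ redo pointer: replay starts here       ^ completion record,
 :  *                                                 pointing back at REDO
 :  *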
7057 : * Returns true if a new checkpoint was performed, or false if it was skipped
7058 : * because the system was idle.
7059 : */
7060 : bool
7061 2984 : CreateCheckPoint(int flags)
7062 : {
7063 : bool shutdown;
7064 : CheckPoint checkPoint;
7065 : XLogRecPtr recptr;
7066 : XLogSegNo _logSegNo;
7067 2984 : XLogCtlInsert *Insert = &XLogCtl->Insert;
7068 : uint32 freespace;
7069 : XLogRecPtr PriorRedoPtr;
7070 : XLogRecPtr last_important_lsn;
7071 : VirtualTransactionId *vxids;
7072 : int nvxids;
7073 2984 : int oldXLogAllowed = 0;
7074 :
7075 : /*
7076 : * An end-of-recovery checkpoint is really a shutdown checkpoint, just
7077 : * issued at a different time.
7078 : */
7079 2984 : if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
7080 1196 : shutdown = true;
7081 : else
7082 1788 : shutdown = false;
7083 :
7084 : /* sanity check */
7085 2984 : if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
7086 0 : elog(ERROR, "can't create a checkpoint during recovery");
7087 :
7088 : /*
7089 : * Prepare to accumulate statistics.
7090 : *
7091 : * Note: because it is possible for log_checkpoints to change while a
7092 : * checkpoint proceeds, we always accumulate stats, even if
7093 : * log_checkpoints is currently off.
7094 : */
7095 32824 : MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
7096 2984 : CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
7097 :
7098 : /*
7099 : * Let smgr prepare for checkpoint; this has to happen outside the
7100 : * critical section and before we determine the REDO pointer. Note that
7101 : * smgr must not do anything that'd have to be undone if we decide no
7102 : * checkpoint is needed.
7103 : */
7104 2984 : SyncPreCheckpoint();
7105 :
7106 : /*
7107 : * Use a critical section to force system panic if we have trouble.
7108 : */
7109 2984 : START_CRIT_SECTION();
7110 :
7111 2984 : if (shutdown)
7112 : {
7113 1196 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7114 1196 : ControlFile->state = DB_SHUTDOWNING;
7115 1196 : UpdateControlFile();
7116 1196 : LWLockRelease(ControlFileLock);
7117 : }
7118 :
7119 : /* Begin filling in the checkpoint WAL record */
7120 35808 : MemSet(&checkPoint, 0, sizeof(checkPoint));
7121 2984 : checkPoint.time = (pg_time_t) time(NULL);
7122 :
7123 : /*
7124 : * For Hot Standby, derive the oldestActiveXid before we fix the redo
7125 : * pointer. This allows us to begin accumulating changes to assemble our
7126 : * starting snapshot of locks and transactions.
7127 : */
7128 2984 : if (!shutdown && XLogStandbyInfoActive())
7129 1696 : checkPoint.oldestActiveXid = GetOldestActiveTransactionId(false, true);
7130 : else
7131 1288 : checkPoint.oldestActiveXid = InvalidTransactionId;
7132 :
7133 : /*
7134 : * Get location of last important record before acquiring insert locks (as
7135 : * GetLastImportantRecPtr() also locks WAL locks).
7136 : */
7137 2984 : last_important_lsn = GetLastImportantRecPtr();
7138 :
7139 : /*
7140 : * If this isn't a shutdown or forced checkpoint, and if there has been no
7141 : * WAL activity requiring a checkpoint, skip it. The idea here is to
7142 : * avoid inserting duplicate checkpoints when the system is idle.
7143 : */
7144 2984 : if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
7145 : CHECKPOINT_FORCE)) == 0)
7146 : {
7147 370 : if (last_important_lsn == ControlFile->checkPoint)
7148 : {
7149 0 : END_CRIT_SECTION();
7150 0 : ereport(DEBUG1,
7151 : (errmsg_internal("checkpoint skipped because system is idle")));
7152 0 : return false;
7153 : }
7154 : }
7155 :
7156 : /*
7157 : * An end-of-recovery checkpoint is created before anyone is allowed to
7158 : * write WAL. To allow us to write the checkpoint record, temporarily
7159 : * enable XLogInsertAllowed.
7160 : */
7161 2984 : if (flags & CHECKPOINT_END_OF_RECOVERY)
7162 58 : oldXLogAllowed = LocalSetXLogInsertAllowed();
7163 :
7164 2984 : checkPoint.ThisTimeLineID = XLogCtl->InsertTimeLineID;
7165 2984 : if (flags & CHECKPOINT_END_OF_RECOVERY)
7166 58 : checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
7167 : else
7168 2926 : checkPoint.PrevTimeLineID = checkPoint.ThisTimeLineID;
7169 :
7170 : /*
7171 : * We must block concurrent insertions while examining insert state.
7172 : */
7173 2984 : WALInsertLockAcquireExclusive();
7174 :
7175 2984 : checkPoint.fullPageWrites = Insert->fullPageWrites;
7176 2984 : checkPoint.wal_level = wal_level;
7177 :
7178 2984 : if (shutdown)
7179 : {
7180 1196 : XLogRecPtr curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
7181 :
7182 : /*
7183 : * Compute new REDO record ptr = location of next XLOG record.
7184 : *
7185 : * Since this is a shutdown checkpoint, there can't be any concurrent
7186 : * WAL insertion.
7187 : */
7188 1196 : freespace = INSERT_FREESPACE(curInsert);
7189 1196 : if (freespace == 0)
7190 : {
7191 0 : if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
7192 0 : curInsert += SizeOfXLogLongPHD;
7193 : else
7194 0 : curInsert += SizeOfXLogShortPHD;
7195 : }
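 : /*
 :  * Annotation: when the insert position sits exactly on a page boundary,
 :  * the next record can only start after that page's header, hence the
 :  * adjustment above: a long header at a segment boundary, a short one
 :  * otherwise.
 :  */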
7196 1196 : checkPoint.redo = curInsert;
7197 :
7198 : /*
7199 : * Here we update the shared RedoRecPtr for future XLogInsert calls;
7200 : * this must be done while holding all the insertion locks.
7201 : *
7202 : * Note: if we fail to complete the checkpoint, RedoRecPtr will be
7203 : * left pointing past where it really needs to point. This is okay;
7204 : * the only consequence is that XLogInsert might back up whole buffers
7205 : * that it didn't really need to. We can't postpone advancing
7206 : * RedoRecPtr because XLogInserts that happen while we are dumping
7207 : * buffers must assume that their buffer changes are not included in
7208 : * the checkpoint.
7209 : */
7210 1196 : RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
7211 : }
7212 :
7213 : /*
7214 : * Now we can release the WAL insertion locks, allowing other xacts to
7215 : * proceed while we are flushing disk buffers.
7216 : */
7217 2984 : WALInsertLockRelease();
7218 :
7219 : /*
7220 : * If this is an online checkpoint, we have not yet determined the redo
7221 : * point. We do so now by inserting the special XLOG_CHECKPOINT_REDO
7222 : * record; the LSN at which it starts becomes the new redo pointer. We
7223 : * don't do this for a shutdown checkpoint, because in that case no WAL
7224 : * can be written between the redo point and the insertion of the
7225 : * checkpoint record itself, so the checkpoint record itself serves to
7226 : * mark the redo point.
7227 : */
7228 2984 : if (!shutdown)
7229 : {
7230 : /* Include WAL level in record for WAL summarizer's benefit. */
7231 1788 : XLogBeginInsert();
7232 1788 : XLogRegisterData(&wal_level, sizeof(wal_level));
7233 1788 : (void) XLogInsert(RM_XLOG_ID, XLOG_CHECKPOINT_REDO);
7234 :
7235 : /*
7236 : * XLogInsertRecord will have updated XLogCtl->Insert.RedoRecPtr in
7237 : * shared memory and RedoRecPtr in backend-local memory, but we need
7238 : * to copy that into the record that will be inserted when the
7239 : * checkpoint is complete.
7240 : */
7241 1788 : checkPoint.redo = RedoRecPtr;
7242 : }
7243 :
7244 : /* Update the info_lck-protected copy of RedoRecPtr as well */
7245 2984 : SpinLockAcquire(&XLogCtl->info_lck);
7246 2984 : XLogCtl->RedoRecPtr = checkPoint.redo;
7247 2984 : SpinLockRelease(&XLogCtl->info_lck);
7248 :
7249 : /*
7250 : * If enabled, log checkpoint start. We postpone this until now so as not
7251 : * to log anything if we decided to skip the checkpoint.
7252 : */
7253 2984 : if (log_checkpoints)
7254 2400 : LogCheckpointStart(flags, false);
7255 :
7256 : /* Update the process title */
7257 2984 : update_checkpoint_display(flags, false, false);
7258 :
7259 : TRACE_POSTGRESQL_CHECKPOINT_START(flags);
7260 :
7261 : /*
7262 : * Get the other info we need for the checkpoint record.
7263 : *
7264 : * We don't need to save oldestClogXid in the checkpoint, it only matters
7265 : * for the short period in which clog is being truncated, and if we crash
7266 : * during that we'll redo the clog truncation and fix up oldestClogXid
7267 : * there.
7268 : */
7269 2984 : LWLockAcquire(XidGenLock, LW_SHARED);
7270 2984 : checkPoint.nextXid = TransamVariables->nextXid;
7271 2984 : checkPoint.oldestXid = TransamVariables->oldestXid;
7272 2984 : checkPoint.oldestXidDB = TransamVariables->oldestXidDB;
7273 2984 : LWLockRelease(XidGenLock);
7274 :
7275 2984 : LWLockAcquire(CommitTsLock, LW_SHARED);
7276 2984 : checkPoint.oldestCommitTsXid = TransamVariables->oldestCommitTsXid;
7277 2984 : checkPoint.newestCommitTsXid = TransamVariables->newestCommitTsXid;
7278 2984 : LWLockRelease(CommitTsLock);
7279 :
7280 2984 : LWLockAcquire(OidGenLock, LW_SHARED);
7281 2984 : checkPoint.nextOid = TransamVariables->nextOid;
7282 2984 : if (!shutdown)
7283 1788 : checkPoint.nextOid += TransamVariables->oidCount;
7284 2984 : LWLockRelease(OidGenLock);
7285 :
7286 2984 : MultiXactGetCheckptMulti(shutdown,
7287 : &checkPoint.nextMulti,
7288 : &checkPoint.nextMultiOffset,
7289 : &checkPoint.oldestMulti,
7290 : &checkPoint.oldestMultiDB);
7291 :
7292 : /*
7293 : * Having constructed the checkpoint record, ensure all shmem disk buffers
7294 : * and commit-log buffers are flushed to disk.
7295 : *
7296 : * This I/O could fail for various reasons. If so, we will fail to
7297 : * complete the checkpoint, but there is no reason to force a system
7298 : * panic. Accordingly, exit critical section while doing it.
7299 : */
7300 2984 : END_CRIT_SECTION();
7301 :
7302 : /*
7303 : * In some cases there are groups of actions that must all occur on one
7304 : * side or the other of a checkpoint record. Before flushing the
7305 : * checkpoint record we must explicitly wait for any backend currently
7306 : * performing those groups of actions.
7307 : *
7308 : * One example is end of transaction, so we must wait for any transactions
7309 : * that are currently in commit critical sections. If an xact inserted
7310 : * its commit record into XLOG just before the REDO point, then a crash
7311 : * restart from the REDO point would not replay that record, which means
7312 : * that our flushing had better include the xact's update of pg_xact. So
7313 : * we wait till it's out of its commit critical section before proceeding.
7314 : * See notes in RecordTransactionCommit().
7315 : *
7316 : * Because we've already released the insertion locks, this test is a bit
7317 : * fuzzy: it is possible that we will wait for xacts we didn't really need
7318 : * to wait for. But the delay should be short and it seems better to make
7319 : * checkpoint take a bit longer than to hold off insertions longer than
7320 : * necessary. (In fact, the whole reason we have this issue is that xact.c
7321 : * does commit record XLOG insertion and clog update as two separate steps
7322 : * protected by different locks, but again that seems best on grounds of
7323 : * minimizing lock contention.)
7324 : *
7325 : * A transaction that has not yet set delayChkptFlags when we look cannot
7326 : * be at risk, since it has not inserted its commit record yet; and one
7327 : * that's already cleared it is not at risk either, since it's done fixing
7328 : * clog and we will correctly flush the update below. So we cannot miss
7329 : * any xacts we need to wait for.
7330 : */
7331 2984 : vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);
7332 2984 : if (nvxids > 0)
7333 : {
7334 : do
7335 : {
7336 : /*
7337 : * Keep absorbing fsync requests while we wait. There could even
7338 : * be a deadlock if we don't, if the process that prevents the
7339 : * checkpoint is trying to add a request to the queue.
7340 : */
7341 58 : AbsorbSyncRequests();
7342 :
7343 58 : pgstat_report_wait_start(WAIT_EVENT_CHECKPOINT_DELAY_START);
7344 58 : pg_usleep(10000L); /* wait for 10 msec */
7345 58 : pgstat_report_wait_end();
7346 58 : } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
7347 : DELAY_CHKPT_START));
7348 : }
7349 2984 : pfree(vxids);
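 :
 : /*
 :  * For reference, a condensed sketch (not the verbatim code) of the
 :  * backend side of this protocol, in the style of
 :  * RecordTransactionCommit() in xact.c: the commit-record insertion
 :  * and the matching pg_xact update are bracketed by DELAY_CHKPT_START,
 :  * and the wait above closes the window between the two steps.
 :  *
 :  *     START_CRIT_SECTION();
 :  *     MyProc->delayChkptFlags |= DELAY_CHKPT_START;
 :  *     XactLogCommitRecord(...);       -- commit record into WAL
 :  *     TransactionIdCommitTree(...);   -- matching pg_xact update
 :  *     MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
 :  *     END_CRIT_SECTION();
 :  */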
7350 :
7351 2984 : CheckPointGuts(checkPoint.redo, flags);
7352 :
7353 2984 : vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE);
7354 2984 : if (nvxids > 0)
7355 : {
7356 : do
7357 : {
7358 0 : AbsorbSyncRequests();
7359 :
7360 0 : pgstat_report_wait_start(WAIT_EVENT_CHECKPOINT_DELAY_COMPLETE);
7361 0 : pg_usleep(10000L); /* wait for 10 msec */
7362 0 : pgstat_report_wait_end();
7363 0 : } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
7364 : DELAY_CHKPT_COMPLETE));
7365 : }
7366 2984 : pfree(vxids);
7367 :
7368 : /*
7369 : * Take a snapshot of running transactions and write this to WAL. This
7370 : * allows us to reconstruct the state of running transactions during
7371 : * archive recovery, if required. Skip, if this info disabled.
7372 : *
7373 : * If we are shutting down, or Startup process is completing crash
7374 : * recovery we don't need to write running xact data.
7375 : */
7376 2984 : if (!shutdown && XLogStandbyInfoActive())
7377 1696 : LogStandbySnapshot();
7378 :
7379 2984 : START_CRIT_SECTION();
7380 :
7381 : /*
7382 : * Now insert the checkpoint record into XLOG.
7383 : */
7384 2984 : XLogBeginInsert();
7385 2984 : XLogRegisterData(&checkPoint, sizeof(checkPoint));
7386 2984 : recptr = XLogInsert(RM_XLOG_ID,
7387 : shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
7388 : XLOG_CHECKPOINT_ONLINE);
7389 :
7390 2984 : XLogFlush(recptr);
7391 :
7392 : /*
7393 : * We mustn't write any new WAL after a shutdown checkpoint, or it will be
7394 : * overwritten at next startup. No one should even try; this just allows
7395 : * sanity checking. In the case of an end-of-recovery checkpoint, we want
7396 : * to just temporarily disable writing until the system has exited
7397 : * recovery.
7398 : */
7399 2984 : if (shutdown)
7400 : {
7401 1196 : if (flags & CHECKPOINT_END_OF_RECOVERY)
7402 58 : LocalXLogInsertAllowed = oldXLogAllowed;
7403 : else
7404 1138 : LocalXLogInsertAllowed = 0; /* never again write WAL */
7405 : }
7406 :
7407 : /*
7408 : * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
7409 : * = end of actual checkpoint record.
7410 : */
7411 2984 : if (shutdown && checkPoint.redo != ProcLastRecPtr)
7412 0 : ereport(PANIC,
7413 : (errmsg("concurrent write-ahead log activity while database system is shutting down")));
7414 :
7415 : /*
7416 : * Remember the prior checkpoint's redo ptr for
7417 : * UpdateCheckPointDistanceEstimate()
7418 : */
7419 2984 : PriorRedoPtr = ControlFile->checkPointCopy.redo;
7420 :
7421 : /*
7422 : * Update the control file.
7423 : */
7424 2984 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7425 2984 : if (shutdown)
7426 1196 : ControlFile->state = DB_SHUTDOWNED;
7427 2984 : ControlFile->checkPoint = ProcLastRecPtr;
7428 2984 : ControlFile->checkPointCopy = checkPoint;
7429 : /* crash recovery should always recover to the end of WAL */
7430 2984 : ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
7431 2984 : ControlFile->minRecoveryPointTLI = 0;
7432 :
7433 : /*
7434 : * Persist unloggedLSN value. It's reset on crash recovery, so this goes
7435 : * unused on non-shutdown checkpoints, but it seems useful to always
7436 : * store it for debugging purposes.
7437 : */
7438 2984 : ControlFile->unloggedLSN = pg_atomic_read_membarrier_u64(&XLogCtl->unloggedLSN);
7439 :
7440 2984 : UpdateControlFile();
7441 2984 : LWLockRelease(ControlFileLock);
7442 :
7443 : /*
7444 : * We are now done with critical updates; no need for system panic if we
7445 : * have trouble while fooling with old log segments.
7446 : */
7447 2984 : END_CRIT_SECTION();
7448 :
7449 : /*
7450 : * WAL summaries end when the next XLOG_CHECKPOINT_REDO or
7451 : * XLOG_CHECKPOINT_SHUTDOWN record is reached. This is the first point
7452 : * where (a) we're not inside of a critical section and (b) we can be
7453 : * certain that the relevant record has been flushed to disk, which must
7454 : * happen before it can be summarized.
7455 : *
7456 : * If this is a shutdown checkpoint, then this happens reasonably
7457 : * promptly: we've only just inserted and flushed the
7458 : * XLOG_CHECKPOINT_SHUTDOWN record. If this is not a shutdown checkpoint,
7459 : * then this might not be very prompt at all: the XLOG_CHECKPOINT_REDO
7460 : * record was written before we began flushing data to disk, and that
7461 : * could be many minutes ago at this point. However, we don't XLogFlush()
7462 : * after inserting that record, so we're not guaranteed that it's on disk
7463 : * until after the above call that flushes the XLOG_CHECKPOINT_ONLINE
7464 : * record.
7465 : */
7466 2984 : WakeupWalSummarizer();
7467 :
7468 : /*
7469 : * Let smgr do post-checkpoint cleanup (eg, deleting old files).
7470 : */
7471 2984 : SyncPostCheckpoint();
7472 :
7473 : /*
7474 : * Update the average distance between checkpoints if the prior checkpoint
7475 : * exists.
7476 : */
7477 2984 : if (PriorRedoPtr != InvalidXLogRecPtr)
7478 2984 : UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
7479 :
7480 : #ifdef USE_INJECTION_POINTS
7481 2984 : INJECTION_POINT("checkpoint-before-old-wal-removal", NULL);
7482 : #endif
7483 :
7484 : /*
7485 : * Delete old log files (those no longer needed for the last checkpoint)
7486 : * to prevent the disk holding the xlog from filling up.
7487 : */
7488 2984 : XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
7489 2984 : KeepLogSeg(recptr, &_logSegNo);
7490 2984 : if (InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_REMOVED | RS_INVAL_IDLE_TIMEOUT,
7491 : _logSegNo, InvalidOid,
7492 : InvalidTransactionId))
7493 : {
7494 : /*
7495 : * Some slots have been invalidated; recalculate the old-segment
7496 : * horizon, starting again from RedoRecPtr.
7497 : */
7498 6 : XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
7499 6 : KeepLogSeg(recptr, &_logSegNo);
7500 : }
7501 2984 : _logSegNo--;
7502 2984 : RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr,
7503 : checkPoint.ThisTimeLineID);
7504 :
7505 : /*
7506 : * Make more log segments if needed. (Do this after recycling old log
7507 : * segments, since that may supply some of the needed files.)
7508 : */
7509 2984 : if (!shutdown)
7510 1788 : PreallocXlogFiles(recptr, checkPoint.ThisTimeLineID);
7511 :
7512 : /*
7513 : * Truncate pg_subtrans if possible. We can throw away all data before
7514 : * the oldest XMIN of any running transaction. No future transaction will
7515 : * attempt to reference any pg_subtrans entry older than that (see Asserts
7516 : * in subtrans.c). During recovery, though, we mustn't do this because
7517 : * StartupSUBTRANS hasn't been called yet.
7518 : */
7519 2984 : if (!RecoveryInProgress())
7520 2926 : TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
7521 :
7522 : /* Real work is done; log and update stats. */
7523 2984 : LogCheckpointEnd(false);
7524 :
7525 : /* Reset the process title */
7526 2984 : update_checkpoint_display(flags, false, true);
7527 :
7528 : TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
7529 : NBuffers,
7530 : CheckpointStats.ckpt_segs_added,
7531 : CheckpointStats.ckpt_segs_removed,
7532 : CheckpointStats.ckpt_segs_recycled);
7533 :
7534 2984 : return true;
7535 : }
7536 :
7537 : /*
7538 : * Mark the end of recovery in WAL, though without running a full checkpoint.
7539 : * A restartpoint is likely to be in progress as we do this, though we are
7540 : * unwilling to wait for it to complete.
7541 : *
7542 : * CreateRestartPoint() allows for the case where recovery may end before the
7543 : * restartpoint completes, so there is no concern about concurrent behaviour.
7544 : */
7545 : static void
7546 84 : CreateEndOfRecoveryRecord(void)
7547 : {
7548 : xl_end_of_recovery xlrec;
7549 : XLogRecPtr recptr;
7550 :
7551 : /* sanity check */
7552 84 : if (!RecoveryInProgress())
7553 0 : elog(ERROR, "can only be used to end recovery");
7554 :
7555 84 : xlrec.end_time = GetCurrentTimestamp();
7556 84 : xlrec.wal_level = wal_level;
7557 :
7558 84 : WALInsertLockAcquireExclusive();
7559 84 : xlrec.ThisTimeLineID = XLogCtl->InsertTimeLineID;
7560 84 : xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
7561 84 : WALInsertLockRelease();
7562 :
7563 84 : START_CRIT_SECTION();
7564 :
7565 84 : XLogBeginInsert();
7566 84 : XLogRegisterData(&xlrec, sizeof(xl_end_of_recovery));
7567 84 : recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);
7568 :
7569 84 : XLogFlush(recptr);
7570 :
7571 : /*
7572 : * Update the control file so that crash recovery can follow the timeline
7573 : * changes to this point.
7574 : */
7575 84 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7576 84 : ControlFile->minRecoveryPoint = recptr;
7577 84 : ControlFile->minRecoveryPointTLI = xlrec.ThisTimeLineID;
7578 84 : UpdateControlFile();
7579 84 : LWLockRelease(ControlFileLock);
7580 :
7581 84 : END_CRIT_SECTION();
7582 84 : }
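 :
 : /*
 :  * This lets promotion complete without waiting for a full
 :  * end-of-recovery checkpoint; a regular checkpoint is typically
 :  * requested right afterwards.  The replay side is the
 :  * XLOG_END_OF_RECOVERY branch of xlog_redo() below, which
 :  * cross-checks the record's timeline against the replay timeline.
 :  */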
7583 :
7584 : /*
7585 : * Write an OVERWRITE_CONTRECORD message.
7586 : *
7587 : * When, during WAL replay, we expect a continuation record at the start of a
7588 : * page but it is not there, recovery ends and WAL writing resumes at that point.
7589 : * But it's wrong to resume writing new WAL back at the start of the record
7590 : * that was broken, because downstream consumers of that WAL (physical
7591 : * replicas) are not prepared to "rewind". So the first action after
7592 : * finishing replay of all valid WAL must be to write a record of this type
7593 : * at the point where the contrecord was missing; to support xlogreader
7594 : * detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added
7595 : * to the page header where the record occurs. xlogreader has an ad-hoc
7596 : * mechanism to report metadata about the broken record, which is what we
7597 : * use here.
7598 : *
7599 : * At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to
7600 : * skip the record it was reading, and pass back the LSN of the skipped
7601 : * record, so that its caller can verify (on "replay" of that record) that the
7602 : * XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten.
7603 : *
7604 : * 'aborted_lsn' is the beginning position of the record that was incomplete.
7605 : * It is included in the WAL record. 'pagePtr' gives the beginning of the
7606 : * XLOG page where the record is to be inserted, and 'newTLI' the timeline
7607 : * to insert it on. They must match the current WAL insert position; they're
7608 : * passed here just so that we can verify that.
7609 : */
7610 : static XLogRecPtr
7611 24 : CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr,
7612 : TimeLineID newTLI)
7613 : {
7614 : xl_overwrite_contrecord xlrec;
7615 : XLogRecPtr recptr;
7616 : XLogPageHeader pagehdr;
7617 : XLogRecPtr startPos;
7618 :
7619 : /* sanity checks */
7620 24 : if (!RecoveryInProgress())
7621 0 : elog(ERROR, "can only be used at end of recovery");
7622 24 : if (pagePtr % XLOG_BLCKSZ != 0)
7623 0 : elog(ERROR, "invalid position for missing continuation record %X/%08X",
7624 : LSN_FORMAT_ARGS(pagePtr));
7625 :
7626 : /* The current WAL insert position should be right after the page header */
7627 24 : startPos = pagePtr;
7628 24 : if (XLogSegmentOffset(startPos, wal_segment_size) == 0)
7629 2 : startPos += SizeOfXLogLongPHD;
7630 : else
7631 22 : startPos += SizeOfXLogShortPHD;
7632 24 : recptr = GetXLogInsertRecPtr();
7633 24 : if (recptr != startPos)
7634 0 : elog(ERROR, "invalid WAL insert position %X/%08X for OVERWRITE_CONTRECORD",
7635 : LSN_FORMAT_ARGS(recptr));
7636 :
7637 24 : START_CRIT_SECTION();
7638 :
7639 : /*
7640 : * Initialize the XLOG page header (by GetXLogBuffer), and set the
7641 : * XLP_FIRST_IS_OVERWRITE_CONTRECORD flag.
7642 : *
7643 : * No other backend is allowed to write WAL yet, so acquiring the WAL
7644 : * insertion lock is just pro forma.
7645 : */
7646 24 : WALInsertLockAcquire();
7647 24 : pagehdr = (XLogPageHeader) GetXLogBuffer(pagePtr, newTLI);
7648 24 : pagehdr->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD;
7649 24 : WALInsertLockRelease();
7650 :
7651 : /*
7652 : * Insert the XLOG_OVERWRITE_CONTRECORD record as the first record on the
7653 : * page. We know it becomes the first record, because no other backend is
7654 : * allowed to write WAL yet.
7655 : */
7656 24 : XLogBeginInsert();
7657 24 : xlrec.overwritten_lsn = aborted_lsn;
7658 24 : xlrec.overwrite_time = GetCurrentTimestamp();
7659 24 : XLogRegisterData(&xlrec, sizeof(xl_overwrite_contrecord));
7660 24 : recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD);
7661 :
7662 : /* check that the record was inserted to the right place */
7663 24 : if (ProcLastRecPtr != startPos)
7664 0 : elog(ERROR, "OVERWRITE_CONTRECORD was inserted to unexpected position %X/%08X",
7665 : LSN_FORMAT_ARGS(ProcLastRecPtr));
7666 :
7667 24 : XLogFlush(recptr);
7668 :
7669 24 : END_CRIT_SECTION();
7670 :
7671 24 : return recptr;
7672 : }
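 :
 : /*
 :  * A condensed sketch of the matching check on the replay side (after
 :  * the fashion of xlogrecovery.c's handling of this record; details
 :  * abbreviated here):
 :  *
 :  *     xl_overwrite_contrecord *xlrec =
 :  *         (xl_overwrite_contrecord *) XLogRecGetData(record);
 :  *
 :  *     if (xlrec->overwritten_lsn != record->overwrittenRecPtr)
 :  *         elog(FATAL, "mismatching overwritten LSN %X/%08X -> %X/%08X",
 :  *              LSN_FORMAT_ARGS(xlrec->overwritten_lsn),
 :  *              LSN_FORMAT_ARGS(record->overwrittenRecPtr));
 :  */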
7673 :
7674 : /*
7675 : * Flush all data in shared memory to disk, and fsync
7676 : *
7677 : * This is the common code shared between regular checkpoints and
7678 : * recovery restartpoints.
7679 : */
7680 : static void
7681 3362 : CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
7682 : {
7683 3362 : CheckPointRelationMap();
7684 3362 : CheckPointReplicationSlots(flags & CHECKPOINT_IS_SHUTDOWN);
7685 3362 : CheckPointSnapBuild();
7686 3362 : CheckPointLogicalRewriteHeap();
7687 3362 : CheckPointReplicationOrigin();
7688 :
7689 : /* Write out all dirty data in SLRUs and the main buffer pool */
7690 : TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
7691 3362 : CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
7692 3362 : CheckPointCLOG();
7693 3362 : CheckPointCommitTs();
7694 3362 : CheckPointSUBTRANS();
7695 3362 : CheckPointMultiXact();
7696 3362 : CheckPointPredicate();
7697 3362 : CheckPointBuffers(flags);
7698 :
7699 : /* Perform all queued up fsyncs */
7700 : TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
7701 3362 : CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
7702 3362 : ProcessSyncRequests();
7703 3362 : CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
7704 : TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
7705 :
7706 : /* We deliberately delay 2PC checkpointing as long as possible */
7707 3362 : CheckPointTwoPhase(checkPointRedo);
7708 3362 : }
7709 :
7710 : /*
7711 : * Save a checkpoint for recovery restart if appropriate
7712 : *
7713 : * This function is called each time a checkpoint record is read from XLOG.
7714 : * It must determine whether the checkpoint represents a safe restartpoint or
7715 : * not. If so, the checkpoint record is stashed in shared memory so that
7716 : * CreateRestartPoint can consult it. (Note that the latter function is
7717 : * executed by the checkpointer, while this one will be executed by the
7718 : * startup process.)
7719 : */
7720 : static void
7721 1410 : RecoveryRestartPoint(const CheckPoint *checkPoint, XLogReaderState *record)
7722 : {
7723 : /*
7724 : * Also refrain from creating a restartpoint if we have seen any
7725 : * references to non-existent pages. Restarting recovery from the
7726 : * restartpoint would not see the references, so we would lose the
7727 : * cross-check that the pages belonged to a relation that was dropped
7728 : * later.
7729 : */
7730 1410 : if (XLogHaveInvalidPages())
7731 : {
7732 0 : elog(DEBUG2,
7733 : "could not record restart point at %X/%08X because there are unresolved references to invalid pages",
7734 : LSN_FORMAT_ARGS(checkPoint->redo));
7735 0 : return;
7736 : }
7737 :
7738 : /*
7739 : * Copy the checkpoint record to shared memory, so that checkpointer can
7740 : * work out the next time it wants to perform a restartpoint.
7741 : */
7742 1410 : SpinLockAcquire(&XLogCtl->info_lck);
7743 1410 : XLogCtl->lastCheckPointRecPtr = record->ReadRecPtr;
7744 1410 : XLogCtl->lastCheckPointEndPtr = record->EndRecPtr;
7745 1410 : XLogCtl->lastCheckPoint = *checkPoint;
7746 1410 : SpinLockRelease(&XLogCtl->info_lck);
7747 : }
7748 :
7749 : /*
7750 : * Establish a restartpoint if possible.
7751 : *
7752 : * This is similar to CreateCheckPoint, but is used during WAL recovery
7753 : * to establish a point from which recovery can roll forward without
7754 : * replaying the entire recovery log.
7755 : *
7756 : * Returns true if a new restartpoint was established. We can only establish
7757 : * a restartpoint if we have replayed a safe checkpoint record since last
7758 : * restartpoint.
7759 : */
7760 : bool
7761 1172 : CreateRestartPoint(int flags)
7762 : {
7763 : XLogRecPtr lastCheckPointRecPtr;
7764 : XLogRecPtr lastCheckPointEndPtr;
7765 : CheckPoint lastCheckPoint;
7766 : XLogRecPtr PriorRedoPtr;
7767 : XLogRecPtr receivePtr;
7768 : XLogRecPtr replayPtr;
7769 : TimeLineID replayTLI;
7770 : XLogRecPtr endptr;
7771 : XLogSegNo _logSegNo;
7772 : TimestampTz xtime;
7773 :
7774 : /* Concurrent checkpoint/restartpoint cannot happen */
7775 : Assert(!IsUnderPostmaster || MyBackendType == B_CHECKPOINTER);
7776 :
7777 : /* Get a local copy of the last safe checkpoint record. */
7778 1172 : SpinLockAcquire(&XLogCtl->info_lck);
7779 1172 : lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
7780 1172 : lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
7781 1172 : lastCheckPoint = XLogCtl->lastCheckPoint;
7782 1172 : SpinLockRelease(&XLogCtl->info_lck);
7783 :
7784 : /*
7785 : * Check that we're still in recovery mode. It's ok if we exit recovery
7786 : * mode after this check, the restart point is valid anyway.
7787 : */
7788 1172 : if (!RecoveryInProgress())
7789 : {
7790 0 : ereport(DEBUG2,
7791 : (errmsg_internal("skipping restartpoint, recovery has already ended")));
7792 0 : return false;
7793 : }
7794 :
7795 : /*
7796 : * If the last checkpoint record we've replayed is already our last
7797 : * restartpoint, we can't perform a new restart point. We still update
7798 : * minRecoveryPoint in that case, so that if this is a shutdown restart
7799 : * point, we won't start up earlier than before. That's not strictly
7800 : * necessary, but when hot standby is enabled, it would be rather weird if
7801 : * the database opened up for read-only connections at a point-in-time
7802 : * before the last shutdown. Such time travel is still possible in case of
7803 : * immediate shutdown, though.
7804 : *
7805 : * We don't explicitly advance minRecoveryPoint when we do create a
7806 : * restartpoint. It's assumed that flushing the buffers will do that as a
7807 : * side-effect.
7808 : */
7809 1172 : if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
7810 508 : lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
7811 : {
7812 794 : ereport(DEBUG2,
7813 : errmsg_internal("skipping restartpoint, already performed at %X/%08X",
7814 : LSN_FORMAT_ARGS(lastCheckPoint.redo)));
7815 :
7816 794 : UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
7817 794 : if (flags & CHECKPOINT_IS_SHUTDOWN)
7818 : {
7819 66 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7820 66 : ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
7821 66 : UpdateControlFile();
7822 66 : LWLockRelease(ControlFileLock);
7823 : }
7824 794 : return false;
7825 : }
7826 :
7827 : /*
7828 : * Update the shared RedoRecPtr so that the startup process can calculate
7829 : * the number of segments replayed since last restartpoint, and request a
7830 : * restartpoint if it exceeds CheckPointSegments.
7831 : *
7832 : * Like in CreateCheckPoint(), hold off insertions to update it, although
7833 : * during recovery this is just pro forma, because no WAL insertions are
7834 : * happening.
7835 : */
7836 378 : WALInsertLockAcquireExclusive();
7837 378 : RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
7838 378 : WALInsertLockRelease();
7839 :
7840 : /* Also update the info_lck-protected copy */
7841 378 : SpinLockAcquire(&XLogCtl->info_lck);
7842 378 : XLogCtl->RedoRecPtr = lastCheckPoint.redo;
7843 378 : SpinLockRelease(&XLogCtl->info_lck);
7844 :
7845 : /*
7846 : * Prepare to accumulate statistics.
7847 : *
7848 : * Note: because it is possible for log_checkpoints to change while a
7849 : * checkpoint proceeds, we always accumulate stats, even if
7850 : * log_checkpoints is currently off.
7851 : */
7852 4158 : MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
7853 378 : CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
7854 :
7855 378 : if (log_checkpoints)
7856 378 : LogCheckpointStart(flags, true);
7857 :
7858 : /* Update the process title */
7859 378 : update_checkpoint_display(flags, true, false);
7860 :
7861 378 : CheckPointGuts(lastCheckPoint.redo, flags);
7862 :
7863 : /*
7864 : * This location needs to be after CheckPointGuts() to ensure that some
7865 : * work has already happened during this checkpoint.
7866 : */
7867 378 : INJECTION_POINT("create-restart-point", NULL);
7868 :
7869 : /*
7870 : * Remember the prior checkpoint's redo ptr for
7871 : * UpdateCheckPointDistanceEstimate()
7872 : */
7873 378 : PriorRedoPtr = ControlFile->checkPointCopy.redo;
7874 :
7875 : /*
7876 : * Update pg_control, using current time. Check that it still shows an
7877 : * older checkpoint, else do nothing; this is a quick hack to make sure
7878 : * nothing really bad happens if somehow we get here after the
7879 : * end-of-recovery checkpoint.
7880 : */
7881 378 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7882 378 : if (ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
7883 : {
7884 : /*
7885 : * Update the checkpoint information. We do this even if the cluster
7886 : * does not show DB_IN_ARCHIVE_RECOVERY to match with the set of WAL
7887 : * segments recycled below.
7888 : */
7889 378 : ControlFile->checkPoint = lastCheckPointRecPtr;
7890 378 : ControlFile->checkPointCopy = lastCheckPoint;
7891 :
7892 : /*
7893 : * Ensure minRecoveryPoint is past the checkpoint record and update it
7894 : * if the control file still shows DB_IN_ARCHIVE_RECOVERY. Normally,
7895 : * this will have happened already while writing out dirty buffers,
7896 : * but not necessarily - e.g. because no buffers were dirtied. We do
7897 : * this because a backup performed in recovery uses minRecoveryPoint
7898 : * to determine which WAL files must be included in the backup, and
7899 : * the file (or files) containing the checkpoint record must be
7900 : * included, at a minimum. Note that for an ordinary restart of
7901 : * recovery there's no value in having the minimum recovery point any
7902 : * earlier than this anyway, because redo will begin just after the
7903 : * checkpoint record.
7904 : */
7905 378 : if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
7906 : {
7907 378 : if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
7908 : {
7909 34 : ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
7910 34 : ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
7911 :
7912 : /* update local copy */
7913 34 : LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
7914 34 : LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
7915 : }
7916 378 : if (flags & CHECKPOINT_IS_SHUTDOWN)
7917 44 : ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
7918 : }
7919 378 : UpdateControlFile();
7920 : }
7921 378 : LWLockRelease(ControlFileLock);
7922 :
7923 : /*
7924 : * Update the average distance between checkpoints/restartpoints if the
7925 : * prior checkpoint exists.
7926 : */
7927 378 : if (PriorRedoPtr != InvalidXLogRecPtr)
7928 378 : UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
7929 :
7930 : /*
7931 : * Delete old log files (those no longer needed for the last restartpoint)
7932 : * to prevent the disk holding the xlog from filling up.
7933 : */
7934 378 : XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
7935 :
7936 : /*
7937 : * Retreat _logSegNo using the current end of xlog replayed or received,
7938 : * whichever is later.
7939 : */
7940 378 : receivePtr = GetWalRcvFlushRecPtr(NULL, NULL);
7941 378 : replayPtr = GetXLogReplayRecPtr(&replayTLI);
7942 378 : endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
7943 378 : KeepLogSeg(endptr, &_logSegNo);
7944 378 : if (InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_REMOVED | RS_INVAL_IDLE_TIMEOUT,
7945 : _logSegNo, InvalidOid,
7946 : InvalidTransactionId))
7947 : {
7948 : /*
7949 : * Some slots have been invalidated; recalculate the old-segment
7950 : * horizon, starting again from RedoRecPtr.
7951 : */
7952 2 : XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
7953 2 : KeepLogSeg(endptr, &_logSegNo);
7954 : }
7955 378 : _logSegNo--;
7956 :
7957 : /*
7958 : * Try to recycle segments on a useful timeline. If we've been promoted
7959 : * since the beginning of this restartpoint, use the new timeline chosen
7960 : * at end of recovery. If we're still in recovery, use the timeline we're
7961 : * currently replaying.
7962 : *
7963 : * There is no guarantee that the WAL segments will be useful on the
7964 : * current timeline; if recovery proceeds to a new timeline right after
7965 : * this, the pre-allocated WAL segments on this timeline will not be used,
7966 : * and will go wasted until recycled on the next restartpoint. We'll live
7967 : * with that.
7968 : */
7969 378 : if (!RecoveryInProgress())
7970 0 : replayTLI = XLogCtl->InsertTimeLineID;
7971 :
7972 378 : RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr, replayTLI);
7973 :
7974 : /*
7975 : * Make more log segments if needed. (Do this after recycling old log
7976 : * segments, since that may supply some of the needed files.)
7977 : */
7978 378 : PreallocXlogFiles(endptr, replayTLI);
7979 :
7980 : /*
7981 : * Truncate pg_subtrans if possible. We can throw away all data before
7982 : * the oldest XMIN of any running transaction. No future transaction will
7983 : * attempt to reference any pg_subtrans entry older than that (see Asserts
7984 : * in subtrans.c). When hot standby is disabled, though, we mustn't do
7985 : * this because StartupSUBTRANS hasn't been called yet.
7986 : */
7987 378 : if (EnableHotStandby)
7988 378 : TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
7989 :
7990 : /* Real work is done; log and update stats. */
7991 378 : LogCheckpointEnd(true);
7992 :
7993 : /* Reset the process title */
7994 378 : update_checkpoint_display(flags, true, true);
7995 :
7996 378 : xtime = GetLatestXTime();
7997 378 : ereport((log_checkpoints ? LOG : DEBUG2),
7998 : errmsg("recovery restart point at %X/%08X",
7999 : LSN_FORMAT_ARGS(lastCheckPoint.redo)),
8000 : xtime ? errdetail("Last completed transaction was at log time %s.",
8001 : timestamptz_to_str(xtime)) : 0);
8002 :
8003 : /*
8004 : * Finally, execute archive_cleanup_command, if any.
8005 : */
8006 378 : if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != 0)
8007 0 : ExecuteRecoveryCommand(archiveCleanupCommand,
8008 : "archive_cleanup_command",
8009 : false,
8010 : WAIT_EVENT_ARCHIVE_CLEANUP_COMMAND);
8011 :
8012 378 : return true;
8013 : }
8014 :
8015 : /*
8016 : * Report availability of WAL for the given target LSN
8017 : * (typically a slot's restart_lsn)
8018 : *
8019 : * Returns one of the following enum values:
8020 : *
8021 : * * WALAVAIL_RESERVED means targetLSN is available and it is in the range of
8022 : * max_wal_size.
8023 : *
8024 : * * WALAVAIL_EXTENDED means it is still available by preserving extra
8025 : * segments beyond max_wal_size. If max_slot_wal_keep_size is smaller
8026 : * than max_wal_size, this state is not returned.
8027 : *
8028 : * * WALAVAIL_UNRESERVED means it is being lost and the next checkpoint will
8029 : * remove reserved segments. A slot in this state may still return to one of
8030 : * the states above.
8031 : *
8032 : * * WALAVAIL_REMOVED means it has been removed. A replication stream on
8033 : * a slot with this LSN cannot continue. (Any associated walsender
8034 : * processes should have been terminated already.)
8035 : *
8036 : * * WALAVAIL_INVALID_LSN means the slot hasn't been set to reserve WAL.
8037 : */
8038 : WALAvailability
8039 960 : GetWALAvailability(XLogRecPtr targetLSN)
8040 : {
8041 : XLogRecPtr currpos; /* current write LSN */
8042 : XLogSegNo currSeg; /* segid of currpos */
8043 : XLogSegNo targetSeg; /* segid of targetLSN */
8044 : XLogSegNo oldestSeg; /* actual oldest segid */
8045 : XLogSegNo oldestSegMaxWalSize; /* oldest segid kept by max_wal_size */
8046 : XLogSegNo oldestSlotSeg; /* oldest segid kept by slot */
8047 : uint64 keepSegs;
8048 :
8049 : /*
8050 : * The slot does not reserve WAL: it is either deactivated or has never been active.
8051 : */
8052 960 : if (XLogRecPtrIsInvalid(targetLSN))
8053 50 : return WALAVAIL_INVALID_LSN;
8054 :
8055 : /*
8056 : * Calculate the oldest segment currently reserved by all slots,
8057 : * considering wal_keep_size and max_slot_wal_keep_size. Initialize
8058 : * oldestSlotSeg to the current segment.
8059 : */
8060 910 : currpos = GetXLogWriteRecPtr();
8061 910 : XLByteToSeg(currpos, oldestSlotSeg, wal_segment_size);
8062 910 : KeepLogSeg(currpos, &oldestSlotSeg);
8063 :
8064 : /*
8065 : * Find the oldest extant segment file. We get 1 until a checkpoint removes
8066 : * the first WAL segment file since startup, which can make the status
8067 : * wrong under certain abnormal conditions, but that does no actual harm.
8068 : */
8069 910 : oldestSeg = XLogGetLastRemovedSegno() + 1;
8070 :
8071 : /* calculate oldest segment by max_wal_size */
8072 910 : XLByteToSeg(currpos, currSeg, wal_segment_size);
8073 910 : keepSegs = ConvertToXSegs(max_wal_size_mb, wal_segment_size) + 1;
8074 :
8075 910 : if (currSeg > keepSegs)
8076 16 : oldestSegMaxWalSize = currSeg - keepSegs;
8077 : else
8078 894 : oldestSegMaxWalSize = 1;
8079 :
8080 : /* the segment we care about */
8081 910 : XLByteToSeg(targetLSN, targetSeg, wal_segment_size);
8082 :
8083 : /*
8084 : * No point in returning reserved or extended status values if the
8085 : * targetSeg is known to be lost.
8086 : */
8087 910 : if (targetSeg >= oldestSlotSeg)
8088 : {
8089 : /* show "reserved" when targetSeg is within max_wal_size */
8090 908 : if (targetSeg >= oldestSegMaxWalSize)
8091 904 : return WALAVAIL_RESERVED;
8092 :
8093 : /* being retained by slots exceeding max_wal_size */
8094 4 : return WALAVAIL_EXTENDED;
8095 : }
8096 :
8097 : /* WAL segments are no longer retained but haven't been removed yet */
8098 2 : if (targetSeg >= oldestSeg)
8099 2 : return WALAVAIL_UNRESERVED;
8100 :
8101 : /* Definitely lost */
8102 0 : return WALAVAIL_REMOVED;
8103 : }
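 :
 : /*
 :  * A typical caller maps the result onto the wal_status column, in the
 :  * style of pg_get_replication_slots() in slotfuncs.c (sketch; the
 :  * invalid-LSN case and error handling omitted):
 :  *
 :  *     switch (GetWALAvailability(slot_contents.data.restart_lsn))
 :  *     {
 :  *         case WALAVAIL_RESERVED:
 :  *             walstate = "reserved";
 :  *             break;
 :  *         case WALAVAIL_EXTENDED:
 :  *             walstate = "extended";
 :  *             break;
 :  *         case WALAVAIL_UNRESERVED:
 :  *             walstate = "unreserved";
 :  *             break;
 :  *         case WALAVAIL_REMOVED:
 :  *             walstate = "lost";
 :  *             break;
 :  *     }
 :  */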
8104 :
8105 :
8106 : /*
8107 : * Retreat *logSegNo to the last segment that we need to retain because of
8108 : * either wal_keep_size or replication slots.
8109 : *
8110 : * This is calculated by subtracting wal_keep_size from the given xlog
8111 : * location, recptr, and by making sure that the result is no newer than
8112 : * what replication slots require. For the latter criterion we do consider
8113 : * the effects of max_slot_wal_keep_size: reserve at most that much space back
8114 : * from recptr.
8115 : *
8116 : * Note about replication slots: if this function calculates a value
8117 : * that's further ahead than what slots need reserved, then affected
8118 : * slots need to be invalidated and this function invoked again.
8119 : * XXX it might be a good idea to rewrite this function so that
8120 : * invalidation is optionally done here, instead.
8121 : */
8122 : static void
8123 4280 : KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
8124 : {
8125 : XLogSegNo currSegNo;
8126 : XLogSegNo segno;
8127 : XLogRecPtr keep;
8128 :
8129 4280 : XLByteToSeg(recptr, currSegNo, wal_segment_size);
8130 4280 : segno = currSegNo;
8131 :
8132 : /* Calculate how many segments are kept by slots. */
8133 4280 : keep = XLogGetReplicationSlotMinimumLSN();
8134 4280 : if (keep != InvalidXLogRecPtr && keep < recptr)
8135 : {
8136 1188 : XLByteToSeg(keep, segno, wal_segment_size);
8137 :
8138 : /*
8139 : * Account for max_slot_wal_keep_size to avoid keeping more than
8140 : * configured. However, don't do that during a binary upgrade: if
8141 : * slots were to be invalidated because of this, it would not be
8142 : * possible to preserve logical ones during the upgrade.
8143 : */
8144 1188 : if (max_slot_wal_keep_size_mb >= 0 && !IsBinaryUpgrade)
8145 : {
8146 : uint64 slot_keep_segs;
8147 :
8148 42 : slot_keep_segs =
8149 42 : ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size);
8150 :
8151 42 : if (currSegNo - segno > slot_keep_segs)
8152 10 : segno = currSegNo - slot_keep_segs;
8153 : }
8154 : }
8155 :
8156 : /*
8157 : * If WAL summarization is in use, don't remove WAL that has yet to be
8158 : * summarized.
8159 : */
8160 4280 : keep = GetOldestUnsummarizedLSN(NULL, NULL);
8161 4280 : if (keep != InvalidXLogRecPtr)
8162 : {
8163 : XLogSegNo unsummarized_segno;
8164 :
8165 4 : XLByteToSeg(keep, unsummarized_segno, wal_segment_size);
8166 4 : if (unsummarized_segno < segno)
8167 4 : segno = unsummarized_segno;
8168 : }
8169 :
8170 : /* but, keep at least wal_keep_size if that's set */
8171 4280 : if (wal_keep_size_mb > 0)
8172 : {
8173 : uint64 keep_segs;
8174 :
8175 138 : keep_segs = ConvertToXSegs(wal_keep_size_mb, wal_segment_size);
8176 138 : if (currSegNo - segno < keep_segs)
8177 : {
8178 : /* avoid underflow, don't go below 1 */
8179 138 : if (currSegNo <= keep_segs)
8180 130 : segno = 1;
8181 : else
8182 8 : segno = currSegNo - keep_segs;
8183 : }
8184 : }
8185 :
8186 : /* don't delete WAL segments newer than the calculated segment */
8187 4280 : if (segno < *logSegNo)
8188 702 : *logSegNo = segno;
8189 4280 : }
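 :
 : /*
 :  * Worked example with hypothetical numbers: with 16MB segments and
 :  * wal_keep_size = 1GB, ConvertToXSegs() gives keep_segs = 64.  If
 :  * recptr falls in segment 100 and the slot minimum would only push
 :  * segno back to 90, then currSegNo - segno = 10 < 64, so segno is
 :  * pushed further back to 100 - 64 = 36, and the caller's *logSegNo
 :  * is capped at 36.
 :  */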
8190 :
8191 : /*
8192 : * Write a NEXTOID log record
8193 : */
8194 : void
8195 1190 : XLogPutNextOid(Oid nextOid)
8196 : {
8197 1190 : XLogBeginInsert();
8198 1190 : XLogRegisterData(&nextOid, sizeof(Oid));
8199 1190 : (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
8200 :
8201 : /*
8202 : * We need not flush the NEXTOID record immediately, because any of the
8203 : * just-allocated OIDs could only reach disk as part of a tuple insert or
8204 : * update that would have its own XLOG record that must follow the NEXTOID
8205 : * record. Therefore, the standard buffer LSN interlock applied to those
8206 : * records will ensure no such OID reaches disk before the NEXTOID record
8207 : * does.
8208 : *
8209 : * Note, however, that the above statement only covers state "within" the
8210 : * database. When we use a generated OID as a file or directory name, we
8211 : * are in a sense violating the basic WAL rule, because that filesystem
8212 : * change may reach disk before the NEXTOID WAL record does. The impact
8213 : * of this is that if a database crash occurs immediately afterward, we
8214 : * might after restart re-generate the same OID and find that it conflicts
8215 : * with the leftover file or directory. But since for safety's sake we
8216 : * always loop until finding a nonconflicting filename, this poses no real
8217 : * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
8218 : */
8219 1190 : }
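 :
 : /*
 :  * The expected caller is OID allocation, in the style of
 :  * GetNewObjectId() in varsup.c (condensed sketch, locking omitted): a
 :  * NEXTOID record is logged once per prefetched block of OIDs, not
 :  * once per OID.
 :  *
 :  *     if (TransamVariables->oidCount == 0)
 :  *     {
 :  *         XLogPutNextOid(TransamVariables->nextOid + VAR_OID_PREFETCH);
 :  *         TransamVariables->oidCount = VAR_OID_PREFETCH;
 :  *     }
 :  *     result = TransamVariables->nextOid;
 :  *     TransamVariables->nextOid++;
 :  *     TransamVariables->oidCount--;
 :  */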
8220 :
8221 : /*
8222 : * Write an XLOG SWITCH record.
8223 : *
8224 : * Here we just blindly issue an XLogInsert request for the record.
8225 : * All the magic happens inside XLogInsert.
8226 : *
8227 : * The return value is either the end+1 address of the switch record,
8228 : * or the end+1 address of the prior segment if we did not need to
8229 : * write a switch record because we are already at segment start.
8230 : */
8231 : XLogRecPtr
8232 1546 : RequestXLogSwitch(bool mark_unimportant)
8233 : {
8234 : XLogRecPtr RecPtr;
8235 :
8236 : /* XLOG SWITCH has no data */
8237 1546 : XLogBeginInsert();
8238 :
8239 1546 : if (mark_unimportant)
8240 0 : XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
8241 1546 : RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
8242 :
8243 1546 : return RecPtr;
8244 : }
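 :
 : /*
 :  * Callers include the SQL-callable pg_switch_wal() and the
 :  * checkpointer's archive_timeout handling; the latter passes
 :  * mark_unimportant = true so that the forced switch record itself
 :  * does not count as WAL activity that would trigger yet another
 :  * switch at the next timeout.  (See xlogfuncs.c and checkpointer.c.)
 :  */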
8245 :
8246 : /*
8247 : * Write a RESTORE POINT record
8248 : */
8249 : XLogRecPtr
8250 6 : XLogRestorePoint(const char *rpName)
8251 : {
8252 : XLogRecPtr RecPtr;
8253 : xl_restore_point xlrec;
8254 :
8255 6 : xlrec.rp_time = GetCurrentTimestamp();
8256 6 : strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
8257 :
8258 6 : XLogBeginInsert();
8259 6 : XLogRegisterData(&xlrec, sizeof(xl_restore_point));
8260 :
8261 6 : RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
8262 :
8263 6 : ereport(LOG,
8264 : errmsg("restore point \"%s\" created at %X/%08X",
8265 : rpName, LSN_FORMAT_ARGS(RecPtr)));
8266 :
8267 6 : return RecPtr;
8268 : }
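 :
 : /*
 :  * This is reached via the SQL function pg_create_restore_point();
 :  * during archive recovery, the rp_name stored in the record is what
 :  * recovery_target_name is compared against (see xlogrecovery.c).
 :  */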
8269 :
8270 : /*
8271 : * Check if any of the GUC parameters that are critical for hot standby
8272 : * have changed, and update the value in pg_control file if necessary.
8273 : */
8274 : static void
8275 1726 : XLogReportParameters(void)
8276 : {
8277 1726 : if (wal_level != ControlFile->wal_level ||
8278 1266 : wal_log_hints != ControlFile->wal_log_hints ||
8279 1100 : MaxConnections != ControlFile->MaxConnections ||
8280 1098 : max_worker_processes != ControlFile->max_worker_processes ||
8281 1096 : max_wal_senders != ControlFile->max_wal_senders ||
8282 1054 : max_prepared_xacts != ControlFile->max_prepared_xacts ||
8283 870 : max_locks_per_xact != ControlFile->max_locks_per_xact ||
8284 870 : track_commit_timestamp != ControlFile->track_commit_timestamp)
8285 : {
8286 : /*
8287 : * The change in number of backend slots doesn't need to be WAL-logged
8288 : * if archiving is not enabled, as you can't start archive recovery
8289 : * with wal_level=minimal anyway. We don't really care about the
8290 : * values in pg_control either if wal_level=minimal, but it seems better
8291 : * to keep them up-to-date to avoid confusion.
8292 : */
8293 880 : if (wal_level != ControlFile->wal_level || XLogIsNeeded())
8294 : {
8295 : xl_parameter_change xlrec;
8296 : XLogRecPtr recptr;
8297 :
8298 840 : xlrec.MaxConnections = MaxConnections;
8299 840 : xlrec.max_worker_processes = max_worker_processes;
8300 840 : xlrec.max_wal_senders = max_wal_senders;
8301 840 : xlrec.max_prepared_xacts = max_prepared_xacts;
8302 840 : xlrec.max_locks_per_xact = max_locks_per_xact;
8303 840 : xlrec.wal_level = wal_level;
8304 840 : xlrec.wal_log_hints = wal_log_hints;
8305 840 : xlrec.track_commit_timestamp = track_commit_timestamp;
8306 :
8307 840 : XLogBeginInsert();
8308 840 : XLogRegisterData(&xlrec, sizeof(xlrec));
8309 :
8310 840 : recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
8311 840 : XLogFlush(recptr);
8312 : }
8313 :
8314 880 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8315 :
8316 880 : ControlFile->MaxConnections = MaxConnections;
8317 880 : ControlFile->max_worker_processes = max_worker_processes;
8318 880 : ControlFile->max_wal_senders = max_wal_senders;
8319 880 : ControlFile->max_prepared_xacts = max_prepared_xacts;
8320 880 : ControlFile->max_locks_per_xact = max_locks_per_xact;
8321 880 : ControlFile->wal_level = wal_level;
8322 880 : ControlFile->wal_log_hints = wal_log_hints;
8323 880 : ControlFile->track_commit_timestamp = track_commit_timestamp;
8324 880 : UpdateControlFile();
8325 :
8326 880 : LWLockRelease(ControlFileLock);
8327 : }
8328 1726 : }
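 :
 : /*
 :  * The replay-side counterpart is the XLOG_PARAMETER_CHANGE branch of
 :  * xlog_redo() below, which applies the logged values to pg_control
 :  * and advances minRecoveryPoint so that hot standby cannot open up
 :  * with stale max_* settings.
 :  */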
8329 :
8330 : /*
8331 : * Update full_page_writes in shared memory, and write an
8332 : * XLOG_FPW_CHANGE record if necessary.
8333 : *
8334 : * Note: this function assumes there is no other process running
8335 : * concurrently that could update it.
8336 : */
8337 : void
8338 2882 : UpdateFullPageWrites(void)
8339 : {
8340 2882 : XLogCtlInsert *Insert = &XLogCtl->Insert;
8341 : bool recoveryInProgress;
8342 :
8343 : /*
8344 : * Do nothing if full_page_writes has not been changed.
8345 : *
8346 : * It's safe to check the shared full_page_writes without the lock,
8347 : * because we assume that there is no concurrently running process which
8348 : * can update it.
8349 : */
8350 2882 : if (fullPageWrites == Insert->fullPageWrites)
8351 2156 : return;
8352 :
8353 : /*
8354 : * Perform this outside critical section so that the WAL insert
8355 : * initialization done by RecoveryInProgress() doesn't trigger an
8356 : * assertion failure.
8357 : */
8358 726 : recoveryInProgress = RecoveryInProgress();
8359 :
8360 726 : START_CRIT_SECTION();
8361 :
8362 : /*
8363 : * It's always safe to take full page images, even when not strictly
8364 : * required, but not the other way round. So if we're setting full_page_writes
8365 : * to true, first set it true and then write the WAL record. If we're
8366 : * setting it to false, first write the WAL record and then set the global
8367 : * flag.
8368 : */
8369 726 : if (fullPageWrites)
8370 : {
8371 706 : WALInsertLockAcquireExclusive();
8372 706 : Insert->fullPageWrites = true;
8373 706 : WALInsertLockRelease();
8374 : }
8375 :
8376 : /*
8377 : * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
8378 : * full_page_writes during archive recovery, if required.
8379 : */
8380 726 : if (XLogStandbyInfoActive() && !recoveryInProgress)
8381 : {
8382 0 : XLogBeginInsert();
8383 0 : XLogRegisterData(&fullPageWrites, sizeof(bool));
8384 :
8385 0 : XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
8386 : }
8387 :
8388 726 : if (!fullPageWrites)
8389 : {
8390 20 : WALInsertLockAcquireExclusive();
8391 20 : Insert->fullPageWrites = false;
8392 20 : WALInsertLockRelease();
8393 : }
8394 726 : END_CRIT_SECTION();
8395 : }
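 :
 : /*
 :  * The ordering above yields a one-sided guarantee: at any instant the
 :  * shared flag is at least as conservative as what the WAL claims, so
 :  * a concurrent WAL inserter may take an unnecessary full-page image
 :  * but can never skip a required one.
 :  */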
8396 :
8397 : /*
8398 : * XLOG resource manager's routines
8399 : *
8400 : * Definitions of info values are in include/catalog/pg_control.h, though
8401 : * not all record types are related to control file updates.
8402 : *
8403 : * NOTE: Some XLOG record types that are directly related to WAL recovery
8404 : * are handled in xlogrecovery_redo().
8405 : */
8406 : void
8407 86220 : xlog_redo(XLogReaderState *record)
8408 : {
8409 86220 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
8410 86220 : XLogRecPtr lsn = record->EndRecPtr;
8411 :
8412 : /*
8413 : * In XLOG rmgr, backup blocks are only used by XLOG_FPI and
8414 : * XLOG_FPI_FOR_HINT records.
8415 : */
8416 : Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
8417 : !XLogRecHasAnyBlockRefs(record));
8418 :
8419 86220 : if (info == XLOG_NEXTOID)
8420 : {
8421 : Oid nextOid;
8422 :
8423 : /*
8424 : * We used to try to take the maximum of TransamVariables->nextOid and
8425 : * the recorded nextOid, but that fails if the OID counter wraps
8426 : * around. Since no OID allocation should be happening during replay
8427 : * anyway, better to just believe the record exactly. We still take
8428 : * OidGenLock while setting the variable, just in case.
8429 : */
8430 186 : memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
8431 186 : LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
8432 186 : TransamVariables->nextOid = nextOid;
8433 186 : TransamVariables->oidCount = 0;
8434 186 : LWLockRelease(OidGenLock);
8435 : }
8436 86034 : else if (info == XLOG_CHECKPOINT_SHUTDOWN)
8437 : {
8438 : CheckPoint checkPoint;
8439 : TimeLineID replayTLI;
8440 :
8441 68 : memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
8442 : /* In a SHUTDOWN checkpoint, believe the counters exactly */
8443 68 : LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
8444 68 : TransamVariables->nextXid = checkPoint.nextXid;
8445 68 : LWLockRelease(XidGenLock);
8446 68 : LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
8447 68 : TransamVariables->nextOid = checkPoint.nextOid;
8448 68 : TransamVariables->oidCount = 0;
8449 68 : LWLockRelease(OidGenLock);
8450 68 : MultiXactSetNextMXact(checkPoint.nextMulti,
8451 : checkPoint.nextMultiOffset);
8452 :
8453 68 : MultiXactAdvanceOldest(checkPoint.oldestMulti,
8454 : checkPoint.oldestMultiDB);
8455 :
8456 : /*
8457 : * No need to set oldestClogXid here as well; it'll be set when we
8458 : * redo an xl_clog_truncate if it changed since initialization.
8459 : */
8460 68 : SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
8461 :
8462 : /*
8463 : * If we see a shutdown checkpoint while waiting for an end-of-backup
8464 : * record, the backup was canceled and the end-of-backup record will
8465 : * never arrive.
8466 : */
8467 68 : if (ArchiveRecoveryRequested &&
8468 66 : !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
8469 0 : XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
8470 0 : ereport(PANIC,
8471 : (errmsg("online backup was canceled, recovery cannot continue")));
8472 :
8473 : /*
8474 : * If we see a shutdown checkpoint, we know that nothing was running
8475 : * on the primary at this point. So fake-up an empty running-xacts
8476 : * record and use that here and now. Recover additional standby state
8477 : * for prepared transactions.
8478 : */
8479 68 : if (standbyState >= STANDBY_INITIALIZED)
8480 : {
8481 : TransactionId *xids;
8482 : int nxids;
8483 : TransactionId oldestActiveXID;
8484 : TransactionId latestCompletedXid;
8485 : RunningTransactionsData running;
8486 :
8487 62 : oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
8488 :
8489 : /* Update pg_subtrans entries for any prepared transactions */
8490 62 : StandbyRecoverPreparedTransactions();
8491 :
8492 : /*
8493 : * Construct a RunningTransactions snapshot representing a shut
8494 : * down server, with only prepared transactions still alive. We're
8495 : * never overflowed at this point because all subxids are listed
8496 : * with their parent prepared transactions.
8497 : */
8498 62 : running.xcnt = nxids;
8499 62 : running.subxcnt = 0;
8500 62 : running.subxid_status = SUBXIDS_IN_SUBTRANS;
8501 62 : running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
8502 62 : running.oldestRunningXid = oldestActiveXID;
8503 62 : latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
8504 62 : TransactionIdRetreat(latestCompletedXid);
8505 : Assert(TransactionIdIsNormal(latestCompletedXid));
8506 62 : running.latestCompletedXid = latestCompletedXid;
8507 62 : running.xids = xids;
8508 :
8509 62 : ProcArrayApplyRecoveryInfo(&running);
8510 : }
8511 :
8512 : /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
8513 68 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8514 68 : ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
8515 68 : LWLockRelease(ControlFileLock);
8516 :
8517 : /*
8518 : * We should've already switched to the new TLI before replaying this
8519 : * record.
8520 : */
8521 68 : (void) GetCurrentReplayRecPtr(&replayTLI);
8522 68 : if (checkPoint.ThisTimeLineID != replayTLI)
8523 0 : ereport(PANIC,
8524 : (errmsg("unexpected timeline ID %u (should be %u) in shutdown checkpoint record",
8525 : checkPoint.ThisTimeLineID, replayTLI)));
8526 :
8527 68 : RecoveryRestartPoint(&checkPoint, record);
8528 : }
8529 85966 : else if (info == XLOG_CHECKPOINT_ONLINE)
8530 : {
8531 : CheckPoint checkPoint;
8532 : TimeLineID replayTLI;
8533 :
8534 1342 : memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
8535 : /* In an ONLINE checkpoint, treat the XID counter as a minimum */
8536 1342 : LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
8537 1342 : if (FullTransactionIdPrecedes(TransamVariables->nextXid,
8538 : checkPoint.nextXid))
8539 0 : TransamVariables->nextXid = checkPoint.nextXid;
8540 1342 : LWLockRelease(XidGenLock);
8541 :
8542 : /*
8543 : * We ignore the nextOid counter in an ONLINE checkpoint, preferring
8544 : * to track OID assignment through XLOG_NEXTOID records. The nextOid
8545 : * counter is from the start of the checkpoint and might well be stale
8546 : * compared to later XLOG_NEXTOID records. We could try to take the
8547 : * maximum of the nextOid counter and our latest value, but since
8548 : * there's no particular guarantee about the speed with which the OID
8549 : * counter wraps around, that's a risky thing to do. In any case,
8550 : * users of the nextOid counter are required to avoid assignment of
8551 : * duplicates, so that a somewhat out-of-date value should be safe.
8552 : */
8553 :
8554 : /* Handle multixact */
8555 1342 : MultiXactAdvanceNextMXact(checkPoint.nextMulti,
8556 : checkPoint.nextMultiOffset);
8557 :
8558 : /*
8559 : * NB: This may perform multixact truncation when replaying WAL
8560 : * generated by an older primary.
8561 : */
8562 1342 : MultiXactAdvanceOldest(checkPoint.oldestMulti,
8563 : checkPoint.oldestMultiDB);
8564 1342 : if (TransactionIdPrecedes(TransamVariables->oldestXid,
8565 : checkPoint.oldestXid))
8566 0 : SetTransactionIdLimit(checkPoint.oldestXid,
8567 : checkPoint.oldestXidDB);
8568 : /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
8569 1342 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8570 1342 : ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
8571 1342 : LWLockRelease(ControlFileLock);
8572 :
8573 : /* TLI should not change in an on-line checkpoint */
8574 1342 : (void) GetCurrentReplayRecPtr(&replayTLI);
8575 1342 : if (checkPoint.ThisTimeLineID != replayTLI)
8576 0 : ereport(PANIC,
8577 : (errmsg("unexpected timeline ID %u (should be %u) in online checkpoint record",
8578 : checkPoint.ThisTimeLineID, replayTLI)));
8579 :
8580 1342 : RecoveryRestartPoint(&checkPoint, record);
8581 : }
8582 84624 : else if (info == XLOG_OVERWRITE_CONTRECORD)
8583 : {
8584 : /* nothing to do here, handled in xlogrecovery_redo() */
8585 : }
8586 84622 : else if (info == XLOG_END_OF_RECOVERY)
8587 : {
8588 : xl_end_of_recovery xlrec;
8589 : TimeLineID replayTLI;
8590 :
8591 20 : memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
8592 :
8593 : /*
8594 : * For Hot Standby, we could treat this like a Shutdown Checkpoint,
8595 : * but this case is rarer and harder to test, so the benefit doesn't
8596 : * outweigh the potential extra cost of maintenance.
8597 : */
8598 :
8599 : /*
8600 : * We should've already switched to the new TLI before replaying this
8601 : * record.
8602 : */
8603 20 : (void) GetCurrentReplayRecPtr(&replayTLI);
8604 20 : if (xlrec.ThisTimeLineID != replayTLI)
8605 0 : ereport(PANIC,
8606 : (errmsg("unexpected timeline ID %u (should be %u) in end-of-recovery record",
8607 : xlrec.ThisTimeLineID, replayTLI)));
8608 : }
8609 84602 : else if (info == XLOG_NOOP)
8610 : {
8611 : /* nothing to do here */
8612 : }
8613 84602 : else if (info == XLOG_SWITCH)
8614 : {
8615 : /* nothing to do here */
8616 : }
8617 83718 : else if (info == XLOG_RESTORE_POINT)
8618 : {
8619 : /* nothing to do here, handled in xlogrecovery.c */
8620 : }
8621 83708 : else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
8622 : {
8623 : /*
8624 : * XLOG_FPI records contain nothing else but one or more block
8625 : * references. Every block reference must include a full-page image
8626 : * even if full_page_writes was disabled when the record was generated
8627 : * - otherwise there would be no point in this record.
8628 : *
8629 : * XLOG_FPI_FOR_HINT records are generated when a page needs to be
8630 : * WAL-logged because of a hint bit update. They are only generated
8631 : * when checksums and/or wal_log_hints are enabled. They may include
8632 : * no full-page images if full_page_writes was disabled when they were
8633 : * generated. In this case there is nothing to do here.
8634 : *
8635 : * No recovery conflicts are generated by these generic records - if a
8636 : * resource manager needs to generate conflicts, it has to define a
8637 : * separate WAL record type and redo routine.
8638 : */
8639 173882 : for (uint8 block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
8640 : {
8641 : Buffer buffer;
8642 :
8643 91754 : if (!XLogRecHasBlockImage(record, block_id))
8644 : {
8645 132 : if (info == XLOG_FPI)
8646 0 : elog(ERROR, "XLOG_FPI record did not contain a full-page image");
8647 132 : continue;
8648 : }
8649 :
8650 91622 : if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
8651 0 : elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
8652 91622 : UnlockReleaseBuffer(buffer);
8653 : }
8654 : }
8655 1580 : else if (info == XLOG_BACKUP_END)
8656 : {
8657 : /* nothing to do here, handled in xlogrecovery_redo() */
8658 : }
8659 1410 : else if (info == XLOG_PARAMETER_CHANGE)
8660 : {
8661 : xl_parameter_change xlrec;
8662 :
8663 : /* Update our copy of the parameters in pg_control */
8664 66 : memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
8665 :
8666 : /*
8667 : * Invalidate logical slots if we are in hot standby and the primary
8668 : * does not have a WAL level sufficient for logical decoding. No need
8669 : * to search for potentially conflicting logical slots if the standby is
8670 : * running with wal_level lower than logical, because in that case, we
8671 : * would have either disallowed creation of logical slots or
8672 : * invalidated existing ones.
8673 : */
8674 66 : if (InRecovery && InHotStandby &&
8675 36 : xlrec.wal_level < WAL_LEVEL_LOGICAL &&
8676 14 : wal_level >= WAL_LEVEL_LOGICAL)
8677 10 : InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_LEVEL,
8678 : 0, InvalidOid,
8679 : InvalidTransactionId);
8680 :
8681 66 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8682 66 : ControlFile->MaxConnections = xlrec.MaxConnections;
8683 66 : ControlFile->max_worker_processes = xlrec.max_worker_processes;
8684 66 : ControlFile->max_wal_senders = xlrec.max_wal_senders;
8685 66 : ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
8686 66 : ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
8687 66 : ControlFile->wal_level = xlrec.wal_level;
8688 66 : ControlFile->wal_log_hints = xlrec.wal_log_hints;
8689 :
8690 : /*
8691 : * Update minRecoveryPoint to ensure that if recovery is aborted, we
8692 : * recover back up to this point before allowing hot standby again.
8693 : * This is important if the max_* settings are decreased, to ensure
8694 : * you don't run queries against the WAL preceding the change. The
8695 : * local copies cannot be updated as long as crash recovery is
8696 : * happening and we expect all the WAL to be replayed.
8697 : */
8698 66 : if (InArchiveRecovery)
8699 : {
8700 38 : LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
8701 38 : LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
8702 : }
8703 66 : if (LocalMinRecoveryPoint != InvalidXLogRecPtr && LocalMinRecoveryPoint < lsn)
8704 : {
8705 : TimeLineID replayTLI;
8706 :
8707 14 : (void) GetCurrentReplayRecPtr(&replayTLI);
8708 14 : ControlFile->minRecoveryPoint = lsn;
8709 14 : ControlFile->minRecoveryPointTLI = replayTLI;
8710 : }
8711 :
8712 66 : CommitTsParameterChange(xlrec.track_commit_timestamp,
8713 66 : ControlFile->track_commit_timestamp);
8714 66 : ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
8715 :
8716 66 : UpdateControlFile();
8717 66 : LWLockRelease(ControlFileLock);
8718 :
8719 : /* Check to see if any parameter change gives a problem on recovery */
8720 66 : CheckRequiredParameterValues();
8721 : }
8722 1344 : else if (info == XLOG_FPW_CHANGE)
8723 : {
8724 : bool fpw;
8725 :
8726 0 : memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
8727 :
8728 : /*
8729 : * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
8730 : * do_pg_backup_start() and do_pg_backup_stop() can check whether
8731 : * full_page_writes has been disabled during online backup.
8732 : */
8733 0 : if (!fpw)
8734 : {
8735 0 : SpinLockAcquire(&XLogCtl->info_lck);
8736 0 : if (XLogCtl->lastFpwDisableRecPtr < record->ReadRecPtr)
8737 0 : XLogCtl->lastFpwDisableRecPtr = record->ReadRecPtr;
8738 0 : SpinLockRelease(&XLogCtl->info_lck);
8739 : }
8740 :
8741 : /* Keep track of full_page_writes */
8742 0 : lastFullPageWrites = fpw;
8743 : }
8744 : else if (info == XLOG_CHECKPOINT_REDO)
8745 : {
8746 : /* nothing to do here, just for informational purposes */
8747 : }
8748 86216 : }
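/*
 * For context, a condensed sketch of the primary-side counterpart that
 * emits the XLOG_PARAMETER_CHANGE records replayed above (see
 * XLogReportParameters() in this file; the pg_control update, the
 * XLogFlush() call and the surrounding checks are omitted here):
 *
 *     xl_parameter_change xlrec;
 *
 *     xlrec.MaxConnections = MaxConnections;
 *     xlrec.max_worker_processes = max_worker_processes;
 *     xlrec.max_wal_senders = max_wal_senders;
 *     xlrec.max_prepared_xacts = max_prepared_xacts;
 *     xlrec.max_locks_per_xact = max_locks_per_xact;
 *     xlrec.wal_level = wal_level;
 *     xlrec.wal_log_hints = wal_log_hints;
 *     xlrec.track_commit_timestamp = track_commit_timestamp;
 *
 *     XLogBeginInsert();
 *     XLogRegisterData(&xlrec, sizeof(xlrec));
 *     (void) XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
 */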
8749 :
8750 : /*
8751 : * Return the extra open flags used for opening a file, depending on the
8752 : * value of the GUCs wal_sync_method, fsync and debug_io_direct.
8753 : */
8754 : static int
8755 31196 : get_sync_bit(int method)
8756 : {
8757 31196 : int o_direct_flag = 0;
8758 :
8759 : /*
8760 : * Use O_DIRECT if requested, except in walreceiver process. The WAL
8761 : * written by walreceiver is normally read by the startup process soon
8762 : * after it's written. Also, walreceiver performs unaligned writes, which
8763 : * don't work with O_DIRECT, so skipping it there is required for correctness too.
8764 : */
8765 31196 : if ((io_direct_flags & IO_DIRECT_WAL) && !AmWalReceiverProcess())
8766 18 : o_direct_flag = PG_O_DIRECT;
8767 :
8768 : /* If fsync is disabled, never open in sync mode */
8769 31196 : if (!enableFsync)
8770 31196 : return o_direct_flag;
8771 :
8772 0 : switch (method)
8773 : {
8774 : /*
8775 : * enum values for all sync options are defined even if they are
8776 : * not supported on the current platform. But if not, they are
8777 : * not included in the enum option array, and therefore will never
8778 : * be seen here.
8779 : */
8780 0 : case WAL_SYNC_METHOD_FSYNC:
8781 : case WAL_SYNC_METHOD_FSYNC_WRITETHROUGH:
8782 : case WAL_SYNC_METHOD_FDATASYNC:
8783 0 : return o_direct_flag;
8784 : #ifdef O_SYNC
8785 0 : case WAL_SYNC_METHOD_OPEN:
8786 0 : return O_SYNC | o_direct_flag;
8787 : #endif
8788 : #ifdef O_DSYNC
8789 0 : case WAL_SYNC_METHOD_OPEN_DSYNC:
8790 0 : return O_DSYNC | o_direct_flag;
8791 : #endif
8792 0 : default:
8793 : /* can't happen (unless we are out of sync with option array) */
8794 0 : elog(ERROR, "unrecognized \"wal_sync_method\": %d", method);
8795 : return 0; /* silence warning */
8796 : }
8797 : }
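/*
 * Illustrative use of get_sync_bit(): callers fold the returned bits into
 * the flags used to open WAL segment files, so that O_SYNC/O_DSYNC (and
 * optionally PG_O_DIRECT) take effect on every subsequent write. Roughly,
 * as in XLogFileOpen() in this file (error handling omitted):
 *
 *     fd = BasicOpenFile(path,
 *                        O_RDWR | PG_BINARY | get_sync_bit(wal_sync_method));
 */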
8798 :
8799 : /*
8800 : * GUC support
8801 : */
8802 : void
8803 2204 : assign_wal_sync_method(int new_wal_sync_method, void *extra)
8804 : {
8805 2204 : if (wal_sync_method != new_wal_sync_method)
8806 : {
8807 : /*
8808 : * To ensure that no blocks escape unsynced, force an fsync on the
8809 : * currently open log segment (if any). Also, if the open flag is
8810 : * changing, close the log file so it will be reopened (with new flag
8811 : * bit) at next use.
8812 : */
8813 0 : if (openLogFile >= 0)
8814 : {
8815 0 : pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
8816 0 : if (pg_fsync(openLogFile) != 0)
8817 : {
8818 : char xlogfname[MAXFNAMELEN];
8819 : int save_errno;
8820 :
8821 0 : save_errno = errno;
8822 0 : XLogFileName(xlogfname, openLogTLI, openLogSegNo,
8823 : wal_segment_size);
8824 0 : errno = save_errno;
8825 0 : ereport(PANIC,
8826 : (errcode_for_file_access(),
8827 : errmsg("could not fsync file \"%s\": %m", xlogfname)));
8828 : }
8829 :
8830 0 : pgstat_report_wait_end();
8831 0 : if (get_sync_bit(wal_sync_method) != get_sync_bit(new_wal_sync_method))
8832 0 : XLogFileClose();
8833 : }
8834 : }
8835 2204 : }
8836 :
8837 :
8838 : /*
8839 : * Issue appropriate kind of fsync (if any) for an XLOG output file.
8840 : *
8841 : * 'fd' is a file descriptor for the XLOG file to be fsync'd.
8842 : * 'segno' is for error reporting purposes.
8843 : */
8844 : void
8845 306632 : issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli)
8846 : {
8847 306632 : char *msg = NULL;
8848 : instr_time start;
8849 :
8850 : Assert(tli != 0);
8851 :
8852 : /*
8853 : * Quick exit if fsync is disabled or write() has already synced the WAL
8854 : * file.
8855 : */
8856 306632 : if (!enableFsync ||
8857 0 : wal_sync_method == WAL_SYNC_METHOD_OPEN ||
8858 0 : wal_sync_method == WAL_SYNC_METHOD_OPEN_DSYNC)
8859 306632 : return;
8860 :
8861 : /*
8862 : * Measure I/O timing to sync the WAL file for pg_stat_io.
8863 : */
8864 0 : start = pgstat_prepare_io_time(track_wal_io_timing);
8865 :
8866 0 : pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
8867 0 : switch (wal_sync_method)
8868 : {
8869 0 : case WAL_SYNC_METHOD_FSYNC:
8870 0 : if (pg_fsync_no_writethrough(fd) != 0)
8871 0 : msg = _("could not fsync file \"%s\": %m");
8872 0 : break;
8873 : #ifdef HAVE_FSYNC_WRITETHROUGH
8874 : case WAL_SYNC_METHOD_FSYNC_WRITETHROUGH:
8875 : if (pg_fsync_writethrough(fd) != 0)
8876 : msg = _("could not fsync write-through file \"%s\": %m");
8877 : break;
8878 : #endif
8879 0 : case WAL_SYNC_METHOD_FDATASYNC:
8880 0 : if (pg_fdatasync(fd) != 0)
8881 0 : msg = _("could not fdatasync file \"%s\": %m");
8882 0 : break;
8883 0 : case WAL_SYNC_METHOD_OPEN:
8884 : case WAL_SYNC_METHOD_OPEN_DSYNC:
8885 : /* not reachable */
8886 : Assert(false);
8887 0 : break;
8888 0 : default:
8889 0 : ereport(PANIC,
8890 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
8891 : errmsg_internal("unrecognized \"wal_sync_method\": %d", wal_sync_method));
8892 : break;
8893 : }
8894 :
8895 : /* PANIC if failed to fsync */
8896 0 : if (msg)
8897 : {
8898 : char xlogfname[MAXFNAMELEN];
8899 0 : int save_errno = errno;
8900 :
8901 0 : XLogFileName(xlogfname, tli, segno, wal_segment_size);
8902 0 : errno = save_errno;
8903 0 : ereport(PANIC,
8904 : (errcode_for_file_access(),
8905 : errmsg(msg, xlogfname)));
8906 : }
8907 :
8908 0 : pgstat_report_wait_end();
8909 :
8910 0 : pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_FSYNC,
8911 : start, 1, 0);
8912 : }
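/*
 * Typical call site, in outline: after XLogWrite() has flushed WAL buffers
 * out to the current segment, it syncs the still-open file descriptor (see
 * XLogWrite() in this file; the surrounding logic is omitted):
 *
 *     issue_xlog_fsync(openLogFile, openLogSegNo, tli);
 */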
8913 :
8914 : /*
8915 : * do_pg_backup_start is the workhorse of the user-visible pg_backup_start()
8916 : * function. It creates the necessary starting checkpoint and constructs the
8917 : * backup state and tablespace map.
8918 : *
8919 : * Input parameters are "state" (the backup state), "fast" (if true, we do
8920 : * the checkpoint in fast mode), and "tablespaces" (if non-NULL, indicates a
8921 : * list of tablespaceinfo structs describing the cluster's tablespaces).
8922 : *
8923 : * The tablespace map contents are appended to passed-in parameter
8924 : * tablespace_map and the caller is responsible for including it in the backup
8925 : * archive as 'tablespace_map'. The tablespace_map file is required mainly for
8926 : * tar format on Windows, as native Windows utilities are not able to create
8927 : * symlinks while extracting files from tar. However, for consistency and
8928 : * platform-independence, we do it the same way everywhere.
8929 : *
8930 : * It fills in "state" with the information required for the backup, such
8931 : * as the minimum WAL location that must be present to restore from this
8932 : * backup (startpoint) and the corresponding timeline ID (starttli).
8933 : *
8934 : * Every successfully started backup must be stopped by calling
8935 : * do_pg_backup_stop() or do_pg_abort_backup(). There can be many
8936 : * backups active at the same time.
8937 : *
8938 : * It is the responsibility of the caller of this function to verify the
8939 : * permissions of the calling user!
8940 : */
8941 : void
8942 322 : do_pg_backup_start(const char *backupidstr, bool fast, List **tablespaces,
8943 : BackupState *state, StringInfo tblspcmapfile)
8944 : {
8945 : bool backup_started_in_recovery;
8946 :
8947 : Assert(state != NULL);
8948 322 : backup_started_in_recovery = RecoveryInProgress();
8949 :
8950 : /*
8951 : * During recovery, we don't need to check the WAL level: if it were
8952 : * not sufficient, it would be impossible to get here during recovery.
8953 : */
8954 322 : if (!backup_started_in_recovery && !XLogIsNeeded())
8955 0 : ereport(ERROR,
8956 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8957 : errmsg("WAL level not sufficient for making an online backup"),
8958 : errhint("\"wal_level\" must be set to \"replica\" or \"logical\" at server start.")));
8959 :
8960 322 : if (strlen(backupidstr) > MAXPGPATH)
8961 2 : ereport(ERROR,
8962 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
8963 : errmsg("backup label too long (max %d bytes)",
8964 : MAXPGPATH)));
8965 :
8966 320 : strlcpy(state->name, backupidstr, sizeof(state->name));
8967 :
8968 : /*
8969 : * Mark backup active in shared memory. We must do full-page WAL writes
8970 : * during an on-line backup even if not doing so at other times, because
8971 : * it's quite possible for the backup dump to obtain a "torn" (partially
8972 : * written) copy of a database page if it reads the page concurrently with
8973 : * our write to the same page. This can be fixed as long as the first
8974 : * write to the page in the WAL sequence is a full-page write. Hence, we
8975 : * increment runningBackups then force a CHECKPOINT, to ensure there are
8976 : * no dirty pages in shared memory that might get dumped while the backup
8977 : * is in progress without having a corresponding WAL record. (Once the
8978 : * backup is complete, we need not force full-page writes anymore, since
8979 : * we expect that any pages not modified during the backup interval must
8980 : * have been correctly captured by the backup.)
8981 : *
8982 : * Note that forcing full-page writes has no effect during an online
8983 : * backup from the standby.
8984 : *
8985 : * We must hold all the insertion locks to change the value of
8986 : * runningBackups, to ensure adequate interlocking against
8987 : * XLogInsertRecord().
8988 : */
8989 320 : WALInsertLockAcquireExclusive();
8990 320 : XLogCtl->Insert.runningBackups++;
8991 320 : WALInsertLockRelease();
8992 :
8993 : /*
8994 : * Ensure we decrement runningBackups if we fail below. NB -- for this to
8995 : * work correctly, it is critical that sessionBackupState is only updated
8996 : * after this block is over.
8997 : */
8998 320 : PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(true));
8999 : {
9000 320 : bool gotUniqueStartpoint = false;
9001 : DIR *tblspcdir;
9002 : struct dirent *de;
9003 : tablespaceinfo *ti;
9004 : int datadirpathlen;
9005 :
9006 : /*
9007 : * Force an XLOG file switch before the checkpoint, to ensure that the
9008 : * WAL segment the checkpoint is written to doesn't contain pages with
9009 : * old timeline IDs. That would otherwise happen if you called
9010 : * pg_backup_start() right after restoring from a PITR archive: the
9011 : * first WAL segment containing the startup checkpoint has pages in
9012 : * the beginning with the old timeline ID. That can cause trouble at
9013 : * recovery: we won't have a history file covering the old timeline if
9014 : * the pg_wal directory was not included in the base backup and the WAL
9015 : * archive was cleared too before starting the backup.
9016 : *
9017 : * This also ensures that we have emitted a WAL page header that has
9018 : * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
9019 : * Therefore, if a WAL archiver (such as pglesslog) is trying to
9020 : * compress out removable backup blocks, it won't remove any that
9021 : * occur after this point.
9022 : *
9023 : * During recovery, we skip forcing XLOG file switch, which means that
9024 : * the backup taken during recovery is not available for the special
9025 : * recovery case described above.
9026 : */
9027 320 : if (!backup_started_in_recovery)
9028 306 : RequestXLogSwitch(false);
9029 :
9030 : do
9031 : {
9032 : bool checkpointfpw;
9033 :
9034 : /*
9035 : * Force a CHECKPOINT. Aside from being necessary to prevent torn
9036 : * page problems, this guarantees that two successive backup runs
9037 : * will have different checkpoint positions and hence different
9038 : * history file names, even if nothing happened in between.
9039 : *
9040 : * During recovery, establish a restartpoint if possible. We use
9041 : * the last restartpoint as the backup starting checkpoint. This
9042 : * means that two successive backup runs can have the same checkpoint
9043 : * positions.
9044 : *
9045 : * Since the fact that we are executing do_pg_backup_start()
9046 : * during recovery means that checkpointer is running, we can use
9047 : * RequestCheckpoint() to establish a restartpoint.
9048 : *
9049 : * We use CHECKPOINT_FAST only if requested by user (via passing
9050 : * fast = true). Otherwise this can take a while.
9051 : */
9052 320 : RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
9053 : (fast ? CHECKPOINT_FAST : 0));
9054 :
9055 : /*
9056 : * Now we need to fetch the checkpoint record location, and also
9057 : * its REDO pointer. The oldest point in WAL that would be needed
9058 : * to restore starting from the checkpoint is precisely the REDO
9059 : * pointer.
9060 : */
9061 320 : LWLockAcquire(ControlFileLock, LW_SHARED);
9062 320 : state->checkpointloc = ControlFile->checkPoint;
9063 320 : state->startpoint = ControlFile->checkPointCopy.redo;
9064 320 : state->starttli = ControlFile->checkPointCopy.ThisTimeLineID;
9065 320 : checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
9066 320 : LWLockRelease(ControlFileLock);
9067 :
9068 320 : if (backup_started_in_recovery)
9069 : {
9070 : XLogRecPtr recptr;
9071 :
9072 : /*
9073 : * Check to see if all WAL replayed during online backup
9074 : * (i.e., since last restartpoint used as backup starting
9075 : * checkpoint) contain full-page writes.
9076 : */
9077 14 : SpinLockAcquire(&XLogCtl->info_lck);
9078 14 : recptr = XLogCtl->lastFpwDisableRecPtr;
9079 14 : SpinLockRelease(&XLogCtl->info_lck);
9080 :
9081 14 : if (!checkpointfpw || state->startpoint <= recptr)
9082 0 : ereport(ERROR,
9083 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9084 : errmsg("WAL generated with \"full_page_writes=off\" was replayed "
9085 : "since last restartpoint"),
9086 : errhint("This means that the backup being taken on the standby "
9087 : "is corrupt and should not be used. "
9088 : "Enable \"full_page_writes\" and run CHECKPOINT on the primary, "
9089 : "and then try an online backup again.")));
9090 :
9091 : /*
9092 : * During recovery, since we don't use the end-of-backup WAL
9093 : * record and don't write the backup history file, the
9094 : * starting WAL location doesn't need to be unique. This means
9095 : * that two base backups started at the same time might use
9096 : * the same checkpoint as starting locations.
9097 : */
9098 14 : gotUniqueStartpoint = true;
9099 : }
9100 :
9101 : /*
9102 : * If two base backups are started at the same time (in WAL sender
9103 : * processes), we need to make sure that they use different
9104 : * checkpoints as starting locations, because we use the starting
9105 : * WAL location as a unique identifier for the base backup in the
9106 : * end-of-backup WAL record and when we write the backup history
9107 : * file. Perhaps it would be better to generate a separate unique ID
9108 : * for each backup instead of forcing another checkpoint, but
9109 : * taking a checkpoint right after another is not that expensive
9110 : * either, because only a few buffers have been dirtied yet.
9111 : */
9112 320 : WALInsertLockAcquireExclusive();
9113 320 : if (XLogCtl->Insert.lastBackupStart < state->startpoint)
9114 : {
9115 320 : XLogCtl->Insert.lastBackupStart = state->startpoint;
9116 320 : gotUniqueStartpoint = true;
9117 : }
9118 320 : WALInsertLockRelease();
9119 320 : } while (!gotUniqueStartpoint);
9120 :
9121 : /*
9122 : * Construct tablespace_map file.
9123 : */
9124 320 : datadirpathlen = strlen(DataDir);
9125 :
9126 : /* Collect information about all tablespaces */
9127 320 : tblspcdir = AllocateDir(PG_TBLSPC_DIR);
9128 1032 : while ((de = ReadDir(tblspcdir, PG_TBLSPC_DIR)) != NULL)
9129 : {
9130 : char fullpath[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
9131 : char linkpath[MAXPGPATH];
9132 712 : char *relpath = NULL;
9133 : char *s;
9134 : PGFileType de_type;
9135 : char *badp;
9136 : Oid tsoid;
9137 :
9138 : /*
9139 : * Try to parse the directory name as an unsigned integer.
9140 : *
9141 : * Tablespace directories should be positive integers that can be
9142 : * represented in 32 bits, with no leading zeroes or trailing
9143 : * garbage. If we come across a name that doesn't meet those
9144 : * criteria, skip it.
9145 : */
9146 712 : if (de->d_name[0] < '1' || de->d_name[1] > '9')
9147 640 : continue;
9148 72 : errno = 0;
9149 72 : tsoid = strtoul(de->d_name, &badp, 10);
9150 72 : if (*badp != '\0' || errno == EINVAL || errno == ERANGE)
9151 0 : continue;
9152 :
9153 72 : snprintf(fullpath, sizeof(fullpath), "%s/%s", PG_TBLSPC_DIR, de->d_name);
9154 :
9155 72 : de_type = get_dirent_type(fullpath, de, false, ERROR);
9156 :
9157 72 : if (de_type == PGFILETYPE_LNK)
9158 : {
9159 : StringInfoData escapedpath;
9160 : int rllen;
9161 :
9162 44 : rllen = readlink(fullpath, linkpath, sizeof(linkpath));
9163 44 : if (rllen < 0)
9164 : {
9165 0 : ereport(WARNING,
9166 : (errmsg("could not read symbolic link \"%s\": %m",
9167 : fullpath)));
9168 0 : continue;
9169 : }
9170 44 : else if (rllen >= sizeof(linkpath))
9171 : {
9172 0 : ereport(WARNING,
9173 : (errmsg("symbolic link \"%s\" target is too long",
9174 : fullpath)));
9175 0 : continue;
9176 : }
9177 44 : linkpath[rllen] = '\0';
9178 :
9179 : /*
9180 : * Relpath holds the relative path of the tablespace directory
9181 : * when it's located within PGDATA, or NULL if it's located
9182 : * elsewhere.
9183 : */
9184 44 : if (rllen > datadirpathlen &&
9185 2 : strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
9186 0 : IS_DIR_SEP(linkpath[datadirpathlen]))
9187 0 : relpath = pstrdup(linkpath + datadirpathlen + 1);
9188 :
9189 : /*
9190 : * Add a backslash-escaped version of the link path to the
9191 : * tablespace map file.
9192 : */
9193 44 : initStringInfo(&escapedpath);
9194 1092 : for (s = linkpath; *s; s++)
9195 : {
9196 1048 : if (*s == '\n' || *s == '\r' || *s == '\\')
9197 0 : appendStringInfoChar(&escapedpath, '\\');
9198 1048 : appendStringInfoChar(&escapedpath, *s);
9199 : }
9200 44 : appendStringInfo(tblspcmapfile, "%s %s\n",
9201 44 : de->d_name, escapedpath.data);
9202 44 : pfree(escapedpath.data);
9203 : }
9204 28 : else if (de_type == PGFILETYPE_DIR)
9205 : {
9206 : /*
9207 : * It's possible to use allow_in_place_tablespaces to create
9208 : * directories directly under pg_tblspc, for testing purposes
9209 : * only.
9210 : *
9211 : * In this case, we store a relative path rather than an
9212 : * absolute path into the tablespaceinfo.
9213 : */
9214 28 : snprintf(linkpath, sizeof(linkpath), "%s/%s",
9215 28 : PG_TBLSPC_DIR, de->d_name);
9216 28 : relpath = pstrdup(linkpath);
9217 : }
9218 : else
9219 : {
9220 : /* Skip any other file type that appears here. */
9221 0 : continue;
9222 : }
9223 :
9224 72 : ti = palloc(sizeof(tablespaceinfo));
9225 72 : ti->oid = tsoid;
9226 72 : ti->path = pstrdup(linkpath);
9227 72 : ti->rpath = relpath;
9228 72 : ti->size = -1;
9229 :
9230 72 : if (tablespaces)
9231 72 : *tablespaces = lappend(*tablespaces, ti);
9232 : }
9233 320 : FreeDir(tblspcdir);
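/*
 * The resulting tablespace_map file contains one "<oid> <path>" line per
 * tablespace, with backslashes, newlines and carriage returns in the path
 * backslash-escaped as above. A hypothetical example line (OID and path
 * invented for illustration):
 *
 *     16395 /mnt/fast_ssd/pgts1
 */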
9234 :
9235 320 : state->starttime = (pg_time_t) time(NULL);
9236 : }
9237 320 : PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(true));
9238 :
9239 320 : state->started_in_recovery = backup_started_in_recovery;
9240 :
9241 : /*
9242 : * Mark that the start phase has correctly finished for the backup.
9243 : */
9244 320 : sessionBackupState = SESSION_BACKUP_RUNNING;
9245 320 : }
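/*
 * For reference, a condensed and abridged sketch of how the SQL-callable
 * pg_backup_start() (xlogfuncs.c) drives this function; memory-context
 * management and argument parsing are omitted:
 *
 *     BackupState *state = palloc0(sizeof(BackupState));
 *     StringInfo  tblspc_map = makeStringInfo();
 *
 *     register_persistent_abort_backup_handler();
 *     do_pg_backup_start(backupidstr, fast, NULL, state, tblspc_map);
 *
 *     ... the session is now in SESSION_BACKUP_RUNNING state ...
 */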
9246 :
9247 : /*
9248 : * Utility routine to fetch the session-level status of a backup running.
9249 : */
9250 : SessionBackupState
9251 362 : get_backup_status(void)
9252 : {
9253 362 : return sessionBackupState;
9254 : }
9255 :
9256 : /*
9257 : * do_pg_backup_stop
9258 : *
9259 : * Utility function called at the end of an online backup. It creates the backup
9260 : * history file (if required), resets sessionBackupState and so on. It can optionally
9261 : * wait for WAL segments to be archived.
9262 : *
9263 : * "state" is filled with the information necessary to restore from this
9264 : * backup with its stop LSN (stoppoint), its timeline ID (stoptli), etc.
9265 : *
9266 : * It is the responsibility of the caller of this function to verify the
9267 : * permissions of the calling user!
9268 : */
9269 : void
9270 308 : do_pg_backup_stop(BackupState *state, bool waitforarchive)
9271 : {
9272 308 : bool backup_stopped_in_recovery = false;
9273 : char histfilepath[MAXPGPATH];
9274 : char lastxlogfilename[MAXFNAMELEN];
9275 : char histfilename[MAXFNAMELEN];
9276 : XLogSegNo _logSegNo;
9277 : FILE *fp;
9278 : int seconds_before_warning;
9279 308 : int waits = 0;
9280 308 : bool reported_waiting = false;
9281 :
9282 : Assert(state != NULL);
9283 :
9284 308 : backup_stopped_in_recovery = RecoveryInProgress();
9285 :
9286 : /*
9287 : * During recovery, we don't need to check the WAL level: if it were
9288 : * not sufficient, it would be impossible to get here during recovery.
9289 : */
9290 308 : if (!backup_stopped_in_recovery && !XLogIsNeeded())
9291 0 : ereport(ERROR,
9292 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9293 : errmsg("WAL level not sufficient for making an online backup"),
9294 : errhint("\"wal_level\" must be set to \"replica\" or \"logical\" at server start.")));
9295 :
9296 : /*
9297 : * OK to update backup counter and session-level lock.
9298 : *
9299 : * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them,
9300 : * otherwise they can be updated inconsistently, which might cause
9301 : * do_pg_abort_backup() to fail.
9302 : */
9303 308 : WALInsertLockAcquireExclusive();
9304 :
9305 : /*
9306 : * It is expected that each do_pg_backup_start() call is matched by
9307 : * exactly one do_pg_backup_stop() call.
9308 : */
9309 : Assert(XLogCtl->Insert.runningBackups > 0);
9310 308 : XLogCtl->Insert.runningBackups--;
9311 :
9312 : /*
9313 : * Clean up session-level lock.
9314 : *
9315 : * You might think that WALInsertLockRelease() can be called before
9316 : * cleaning up session-level lock because session-level lock doesn't need
9317 : * to be protected with WAL insertion lock. But since
9318 : * CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
9319 : * cleaned up before it.
9320 : */
9321 308 : sessionBackupState = SESSION_BACKUP_NONE;
9322 :
9323 308 : WALInsertLockRelease();
9324 :
9325 : /*
9326 : * If we are taking an online backup from the standby, we confirm that the
9327 : * standby has not been promoted during the backup.
9328 : */
9329 308 : if (state->started_in_recovery && !backup_stopped_in_recovery)
9330 0 : ereport(ERROR,
9331 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9332 : errmsg("the standby was promoted during online backup"),
9333 : errhint("This means that the backup being taken is corrupt "
9334 : "and should not be used. "
9335 : "Try taking another online backup.")));
9336 :
9337 : /*
9338 : * During recovery, we don't write an end-of-backup record. We assume that
9339 : * pg_control was backed up last and its minimum recovery point can be
9340 : * used as the backup end location. Since we don't have an
9341 : * end-of-backup record, we use the pg_control value to check whether
9342 : * we've reached the end of backup when starting recovery from this
9343 : * backup. We have no way of checking if pg_control wasn't backed up last
9344 : * however.
9345 : *
9346 : * We don't force a switch to new WAL file but it is still possible to
9347 : * wait for all the required files to be archived if waitforarchive is
9348 : * true. This is okay if we use the backup to start a standby and fetch
9349 : * the missing WAL using streaming replication. But in the case of an
9350 : * archive recovery, a user should set waitforarchive to true and wait for
9351 : * them to be archived to ensure that all the required files are
9352 : * available.
9353 : *
9354 : * We return the current minimum recovery point as the backup end
9355 : * location. Note that it can be greater than the exact backup end
9356 : * location if the minimum recovery point is updated after the backup of
9357 : * pg_control. This is harmless for current uses.
9358 : *
9359 : * XXX currently a backup history file is for informational and debug
9360 : * purposes only. It's not essential for an online backup. Furthermore,
9361 : * even if it's created, it will not be archived during recovery because
9362 : * an archiver is not invoked. So it doesn't seem worthwhile to write a
9363 : * backup history file during recovery.
9364 : */
9365 308 : if (backup_stopped_in_recovery)
9366 : {
9367 : XLogRecPtr recptr;
9368 :
9369 : /*
9370 : * Check to see if all WAL replayed during online backup contain
9371 : * full-page writes.
9372 : */
9373 14 : SpinLockAcquire(&XLogCtl->info_lck);
9374 14 : recptr = XLogCtl->lastFpwDisableRecPtr;
9375 14 : SpinLockRelease(&XLogCtl->info_lck);
9376 :
9377 14 : if (state->startpoint <= recptr)
9378 0 : ereport(ERROR,
9379 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9380 : errmsg("WAL generated with \"full_page_writes=off\" was replayed "
9381 : "during online backup"),
9382 : errhint("This means that the backup being taken on the standby "
9383 : "is corrupt and should not be used. "
9384 : "Enable \"full_page_writes\" and run CHECKPOINT on the primary, "
9385 : "and then try an online backup again.")));
9386 :
9387 :
9388 14 : LWLockAcquire(ControlFileLock, LW_SHARED);
9389 14 : state->stoppoint = ControlFile->minRecoveryPoint;
9390 14 : state->stoptli = ControlFile->minRecoveryPointTLI;
9391 14 : LWLockRelease(ControlFileLock);
9392 : }
9393 : else
9394 : {
9395 : char *history_file;
9396 :
9397 : /*
9398 : * Write the backup-end xlog record
9399 : */
9400 294 : XLogBeginInsert();
9401 294 : XLogRegisterData(&state->startpoint,
9402 : sizeof(state->startpoint));
9403 294 : state->stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
9404 :
9405 : /*
9406 : * Given that we're not in recovery, InsertTimeLineID is set and can't
9407 : * change, so we can read it without a lock.
9408 : */
9409 294 : state->stoptli = XLogCtl->InsertTimeLineID;
9410 :
9411 : /*
9412 : * Force a switch to a new xlog segment file, so that the backup is
9413 : * valid as soon as archiver moves out the current segment file.
9414 : */
9415 294 : RequestXLogSwitch(false);
9416 :
9417 294 : state->stoptime = (pg_time_t) time(NULL);
9418 :
9419 : /*
9420 : * Write the backup history file
9421 : */
9422 294 : XLByteToSeg(state->startpoint, _logSegNo, wal_segment_size);
9423 294 : BackupHistoryFilePath(histfilepath, state->stoptli, _logSegNo,
9424 : state->startpoint, wal_segment_size);
9425 294 : fp = AllocateFile(histfilepath, "w");
9426 294 : if (!fp)
9427 0 : ereport(ERROR,
9428 : (errcode_for_file_access(),
9429 : errmsg("could not create file \"%s\": %m",
9430 : histfilepath)));
9431 :
9432 : /* Build and save the contents of the backup history file */
9433 294 : history_file = build_backup_content(state, true);
9434 294 : fprintf(fp, "%s", history_file);
9435 294 : pfree(history_file);
9436 :
9437 294 : if (fflush(fp) || ferror(fp) || FreeFile(fp))
9438 0 : ereport(ERROR,
9439 : (errcode_for_file_access(),
9440 : errmsg("could not write file \"%s\": %m",
9441 : histfilepath)));
9442 :
9443 : /*
9444 : * Clean out any no-longer-needed history files. As a side effect,
9445 : * this will post a .ready file for the newly created history file,
9446 : * notifying the archiver that the history file may be archived
9447 : * immediately.
9448 : */
9449 294 : CleanupBackupHistory();
9450 : }
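/*
 * For illustration, the history file is named after the stop timeline, the
 * segment containing the backup start point, and the start point's offset
 * within that segment ("%08X%08X%08X.%08X.backup"); a hypothetical example:
 *
 *     000000010000000000000002.00000060.backup
 */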
9451 :
9452 : /*
9453 : * If archiving is enabled, wait for all the required WAL files to be
9454 : * archived before returning. If archiving isn't enabled, the required WAL
9455 : * needs to be transported via streaming replication (hopefully with
9456 : * wal_keep_size set high enough), or some more exotic mechanism like
9457 : * polling and copying files from pg_wal with a script. We have no knowledge
9458 : * of those mechanisms, so it's up to the user to ensure they get all
9459 : * the required WAL.
9460 : *
9461 : * We wait until both the last WAL file filled during backup and the
9462 : * history file have been archived, and assume that the alphabetic sorting
9463 : * property of the WAL files ensures any earlier WAL files are safely
9464 : * archived as well.
9465 : *
9466 : * We wait forever, since archive_command is supposed to work and we
9467 : * assume the admin wanted their backup to work completely. If you don't
9468 : * wish to wait, then either waitforarchive should be passed in as false,
9469 : * or you can set statement_timeout. Also, some notices are issued to
9470 : * clue in anyone who might be doing this interactively.
9471 : */
9472 :
9473 308 : if (waitforarchive &&
9474 20 : ((!backup_stopped_in_recovery && XLogArchivingActive()) ||
9475 2 : (backup_stopped_in_recovery && XLogArchivingAlways())))
9476 : {
9477 8 : XLByteToPrevSeg(state->stoppoint, _logSegNo, wal_segment_size);
9478 8 : XLogFileName(lastxlogfilename, state->stoptli, _logSegNo,
9479 : wal_segment_size);
9480 :
9481 8 : XLByteToSeg(state->startpoint, _logSegNo, wal_segment_size);
9482 8 : BackupHistoryFileName(histfilename, state->stoptli, _logSegNo,
9483 : state->startpoint, wal_segment_size);
9484 :
9485 8 : seconds_before_warning = 60;
9486 8 : waits = 0;
9487 :
9488 24 : while (XLogArchiveIsBusy(lastxlogfilename) ||
9489 8 : XLogArchiveIsBusy(histfilename))
9490 : {
9491 8 : CHECK_FOR_INTERRUPTS();
9492 :
9493 8 : if (!reported_waiting && waits > 5)
9494 : {
9495 0 : ereport(NOTICE,
9496 : (errmsg("base backup done, waiting for required WAL segments to be archived")));
9497 0 : reported_waiting = true;
9498 : }
9499 :
9500 8 : (void) WaitLatch(MyLatch,
9501 : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
9502 : 1000L,
9503 : WAIT_EVENT_BACKUP_WAIT_WAL_ARCHIVE);
9504 8 : ResetLatch(MyLatch);
9505 :
9506 8 : if (++waits >= seconds_before_warning)
9507 : {
9508 0 : seconds_before_warning *= 2; /* This wraps in >10 years... */
9509 0 : ereport(WARNING,
9510 : (errmsg("still waiting for all required WAL segments to be archived (%d seconds elapsed)",
9511 : waits),
9512 : errhint("Check that your \"archive_command\" is executing properly. "
9513 : "You can safely cancel this backup, "
9514 : "but the database backup will not be usable without all the WAL segments.")));
9515 : }
9516 : }
9517 :
9518 8 : ereport(NOTICE,
9519 : (errmsg("all required WAL segments have been archived")));
9520 : }
9521 300 : else if (waitforarchive)
9522 12 : ereport(NOTICE,
9523 : (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
9524 308 : }
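/*
 * End to end, the start/stop pair above backs the SQL-level backup API;
 * a typical interactive session looks roughly like this:
 *
 *     SELECT pg_backup_start(label => 'nightly', fast => true);
 *     -- copy the data directory with an external tool
 *     SELECT * FROM pg_backup_stop(wait_for_archive => true);
 */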
9525 :
9526 :
9527 : /*
9528 : * do_pg_abort_backup: abort a running backup
9529 : *
9530 : * This does just the most basic steps of do_pg_backup_stop(), by taking the
9531 : * system out of backup mode, thus making it a lot more safe to call from
9532 : * an error handler.
9533 : *
9534 : * 'arg' set to true indicates that it's being called during backup setup; so
9535 : * sessionBackupState has not been modified yet, but runningBackups has
9536 : * already been incremented. When it's false, it's invoked as a
9537 : * before_shmem_exit handler, and therefore we must not change state
9538 : * unless sessionBackupState indicates that a backup is actually running.
9539 : *
9540 : * NB: This gets used as a PG_ENSURE_ERROR_CLEANUP callback and
9541 : * before_shmem_exit handler, hence the odd-looking signature.
9542 : */
9543 : void
9544 16 : do_pg_abort_backup(int code, Datum arg)
9545 : {
9546 16 : bool during_backup_start = DatumGetBool(arg);
9547 :
9548 : /* If called during backup start, there shouldn't be one already running */
9549 : Assert(!during_backup_start || sessionBackupState == SESSION_BACKUP_NONE);
9550 :
9551 16 : if (during_backup_start || sessionBackupState != SESSION_BACKUP_NONE)
9552 : {
9553 12 : WALInsertLockAcquireExclusive();
9554 : Assert(XLogCtl->Insert.runningBackups > 0);
9555 12 : XLogCtl->Insert.runningBackups--;
9556 :
9557 12 : sessionBackupState = SESSION_BACKUP_NONE;
9558 12 : WALInsertLockRelease();
9559 :
9560 12 : if (!during_backup_start)
9561 12 : ereport(WARNING,
9562 : errmsg("aborting backup due to backend exiting before pg_backup_stop was called"));
9563 : }
9564 16 : }
9565 :
9566 : /*
9567 : * Register a handler that will warn about unterminated backups at end of
9568 : * session, unless this has already been done.
9569 : */
9570 : void
9571 8 : register_persistent_abort_backup_handler(void)
9572 : {
9573 : static bool already_done = false;
9574 :
9575 8 : if (already_done)
9576 2 : return;
9577 6 : before_shmem_exit(do_pg_abort_backup, BoolGetDatum(false));
9578 6 : already_done = true;
9579 : }
9580 :
9581 : /*
9582 : * Get latest WAL insert pointer
9583 : */
9584 : XLogRecPtr
9585 3952 : GetXLogInsertRecPtr(void)
9586 : {
9587 3952 : XLogCtlInsert *Insert = &XLogCtl->Insert;
9588 : uint64 current_bytepos;
9589 :
9590 3952 : SpinLockAcquire(&Insert->insertpos_lck);
9591 3952 : current_bytepos = Insert->CurrBytePos;
9592 3952 : SpinLockRelease(&Insert->insertpos_lck);
9593 :
9594 3952 : return XLogBytePosToRecPtr(current_bytepos);
9595 : }
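/*
 * Note that CurrBytePos counts only usable WAL bytes, excluding page
 * headers; XLogBytePosToRecPtr() converts it back into an address in the
 * WAL stream. In outline (cf. XLogBytePosToRecPtr() in this file):
 *
 *     fullsegs  = bytepos / UsableBytesInSegment;
 *     bytesleft = bytepos % UsableBytesInSegment;
 *     ... then add back one short page header per page crossed, plus the
 *     long header on each segment's first page ...
 */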
9596 :
9597 : /*
9598 : * Get latest WAL write pointer
9599 : */
9600 : XLogRecPtr
9601 3320 : GetXLogWriteRecPtr(void)
9602 : {
9603 3320 : RefreshXLogWriteResult(LogwrtResult);
9604 :
9605 3320 : return LogwrtResult.Write;
9606 : }
9607 :
9608 : /*
9609 : * Returns the redo pointer of the last checkpoint or restartpoint. This is
9610 : * the oldest point in WAL that we still need, if we have to restart recovery.
9611 : */
9612 : void
9613 784 : GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
9614 : {
9615 784 : LWLockAcquire(ControlFileLock, LW_SHARED);
9616 784 : *oldrecptr = ControlFile->checkPointCopy.redo;
9617 784 : *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
9618 784 : LWLockRelease(ControlFileLock);
9619 784 : }
9620 :
9621 : /* Thin wrapper around ShutdownWalRcv(). */
9622 : void
9623 2078 : XLogShutdownWalRcv(void)
9624 : {
9625 2078 : ShutdownWalRcv();
9626 :
9627 2078 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9628 2078 : XLogCtl->InstallXLogFileSegmentActive = false;
9629 2078 : LWLockRelease(ControlFileLock);
9630 2078 : }
9631 :
9632 : /* Enable WAL file recycling and preallocation. */
9633 : void
9634 2178 : SetInstallXLogFileSegmentActive(void)
9635 : {
9636 2178 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9637 2178 : XLogCtl->InstallXLogFileSegmentActive = true;
9638 2178 : LWLockRelease(ControlFileLock);
9639 2178 : }
9640 :
9641 : bool
9642 0 : IsInstallXLogFileSegmentActive(void)
9643 : {
9644 : bool result;
9645 :
9646 0 : LWLockAcquire(ControlFileLock, LW_SHARED);
9647 0 : result = XLogCtl->InstallXLogFileSegmentActive;
9648 0 : LWLockRelease(ControlFileLock);
9649 :
9650 0 : return result;
9651 : }
9652 :
9653 : /*
9654 : * Update the WalWriterSleeping flag.
9655 : */
9656 : void
9657 948 : SetWalWriterSleeping(bool sleeping)
9658 : {
9659 948 : SpinLockAcquire(&XLogCtl->info_lck);
9660 948 : XLogCtl->WalWriterSleeping = sleeping;
9661 948 : SpinLockRelease(&XLogCtl->info_lck);
9662 948 : }
|