Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * xlogrecovery.c
4 : * Functions for WAL recovery, standby mode
5 : *
6 : * This source file contains functions controlling WAL recovery.
7 : * InitWalRecovery() initializes the system for crash or archive recovery,
8 : * or standby mode, depending on configuration options and the state of
9 : * the control file and possible backup label file. PerformWalRecovery()
10 : * performs the actual WAL replay, calling the rmgr-specific redo routines.
11 : * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12 : * and prepares information needed to initialize the WAL for writes. In
13 : * addition to these three main functions, there are a bunch of functions
14 : * for interrogating recovery state and controlling the recovery process.
15 : *
16 : *
17 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
18 : * Portions Copyright (c) 1994, Regents of the University of California
19 : *
20 : * src/backend/access/transam/xlogrecovery.c
21 : *
22 : *-------------------------------------------------------------------------
23 : */
24 :
25 : #include "postgres.h"
26 :
27 : #include <ctype.h>
28 : #include <math.h>
29 : #include <time.h>
30 : #include <sys/stat.h>
31 : #include <sys/time.h>
32 : #include <unistd.h>
33 :
34 : #include "access/timeline.h"
35 : #include "access/transam.h"
36 : #include "access/xact.h"
37 : #include "access/xlog_internal.h"
38 : #include "access/xlogarchive.h"
39 : #include "access/xlogprefetcher.h"
40 : #include "access/xlogreader.h"
41 : #include "access/xlogrecovery.h"
42 : #include "access/xlogutils.h"
43 : #include "backup/basebackup.h"
44 : #include "catalog/pg_control.h"
45 : #include "commands/tablespace.h"
46 : #include "common/file_utils.h"
47 : #include "miscadmin.h"
48 : #include "nodes/miscnodes.h"
49 : #include "pgstat.h"
50 : #include "postmaster/bgwriter.h"
51 : #include "postmaster/startup.h"
52 : #include "replication/slot.h"
53 : #include "replication/slotsync.h"
54 : #include "replication/walreceiver.h"
55 : #include "storage/fd.h"
56 : #include "storage/ipc.h"
57 : #include "storage/latch.h"
58 : #include "storage/pmsignal.h"
59 : #include "storage/procarray.h"
60 : #include "storage/spin.h"
61 : #include "utils/datetime.h"
62 : #include "utils/fmgrprotos.h"
63 : #include "utils/guc_hooks.h"
64 : #include "utils/pgstat_internal.h"
65 : #include "utils/pg_lsn.h"
66 : #include "utils/ps_status.h"
67 : #include "utils/pg_rusage.h"
68 :
69 : /* Unsupported old recovery command file names (relative to $PGDATA) */
70 : #define RECOVERY_COMMAND_FILE "recovery.conf"
71 : #define RECOVERY_COMMAND_DONE "recovery.done"
72 :
73 : /*
74 : * GUC support
75 : */
76 : const struct config_enum_entry recovery_target_action_options[] = {
77 : {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
78 : {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
79 : {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
80 : {NULL, 0, false} /* trailing all-zero entry closing the table */
81 : };
82 :
83 : /* options formerly taken from recovery.conf for archive recovery */
84 : char *recoveryRestoreCommand = NULL;
85 : char *recoveryEndCommand = NULL;
86 : char *archiveCleanupCommand = NULL;
87 : RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
88 : bool recoveryTargetInclusive = true;
89 : int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
90 : TransactionId recoveryTargetXid;
91 : char *recovery_target_time_string;
92 : TimestampTz recoveryTargetTime;
93 : const char *recoveryTargetName;
94 : XLogRecPtr recoveryTargetLSN;
95 : int recovery_min_apply_delay = 0;
96 :
97 : /* options formerly taken from recovery.conf for XLOG streaming */
98 : char *PrimaryConnInfo = NULL;
99 : char *PrimarySlotName = NULL;
100 : bool wal_receiver_create_temp_slot = false;
101 :
102 : /*
103 : * recoveryTargetTimeLineGoal: what the user requested, if any
104 : *
105 : * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
106 : *
107 : * recoveryTargetTLI: the currently understood target timeline; may change during recovery
108 : *
109 : * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
110 : * the timelines of its known parents, newest first (so recoveryTargetTLI is
111 : * always the first list member). Only these TLIs are expected to be seen in
112 : * the WAL segments we read, and indeed only these TLIs will be considered as
113 : * candidate WAL files to open at all.
114 : *
115 : * curFileTLI: the TLI appearing in the name of the current input WAL file.
116 : * (This is not necessarily the same as the timeline from which we are
117 : * replaying WAL, which StartupXLOG calls replayTLI, because we could be
118 : * scanning data that was copied from an ancestor timeline when the current
119 : * file was created.) During a sequential scan we do not allow this value
120 : * to decrease.
121 : */
122 : RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
123 : TimeLineID recoveryTargetTLIRequested = 0;
124 : TimeLineID recoveryTargetTLI = 0;
125 : static List *expectedTLEs;
126 : static TimeLineID curFileTLI;
127 :
128 : /*
129 : * When ArchiveRecoveryRequested is set, archive recovery was requested,
130 : * ie. signal files were present. When InArchiveRecovery is set, we are
131 : * currently recovering using offline XLOG archives. These variables are only
132 : * valid in the startup process.
133 : *
134 : * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
135 : * currently performing crash recovery using only XLOG files in pg_wal, but
136 : * will switch to using offline XLOG archives as soon as we reach the end of
137 : * WAL in pg_wal.
138 : */
139 : bool ArchiveRecoveryRequested = false;
140 : bool InArchiveRecovery = false;
141 :
142 : /*
143 : * When StandbyModeRequested is set, standby mode was requested, i.e.
144 : * standby.signal file was present. When StandbyMode is set, we are currently
145 : * in standby mode. These variables are only valid in the startup process.
146 : * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
147 : */
148 : static bool StandbyModeRequested = false;
149 : bool StandbyMode = false;
150 :
151 : /* was a signal file present at startup? */
152 : static bool standby_signal_file_found = false;
153 : static bool recovery_signal_file_found = false;
154 :
155 : /*
156 : * CheckPointLoc is the position of the checkpoint record that determines
157 : * where to start the replay. It comes from the backup label file or the
158 : * control file.
159 : *
160 : * RedoStartLSN is the checkpoint's REDO location, also from the backup label
161 : * file or the control file. In standby mode, XLOG streaming usually starts
162 : * from the position where an invalid record was found. But if we fail to
163 : * read even the initial checkpoint record, we use the REDO location instead
164 : * of the checkpoint location as the start position of XLOG streaming.
165 : * Otherwise we would have to jump backwards to the REDO location after
166 : * reading the checkpoint record, because the REDO record can precede the
167 : * checkpoint record.
168 : */
169 : static XLogRecPtr CheckPointLoc = InvalidXLogRecPtr;
170 : static TimeLineID CheckPointTLI = 0;
171 : static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
172 : static TimeLineID RedoStartTLI = 0;
173 :
174 : /*
175 : * Local copy of SharedHotStandbyActive variable. False actually means "not
176 : * known, need to check the shared state".
177 : */
178 : static bool LocalHotStandbyActive = false;
179 :
180 : /*
181 : * Local copy of SharedPromoteIsTriggered variable. False actually means "not
182 : * known, need to check the shared state".
183 : */
184 : static bool LocalPromoteIsTriggered = false;
185 :
186 : /* Has the recovery code requested a walreceiver wakeup? */
187 : static bool doRequestWalReceiverReply;
188 :
189 : /* XLogReader object used to parse the WAL records */
190 : static XLogReaderState *xlogreader = NULL;
191 :
192 : /* XLogPrefetcher object used to consume WAL records with read-ahead */
193 : static XLogPrefetcher *xlogprefetcher = NULL;
194 :
195 : /* Parameters passed down from ReadRecord to the XLogPageRead callback. */
196 : typedef struct XLogPageReadPrivate
197 : {
198 : int emode; /* message level supplied by ReadRecord's caller */
199 : bool fetching_ckpt; /* are we fetching a checkpoint record? */
200 : bool randAccess; /* NOTE(review): presumably set for a non-sequential read -- confirm */
201 : TimeLineID replayTLI; /* replay timeline handed to ReadRecord */
202 : } XLogPageReadPrivate;
203 :
204 : /* flag to tell XLogPageRead that we have started replaying */
205 : static bool InRedo = false;
206 :
207 : /*
208 : * Codes indicating where we got a WAL file from during recovery, or where
209 : * to attempt to get one.
210 : */
typedef enum
{
	XLOG_FROM_ANY = 0,			/* request to read WAL from any source */
	XLOG_FROM_ARCHIVE = 1,		/* restored using restore_command */
	XLOG_FROM_PG_WAL = 2,		/* existing file in pg_wal */
	XLOG_FROM_STREAM = 3,		/* streamed from primary */
} XLogSource;

/*
 * Human-readable names for XLogSources, for debugging output.  Each slot is
 * keyed by the enum value it describes, so the table cannot silently drift
 * out of step with the enum's ordering.
 */
static const char *const xlogSourceNames[] = {
	[XLOG_FROM_ANY] = "any",
	[XLOG_FROM_ARCHIVE] = "archive",
	[XLOG_FROM_PG_WAL] = "pg_wal",
	[XLOG_FROM_STREAM] = "stream",
};
221 :
222 : /*
223 : * readFile is -1 or a kernel FD for the log file segment that's currently
224 : * open for reading. readSegNo identifies the segment. readOff is the offset
225 : * of the page just read, readLen indicates how much of it has been read into
226 : * readBuf, and readSource indicates where we got the currently open file from.
227 : *
228 : * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
229 : * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
230 : * worthwhile, since the XLOG is not read by general-purpose sessions.
231 : */
232 : static int readFile = -1;
233 : static XLogSegNo readSegNo = 0;
234 : static uint32 readOff = 0;
235 : static uint32 readLen = 0;
236 : static XLogSource readSource = XLOG_FROM_ANY;
237 :
238 : /*
239 : * Keeps track of which source we're currently reading from. This is
240 : * different from readSource in that this is always set, even when we don't
241 : * currently have a WAL file open. If lastSourceFailed is set, our last
242 : * attempt to read from currentSource failed, and we should try another source
243 : * next.
244 : *
245 : * pendingWalRcvRestart is set when a config change occurs that requires a
246 : * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
247 : */
248 : static XLogSource currentSource = XLOG_FROM_ANY;
249 : static bool lastSourceFailed = false;
250 : static bool pendingWalRcvRestart = false;
251 :
252 : /*
253 : * These variables track when we last obtained some WAL data to process,
254 : * and where we got it from. (XLogReceiptSource is initially the same as
255 : * readSource, but readSource gets reset to zero when we don't have data
256 : * to process right now. It is also different from currentSource, which
257 : * also changes when we try to read from a source and fail, while
258 : * XLogReceiptSource tracks where we last successfully read some WAL.)
259 : */
260 : static TimestampTz XLogReceiptTime = 0;
261 : static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
262 :
263 : /* Local copy of WalRcv->flushedUpto */
264 : static XLogRecPtr flushedUpto = 0;
265 : static TimeLineID receiveTLI = 0;
266 :
267 : /*
268 : * Copy of minRecoveryPoint and backupEndPoint from the control file.
269 : *
270 : * In order to reach consistency, we must replay the WAL up to
271 : * minRecoveryPoint. If backupEndRequired is true, we must also reach
272 : * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
273 : * to backupStartPoint.
274 : *
275 : * Note: In archive recovery, after consistency has been reached, the
276 : * functions in xlog.c will start updating minRecoveryPoint in the control
277 : * file. But this copy of minRecoveryPoint variable reflects the value at the
278 : * beginning of recovery, and is *not* updated after consistency is reached.
279 : */
280 : static XLogRecPtr minRecoveryPoint;
281 : static TimeLineID minRecoveryPointTLI;
282 :
283 : static XLogRecPtr backupStartPoint;
284 : static XLogRecPtr backupEndPoint;
285 : static bool backupEndRequired = false;
286 :
287 : /*
288 : * Have we reached a consistent database state? In crash recovery, we have
289 : * to replay all the WAL, so reachedConsistency is never set. During archive
290 : * recovery, the database is consistent once minRecoveryPoint is reached.
291 : *
292 : * Consistent state means that the system is internally consistent, all
293 : * the WAL has been replayed up to a certain point, and importantly, there
294 : * is no trace of later actions on disk.
295 : *
296 : * This flag is used only by the startup process and postmaster. When
297 : * minRecoveryPoint is reached, the startup process sets its copy to true
298 : * and sends a PMSIGNAL_RECOVERY_CONSISTENT signal to the postmaster,
299 : * which then sets its own copy to true upon receiving the signal.
300 : */
301 : bool reachedConsistency = false;
302 :
303 : /* Buffers dedicated to consistency checks of size BLCKSZ */
304 : static char *replay_image_masked = NULL;
305 : static char *primary_image_masked = NULL;
306 :
307 :
308 : /*
309 : * Shared-memory state for WAL recovery.
310 : */
311 : typedef struct XLogRecoveryCtlData
312 : {
313 : /*
314 : * SharedHotStandbyActive indicates if we allow hot standby queries to be
315 : * run. Protected by info_lck.
316 : */
317 : bool SharedHotStandbyActive;
318 :
319 : /*
320 : * SharedPromoteIsTriggered indicates if a standby promotion has been
321 : * triggered. Protected by info_lck.
322 : */
323 : bool SharedPromoteIsTriggered;
324 :
325 : /*
326 : * recoveryWakeupLatch is used to wake up the startup process to continue
327 : * WAL replay, if it is waiting for WAL to arrive or promotion to be
328 : * requested.
329 : *
330 : * Note that the startup process also uses another latch, its procLatch,
331 : * to wait for recovery conflict. We could get rid of recoveryWakeupLatch
332 : * for signaling the startup process in favor of using its procLatch, which
333 : * would comport better with possible generic signal handlers using that
334 : * latch. But we should not do that because the startup process doesn't
335 : * assume that it's woken up by walreceiver process or SIGHUP signal handler
336 : * while it's waiting for recovery conflict. The separate latches,
337 : * recoveryWakeupLatch and procLatch, should be used for inter-process
338 : * communication for WAL replay and recovery conflict, respectively.
339 : */
340 : Latch recoveryWakeupLatch;
341 :
342 : /*
343 : * Last record successfully replayed.
344 : */
345 : XLogRecPtr lastReplayedReadRecPtr; /* start position */
346 : XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */
347 : TimeLineID lastReplayedTLI; /* timeline */
348 :
349 : /*
350 : * When we're currently replaying a record, ie. in a redo function,
351 : * replayEndRecPtr points to the end+1 of the record being replayed,
352 : * otherwise it's equal to lastReplayedEndRecPtr.
353 : */
354 : XLogRecPtr replayEndRecPtr;
355 : TimeLineID replayEndTLI; /* NOTE(review): presumably the timeline of replayEndRecPtr -- confirm */
356 : /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
357 : TimestampTz recoveryLastXTime;
358 :
359 : /*
360 : * timestamp of when we started replaying the current chunk of WAL data,
361 : * only relevant for replication or archive recovery
362 : */
363 : TimestampTz currentChunkStartTime;
364 : /* Recovery pause state */
365 : RecoveryPauseState recoveryPauseState;
366 : ConditionVariable recoveryNotPausedCV; /* NOTE(review): presumably signaled when the pause state changes -- confirm */
367 :
368 : slock_t info_lck; /* locks shared variables shown above */
369 : } XLogRecoveryCtlData;
370 :
371 : static XLogRecoveryCtlData *XLogRecoveryCtl = NULL;
372 :
373 : /*
374 : * abortedRecPtr is the start pointer of a broken record at end of WAL when
375 : * recovery completes; missingContrecPtr is the location of the first
376 : * contrecord that went missing. See CreateOverwriteContrecordRecord for
377 : * details.
378 : */
379 : static XLogRecPtr abortedRecPtr;
380 : static XLogRecPtr missingContrecPtr;
381 :
382 : /*
383 : * if recoveryStopsBefore/After returns true, it saves information of the stop
384 : * point here
385 : */
386 : static TransactionId recoveryStopXid;
387 : static TimestampTz recoveryStopTime;
388 : static XLogRecPtr recoveryStopLSN;
389 : static char recoveryStopName[MAXFNAMELEN];
390 : static bool recoveryStopAfter;
391 :
392 : /* prototypes for local functions */
393 : static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
394 :
395 : static void EnableStandbyMode(void);
396 : static void readRecoverySignalFile(void);
397 : static void validateRecoveryParameters(void);
398 : static bool read_backup_label(XLogRecPtr *checkPointLoc,
399 : TimeLineID *backupLabelTLI,
400 : bool *backupEndRequired, bool *backupFromStandby);
401 : static bool read_tablespace_map(List **tablespaces);
402 :
403 : static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
404 : static void CheckRecoveryConsistency(void);
405 : static void rm_redo_error_callback(void *arg);
406 : #ifdef WAL_DEBUG
407 : static void xlog_outrec(StringInfo buf, XLogReaderState *record);
408 : #endif
409 : static void xlog_block_info(StringInfo buf, XLogReaderState *record);
410 : static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
411 : TimeLineID prevTLI, TimeLineID replayTLI);
412 : static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
413 : static void verifyBackupPageConsistency(XLogReaderState *record);
414 :
415 : static bool recoveryStopsBefore(XLogReaderState *record);
416 : static bool recoveryStopsAfter(XLogReaderState *record);
417 : static char *getRecoveryStopReason(void);
418 : static void recoveryPausesHere(bool endOfRecovery);
419 : static bool recoveryApplyDelay(XLogReaderState *record);
420 : static void ConfirmRecoveryPaused(void);
421 :
422 : static XLogRecord *ReadRecord(XLogPrefetcher *xlogprefetcher,
423 : int emode, bool fetching_ckpt,
424 : TimeLineID replayTLI);
425 :
426 : static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
427 : int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
428 : static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,
429 : bool randAccess,
430 : bool fetching_ckpt,
431 : XLogRecPtr tliRecPtr,
432 : TimeLineID replayTLI,
433 : XLogRecPtr replayLSN,
434 : bool nonblocking);
435 : static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
436 : static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher,
437 : XLogRecPtr RecPtr, TimeLineID replayTLI);
438 : static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
439 : static int XLogFileRead(XLogSegNo segno, TimeLineID tli,
440 : XLogSource source, bool notfoundOk);
441 : static int XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source);
442 :
443 : static bool CheckForStandbyTrigger(void);
444 : static void SetPromoteIsTriggered(void);
445 : static bool HotStandbyActiveInReplay(void);
446 :
447 : static void SetCurrentChunkStartTime(TimestampTz xtime);
448 : static void SetLatestXTime(TimestampTz xtime);
449 :
450 : /*
451 : * Initialization of shared memory for WAL recovery
452 : */
453 : Size
454 6240 : XLogRecoveryShmemSize(void)
455 : {
456 : Size size;
457 :
458 : /* XLogRecoveryCtl */
459 6240 : size = sizeof(XLogRecoveryCtlData);
460 :
461 6240 : return size;
462 : }
463 :
464 : void
465 2180 : XLogRecoveryShmemInit(void)
466 : {
467 : bool found;
468 :
469 2180 : XLogRecoveryCtl = (XLogRecoveryCtlData *)
470 2180 : ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
471 2180 : if (found)
472 0 : return;
473 2180 : memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
474 :
475 2180 : SpinLockInit(&XLogRecoveryCtl->info_lck);
476 2180 : InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
477 2180 : ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV);
478 : }
479 :
480 : /*
481 : * A thin wrapper to enable StandbyMode and do other preparatory work as
482 : * needed.
483 : */
484 : static void
485 206 : EnableStandbyMode(void)
486 : {
487 206 : StandbyMode = true;
488 :
489 : /*
490 : * To avoid server log bloat, we don't report recovery progress in a
491 : * standby as it will always be in recovery unless promoted. We disable
492 : * startup progress timeout in standby mode to avoid calling
493 : * startup_progress_timeout_handler() unnecessarily.
494 : */
495 206 : disable_startup_progress_timeout();
496 206 : }
497 :
498 : /*
499 : * Prepare the system for WAL recovery, if needed.
500 : *
501 : * This is called by StartupXLOG() which coordinates the server startup
502 : * sequence. This function analyzes the control file and the backup label
503 : * file, if any, and figures out whether we need to perform crash recovery or
504 : * archive recovery, and how far we need to replay the WAL to reach a
505 : * consistent state.
506 : *
507 : * This doesn't yet change the on-disk state, except for creating the symlinks
508 : * from table space map file if any, and for fetching WAL files needed to find
509 : * the checkpoint record. On entry, the caller has already read the control
510 : * file into memory, and passes it as argument. This function updates it to
511 : * reflect the recovery state, and the caller is expected to write it back to
512 : * disk after initializing other subsystems, but before calling
513 : * PerformWalRecovery().
514 : *
515 : * This initializes some global variables like ArchiveRecoveryRequested, and
516 : * StandbyModeRequested and InRecovery.
517 : */
518 : void
519 1896 : InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
520 : bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
521 : {
522 : XLogPageReadPrivate *private;
523 : struct stat st;
524 : bool wasShutdown;
525 : XLogRecord *record;
526 : DBState dbstate_at_startup;
527 1896 : bool haveTblspcMap = false;
528 1896 : bool haveBackupLabel = false;
529 : CheckPoint checkPoint;
530 1896 : bool backupFromStandby = false;
531 :
532 1896 : dbstate_at_startup = ControlFile->state;
533 :
534 : /*
535 : * Initialize on the assumption we want to recover to the latest timeline
536 : * that's active according to pg_control.
537 : */
538 1896 : if (ControlFile->minRecoveryPointTLI >
539 1896 : ControlFile->checkPointCopy.ThisTimeLineID)
540 4 : recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
541 : else
542 1892 : recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
543 :
544 : /*
545 : * Check for signal files, and if so set up state for offline recovery
546 : */
547 1896 : readRecoverySignalFile();
548 1896 : validateRecoveryParameters();
549 :
550 : /*
551 : * Take ownership of the wakeup latch if we're going to sleep during
552 : * recovery, if required.
553 : */
554 1896 : if (ArchiveRecoveryRequested)
555 216 : OwnLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
556 :
557 : /*
558 : * Set the WAL reading processor now, as it will be needed when reading
559 : * the checkpoint record required (backup_label or not).
560 : */
561 1896 : private = palloc0(sizeof(XLogPageReadPrivate));
562 1896 : xlogreader =
563 1896 : XLogReaderAllocate(wal_segment_size, NULL,
564 1896 : XL_ROUTINE(.page_read = &XLogPageRead,
565 : .segment_open = NULL,
566 : .segment_close = wal_segment_close),
567 : private);
568 1896 : if (!xlogreader)
569 0 : ereport(ERROR,
570 : (errcode(ERRCODE_OUT_OF_MEMORY),
571 : errmsg("out of memory"),
572 : errdetail("Failed while allocating a WAL reading processor.")));
573 1896 : xlogreader->system_identifier = ControlFile->system_identifier;
574 :
575 : /*
576 : * Set the WAL decode buffer size. This limits how far ahead we can read
577 : * in the WAL.
578 : */
579 1896 : XLogReaderSetDecodeBuffer(xlogreader, NULL, wal_decode_buffer_size);
580 :
581 : /* Create a WAL prefetcher. */
582 1896 : xlogprefetcher = XLogPrefetcherAllocate(xlogreader);
583 :
584 : /*
585 : * Allocate two page buffers dedicated to WAL consistency checks. We do
586 : * it this way, rather than just making static arrays, for two reasons:
587 : * (1) no need to waste the storage in most instantiations of the backend;
588 : * (2) a static char array isn't guaranteed to have any particular
589 : * alignment, whereas palloc() will provide MAXALIGN'd storage.
590 : */
591 1896 : replay_image_masked = (char *) palloc(BLCKSZ);
592 1896 : primary_image_masked = (char *) palloc(BLCKSZ);
593 :
594 : /*
595 : * Read the backup_label file. We want to run this part of the recovery
596 : * process after checking for signal files and after performing validation
597 : * of the recovery parameters.
598 : */
599 1896 : if (read_backup_label(&CheckPointLoc, &CheckPointTLI, &backupEndRequired,
600 : &backupFromStandby))
601 : {
602 142 : List *tablespaces = NIL;
603 :
604 : /*
605 : * Archive recovery was requested, and thanks to the backup label
606 : * file, we know how far we need to replay to reach consistency. Enter
607 : * archive recovery directly.
608 : */
609 142 : InArchiveRecovery = true;
610 142 : if (StandbyModeRequested)
611 120 : EnableStandbyMode();
612 :
613 : /*
614 : * Omitting backup_label when creating a new replica, PITR node etc.
615 : * unfortunately is a common cause of corruption. Logging that
616 : * backup_label was used makes it a bit easier to exclude that as the
617 : * cause of observed corruption.
618 : *
619 : * Do so before we try to read the checkpoint record (which can fail),
620 : * as otherwise it can be hard to understand why a checkpoint other
621 : * than ControlFile->checkPoint is used.
622 : */
623 142 : ereport(LOG,
624 : errmsg("starting backup recovery with redo LSN %X/%08X, checkpoint LSN %X/%08X, on timeline ID %u",
625 : LSN_FORMAT_ARGS(RedoStartLSN),
626 : LSN_FORMAT_ARGS(CheckPointLoc),
627 : CheckPointTLI));
628 :
629 : /*
630 : * When a backup_label file is present, we want to roll forward from
631 : * the checkpoint it identifies, rather than using pg_control.
632 : */
633 142 : record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
634 : CheckPointTLI);
635 142 : if (record != NULL)
636 : {
637 142 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
638 142 : wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
639 142 : ereport(DEBUG1,
640 : errmsg_internal("checkpoint record is at %X/%08X",
641 : LSN_FORMAT_ARGS(CheckPointLoc)));
642 142 : InRecovery = true; /* force recovery even if SHUTDOWNED */
643 :
644 : /*
645 : * Make sure that REDO location exists. This may not be the case
646 : * if there was a crash during an online backup, which left a
647 : * backup_label around that references a WAL segment that's
648 : * already been archived.
649 : */
650 142 : if (checkPoint.redo < CheckPointLoc)
651 : {
652 142 : XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
653 142 : if (!ReadRecord(xlogprefetcher, LOG, false,
654 : checkPoint.ThisTimeLineID))
655 0 : ereport(FATAL,
656 : errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
657 : LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)),
658 : errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
659 : "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
660 : "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
661 : DataDir, DataDir, DataDir, DataDir));
662 : }
663 : }
664 : else
665 : {
666 0 : ereport(FATAL,
667 : errmsg("could not locate required checkpoint record at %X/%08X",
668 : LSN_FORMAT_ARGS(CheckPointLoc)),
669 : errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
670 : "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
671 : "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
672 : DataDir, DataDir, DataDir, DataDir));
673 : wasShutdown = false; /* keep compiler quiet */
674 : }
675 :
676 : /* Read the tablespace_map file if present and create symlinks. */
677 142 : if (read_tablespace_map(&tablespaces))
678 : {
679 : ListCell *lc;
680 :
681 8 : foreach(lc, tablespaces)
682 : {
683 4 : tablespaceinfo *ti = lfirst(lc);
684 : char *linkloc;
685 :
686 4 : linkloc = psprintf("%s/%u", PG_TBLSPC_DIR, ti->oid);
687 :
688 : /*
689 : * Remove the existing symlink if any and Create the symlink
690 : * under PGDATA.
691 : */
692 4 : remove_tablespace_symlink(linkloc);
693 :
694 4 : if (symlink(ti->path, linkloc) < 0)
695 0 : ereport(ERROR,
696 : (errcode_for_file_access(),
697 : errmsg("could not create symbolic link \"%s\": %m",
698 : linkloc)));
699 :
700 4 : pfree(ti->path);
701 4 : pfree(ti);
702 : }
703 :
704 : /* tell the caller to delete it later */
705 4 : haveTblspcMap = true;
706 : }
707 :
708 : /* tell the caller to delete it later */
709 142 : haveBackupLabel = true;
710 : }
711 : else
712 : {
713 : /* No backup_label file has been found if we are here. */
714 :
715 : /*
716 : * If tablespace_map file is present without backup_label file, there
717 : * is no use of such file. There is no harm in retaining it, but it
718 : * is better to get rid of the map file so that we don't have any
719 : * redundant file in data directory and it will avoid any sort of
720 : * confusion. It seems prudent though to just rename the file out of
721 : * the way rather than delete it completely, also we ignore any error
722 : * that occurs in rename operation as even if map file is present
723 : * without backup_label file, it is harmless.
724 : */
725 1754 : if (stat(TABLESPACE_MAP, &st) == 0)
726 : {
727 2 : unlink(TABLESPACE_MAP_OLD);
728 2 : if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
729 2 : ereport(LOG,
730 : (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
731 : TABLESPACE_MAP, BACKUP_LABEL_FILE),
732 : errdetail("File \"%s\" was renamed to \"%s\".",
733 : TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
734 : else
735 0 : ereport(LOG,
736 : (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
737 : TABLESPACE_MAP, BACKUP_LABEL_FILE),
738 : errdetail("Could not rename file \"%s\" to \"%s\": %m.",
739 : TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
740 : }
741 :
742 : /*
743 : * It's possible that archive recovery was requested, but we don't
744 : * know how far we need to replay the WAL before we reach consistency.
745 : * This can happen for example if a base backup is taken from a
746 : * running server using an atomic filesystem snapshot, without calling
747 : * pg_backup_start/stop. Or if you just kill a running primary server
748 : * and put it into archive recovery by creating a recovery signal
749 : * file.
750 : *
751 : * Our strategy in that case is to perform crash recovery first,
752 : * replaying all the WAL present in pg_wal, and only enter archive
753 : * recovery after that.
754 : *
755 : * But usually we already know how far we need to replay the WAL (up
756 : * to minRecoveryPoint, up to backupEndPoint, or until we see an
757 : * end-of-backup record), and we can enter archive recovery directly.
758 : */
759 1754 : if (ArchiveRecoveryRequested &&
760 86 : (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
761 18 : ControlFile->backupEndRequired ||
762 18 : ControlFile->backupEndPoint != InvalidXLogRecPtr ||
763 18 : ControlFile->state == DB_SHUTDOWNED))
764 : {
765 82 : InArchiveRecovery = true;
766 82 : if (StandbyModeRequested)
767 82 : EnableStandbyMode();
768 : }
769 :
770 : /*
771 : * For the same reason as when starting up with backup_label present,
772 : * emit a log message when we continue initializing from a base
773 : * backup.
774 : */
775 1754 : if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
776 0 : ereport(LOG,
777 : errmsg("restarting backup recovery with redo LSN %X/%08X",
778 : LSN_FORMAT_ARGS(ControlFile->backupStartPoint)));
779 :
780 : /* Get the last valid checkpoint record. */
781 1754 : CheckPointLoc = ControlFile->checkPoint;
782 1754 : CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID;
783 1754 : RedoStartLSN = ControlFile->checkPointCopy.redo;
784 1754 : RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID;
785 1754 : record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
786 : CheckPointTLI);
787 1754 : if (record != NULL)
788 : {
789 1754 : ereport(DEBUG1,
790 : errmsg_internal("checkpoint record is at %X/%08X",
791 : LSN_FORMAT_ARGS(CheckPointLoc)));
792 : }
793 : else
794 : {
795 : /*
796 : * We used to attempt to go back to a secondary checkpoint record
797 : * here, but only when not in standby mode. We now just fail if we
798 : * can't read the last checkpoint because this allows us to
799 : * simplify processing around checkpoints.
800 : */
801 0 : ereport(PANIC,
802 : errmsg("could not locate a valid checkpoint record at %X/%08X",
803 : LSN_FORMAT_ARGS(CheckPointLoc)));
804 : }
805 1754 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
806 1754 : wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
807 : }
808 :
809 1896 : if (ArchiveRecoveryRequested)
810 : {
811 216 : if (StandbyModeRequested)
812 206 : ereport(LOG,
813 : (errmsg("entering standby mode")));
814 10 : else if (recoveryTarget == RECOVERY_TARGET_XID)
815 0 : ereport(LOG,
816 : (errmsg("starting point-in-time recovery to XID %u",
817 : recoveryTargetXid)));
818 10 : else if (recoveryTarget == RECOVERY_TARGET_TIME)
819 0 : ereport(LOG,
820 : (errmsg("starting point-in-time recovery to %s",
821 : timestamptz_to_str(recoveryTargetTime))));
822 10 : else if (recoveryTarget == RECOVERY_TARGET_NAME)
823 6 : ereport(LOG,
824 : (errmsg("starting point-in-time recovery to \"%s\"",
825 : recoveryTargetName)));
826 4 : else if (recoveryTarget == RECOVERY_TARGET_LSN)
827 0 : ereport(LOG,
828 : errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%08X\"",
829 : LSN_FORMAT_ARGS(recoveryTargetLSN)));
830 4 : else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
831 0 : ereport(LOG,
832 : (errmsg("starting point-in-time recovery to earliest consistent point")));
833 : else
834 4 : ereport(LOG,
835 : (errmsg("starting archive recovery")));
836 : }
837 :
838 : /*
839 : * If the location of the checkpoint record is not on the expected
840 : * timeline in the history of the requested timeline, we cannot proceed:
841 : * the backup is not part of the history of the requested timeline.
842 : */
843 : Assert(expectedTLEs); /* was initialized by reading checkpoint
844 : * record */
845 1896 : if (tliOfPointInHistory(CheckPointLoc, expectedTLEs) !=
846 : CheckPointTLI)
847 : {
848 : XLogRecPtr switchpoint;
849 :
850 : /*
851 : * tliSwitchPoint will throw an error if the checkpoint's timeline is
852 : * not in expectedTLEs at all.
853 : */
854 0 : switchpoint = tliSwitchPoint(CheckPointTLI, expectedTLEs, NULL);
855 0 : ereport(FATAL,
856 : (errmsg("requested timeline %u is not a child of this server's history",
857 : recoveryTargetTLI),
858 : /* translator: %s is a backup_label file or a pg_control file */
859 : errdetail("Latest checkpoint in file \"%s\" is at %X/%08X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%08X.",
860 : haveBackupLabel ? "backup_label" : "pg_control",
861 : LSN_FORMAT_ARGS(CheckPointLoc),
862 : CheckPointTLI,
863 : LSN_FORMAT_ARGS(switchpoint))));
864 : }
865 :
866 : /*
867 : * The min recovery point should be part of the requested timeline's
868 : * history, too.
869 : */
870 1896 : if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
871 80 : tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
872 80 : ControlFile->minRecoveryPointTLI)
873 0 : ereport(FATAL,
874 : errmsg("requested timeline %u does not contain minimum recovery point %X/%08X on timeline %u",
875 : recoveryTargetTLI,
876 : LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
877 : ControlFile->minRecoveryPointTLI));
878 :
879 1896 : ereport(DEBUG1,
880 : errmsg_internal("redo record is at %X/%08X; shutdown %s",
881 : LSN_FORMAT_ARGS(checkPoint.redo),
882 : wasShutdown ? "true" : "false"));
883 1896 : ereport(DEBUG1,
884 : (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
885 : U64FromFullTransactionId(checkPoint.nextXid),
886 : checkPoint.nextOid)));
887 1896 : ereport(DEBUG1,
888 : (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
889 : checkPoint.nextMulti, checkPoint.nextMultiOffset)));
890 1896 : ereport(DEBUG1,
891 : (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
892 : checkPoint.oldestXid, checkPoint.oldestXidDB)));
893 1896 : ereport(DEBUG1,
894 : (errmsg_internal("oldest MultiXactId: %u, in database %u",
895 : checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
896 1896 : ereport(DEBUG1,
897 : (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
898 : checkPoint.oldestCommitTsXid,
899 : checkPoint.newestCommitTsXid)));
900 1896 : if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid)))
901 0 : ereport(PANIC,
902 : (errmsg("invalid next transaction ID")));
903 :
904 : /* sanity check */
905 1896 : if (checkPoint.redo > CheckPointLoc)
906 0 : ereport(PANIC,
907 : (errmsg("invalid redo in checkpoint record")));
908 :
909 : /*
910 : * Check whether we need to force recovery from WAL. If it appears to
911 : * have been a clean shutdown and we did not have a recovery signal file,
912 : * then assume no recovery needed.
913 : */
914 1896 : if (checkPoint.redo < CheckPointLoc)
915 : {
916 226 : if (wasShutdown)
917 0 : ereport(PANIC,
918 : (errmsg("invalid redo record in shutdown checkpoint")));
919 226 : InRecovery = true;
920 : }
921 1670 : else if (ControlFile->state != DB_SHUTDOWNED)
922 188 : InRecovery = true;
923 1482 : else if (ArchiveRecoveryRequested)
924 : {
925 : /* force recovery due to presence of recovery signal file */
926 14 : InRecovery = true;
927 : }
928 :
929 : /*
930 : * If recovery is needed, update our in-memory copy of pg_control to show
931 : * that we are recovering and to show the selected checkpoint as the place
932 : * we are starting from. We also mark pg_control with any minimum recovery
933 : * stop point obtained from a backup history file.
934 : *
935 : * We don't write the changes to disk yet, though. Only do that after
936 : * initializing various subsystems.
937 : */
938 1896 : if (InRecovery)
939 : {
940 428 : if (InArchiveRecovery)
941 : {
942 224 : ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
943 : }
944 : else
945 : {
946 204 : ereport(LOG,
947 : (errmsg("database system was not properly shut down; "
948 : "automatic recovery in progress")));
949 204 : if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
950 4 : ereport(LOG,
951 : (errmsg("crash recovery starts in timeline %u "
952 : "and has target timeline %u",
953 : ControlFile->checkPointCopy.ThisTimeLineID,
954 : recoveryTargetTLI)));
955 204 : ControlFile->state = DB_IN_CRASH_RECOVERY;
956 : }
957 428 : ControlFile->checkPoint = CheckPointLoc;
958 428 : ControlFile->checkPointCopy = checkPoint;
959 428 : if (InArchiveRecovery)
960 : {
961 : /* initialize minRecoveryPoint if not set yet */
962 224 : if (ControlFile->minRecoveryPoint < checkPoint.redo)
963 : {
964 148 : ControlFile->minRecoveryPoint = checkPoint.redo;
965 148 : ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
966 : }
967 : }
968 :
969 : /*
970 : * Set backupStartPoint if we're starting recovery from a base backup.
971 : *
972 : * Also set backupEndPoint and use minRecoveryPoint as the backup end
973 : * location if we're starting recovery from a base backup which was
974 : * taken from a standby. In this case, the database system status in
975 : * pg_control must indicate that the database was already in recovery.
976 : * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
977 : * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
978 : * before reaching this point; e.g. because restore_command or
979 : * primary_conninfo were faulty.
980 : *
981 : * Any other state indicates that the backup somehow became corrupted
982 : * and we can't sensibly continue with recovery.
983 : */
984 428 : if (haveBackupLabel)
985 : {
986 142 : ControlFile->backupStartPoint = checkPoint.redo;
987 142 : ControlFile->backupEndRequired = backupEndRequired;
988 :
989 142 : if (backupFromStandby)
990 : {
991 8 : if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
992 : dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
993 0 : ereport(FATAL,
994 : (errmsg("backup_label contains data inconsistent with control file"),
995 : errhint("This means that the backup is corrupted and you will "
996 : "have to use another backup for recovery.")));
997 8 : ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
998 : }
999 : }
1000 : }
1001 :
1002 : /* remember these, so that we know when we have reached consistency */
1003 1896 : backupStartPoint = ControlFile->backupStartPoint;
1004 1896 : backupEndRequired = ControlFile->backupEndRequired;
1005 1896 : backupEndPoint = ControlFile->backupEndPoint;
1006 1896 : if (InArchiveRecovery)
1007 : {
1008 224 : minRecoveryPoint = ControlFile->minRecoveryPoint;
1009 224 : minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
1010 : }
1011 : else
1012 : {
1013 1672 : minRecoveryPoint = InvalidXLogRecPtr;
1014 1672 : minRecoveryPointTLI = 0;
1015 : }
1016 :
1017 : /*
1018 : * Start recovery assuming that the final record isn't lost.
1019 : */
1020 1896 : abortedRecPtr = InvalidXLogRecPtr;
1021 1896 : missingContrecPtr = InvalidXLogRecPtr;
1022 :
1023 1896 : *wasShutdown_ptr = wasShutdown;
1024 1896 : *haveBackupLabel_ptr = haveBackupLabel;
1025 1896 : *haveTblspcMap_ptr = haveTblspcMap;
1026 1896 : }
1027 :
1028 : /*
1029 : * See if there are any recovery signal files and if so, set state for
1030 : * recovery.
1031 : *
1032 : * See if there is a recovery command file (recovery.conf), and if so
1033 : * throw an ERROR since as of PG12 we no longer recognize that.
1034 : */
1035 : static void
1036 1896 : readRecoverySignalFile(void)
1037 : {
1038 : struct stat stat_buf;
1039 :
1040 1896 : if (IsBootstrapProcessingMode())
1041 1680 : return;
1042 :
1043 : /*
1044 : * Check for old recovery API file: recovery.conf
1045 : */
1046 1796 : if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
1047 0 : ereport(FATAL,
1048 : (errcode_for_file_access(),
1049 : errmsg("using recovery command file \"%s\" is not supported",
1050 : RECOVERY_COMMAND_FILE)));
1051 :
1052 : /*
1053 : * Remove unused .done file, if present. Ignore if absent.
1054 : */
1055 1796 : unlink(RECOVERY_COMMAND_DONE);
1056 :
1057 : /*
1058 : * Check for recovery signal files and if found, fsync them since they
1059 : * represent server state information. We don't sweat too much about the
1060 : * possibility of fsync failure, however.
1061 : *
1062 : * If present, standby signal file takes precedence. If neither is present
1063 : * then we won't enter archive recovery.
1064 : */
1065 1796 : if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1066 : {
1067 : int fd;
1068 :
1069 206 : fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1070 : S_IRUSR | S_IWUSR);
1071 206 : if (fd >= 0)
1072 : {
1073 206 : (void) pg_fsync(fd);
1074 206 : close(fd);
1075 : }
1076 206 : standby_signal_file_found = true;
1077 : }
1078 1590 : else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1079 : {
1080 : int fd;
1081 :
1082 10 : fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1083 : S_IRUSR | S_IWUSR);
1084 10 : if (fd >= 0)
1085 : {
1086 10 : (void) pg_fsync(fd);
1087 10 : close(fd);
1088 : }
1089 10 : recovery_signal_file_found = true;
1090 : }
1091 :
1092 1796 : StandbyModeRequested = false;
1093 1796 : ArchiveRecoveryRequested = false;
1094 1796 : if (standby_signal_file_found)
1095 : {
1096 206 : StandbyModeRequested = true;
1097 206 : ArchiveRecoveryRequested = true;
1098 : }
1099 1590 : else if (recovery_signal_file_found)
1100 : {
1101 10 : StandbyModeRequested = false;
1102 10 : ArchiveRecoveryRequested = true;
1103 : }
1104 : else
1105 1580 : return;
1106 :
1107 : /*
1108 : * We don't support standby mode in standalone backends; that requires
1109 : * other processes such as the WAL receiver to be alive.
1110 : */
1111 216 : if (StandbyModeRequested && !IsUnderPostmaster)
1112 0 : ereport(FATAL,
1113 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1114 : errmsg("standby mode is not supported by single-user servers")));
1115 : }
1116 :
1117 : static void
1118 1896 : validateRecoveryParameters(void)
1119 : {
1120 1896 : if (!ArchiveRecoveryRequested)
1121 1680 : return;
1122 :
1123 : /*
1124 : * Check for compulsory parameters
1125 : */
1126 216 : if (StandbyModeRequested)
1127 : {
1128 206 : if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1129 22 : (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1130 4 : ereport(WARNING,
1131 : (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
1132 : errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1133 : }
1134 : else
1135 : {
1136 10 : if (recoveryRestoreCommand == NULL ||
1137 10 : strcmp(recoveryRestoreCommand, "") == 0)
1138 0 : ereport(FATAL,
1139 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1140 : errmsg("must specify \"restore_command\" when standby mode is not enabled")));
1141 : }
1142 :
1143 : /*
1144 : * Override any inconsistent requests. Note that this is a change of
1145 : * behaviour in 9.5; prior to this we simply ignored a request to pause if
1146 : * hot_standby = off, which was surprising behaviour.
1147 : */
1148 216 : if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
1149 202 : !EnableHotStandby)
1150 6 : recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
1151 :
1152 : /*
1153 : * Final parsing of recovery_target_time string; see also
1154 : * check_recovery_target_time().
1155 : */
1156 216 : if (recoveryTarget == RECOVERY_TARGET_TIME)
1157 : {
1158 0 : recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
1159 : CStringGetDatum(recovery_target_time_string),
1160 : ObjectIdGetDatum(InvalidOid),
1161 : Int32GetDatum(-1)));
1162 : }
1163 :
1164 : /*
1165 : * If user specified recovery_target_timeline, validate it or compute the
1166 : * "latest" value. We can't do this until after we've gotten the restore
1167 : * command and set InArchiveRecovery, because we need to fetch timeline
1168 : * history files from the archive.
1169 : */
1170 216 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
1171 : {
1172 0 : TimeLineID rtli = recoveryTargetTLIRequested;
1173 :
1174 : /* Timeline 1 does not have a history file, all else should */
1175 0 : if (rtli != 1 && !existsTimeLineHistory(rtli))
1176 0 : ereport(FATAL,
1177 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1178 : errmsg("recovery target timeline %u does not exist",
1179 : rtli)));
1180 0 : recoveryTargetTLI = rtli;
1181 : }
1182 216 : else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
1183 : {
1184 : /* We start the "latest" search from pg_control's timeline */
1185 216 : recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
1186 : }
1187 : else
1188 : {
1189 : /*
1190 : * else we just use the recoveryTargetTLI as already read from
1191 : * ControlFile
1192 : */
1193 : Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
1194 : }
1195 : }
1196 :
1197 : /*
1198 : * read_backup_label: check to see if a backup_label file is present
1199 : *
1200 : * If we see a backup_label during recovery, we assume that we are recovering
1201 : * from a backup dump file, and we therefore roll forward from the checkpoint
1202 : * identified by the label file, NOT what pg_control says. This avoids the
1203 : * problem that pg_control might have been archived one or more checkpoints
1204 : * later than the start of the dump, and so if we rely on it as the start
1205 : * point, we will fail to restore a consistent database state.
1206 : *
1207 : * Returns true if a backup_label was found (and fills the checkpoint
1208 : * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1209 : * returns false if not. If this backup_label came from a streamed backup,
1210 : * *backupEndRequired is set to true. If this backup_label was created during
1211 : * recovery, *backupFromStandby is set to true.
1212 : *
1213 : * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1214 : * and TLI read from the backup file.
1215 : */
1216 : static bool
1217 1896 : read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1218 : bool *backupEndRequired, bool *backupFromStandby)
1219 : {
1220 : char startxlogfilename[MAXFNAMELEN];
1221 : TimeLineID tli_from_walseg,
1222 : tli_from_file;
1223 : FILE *lfp;
1224 : char ch;
1225 : char backuptype[20];
1226 : char backupfrom[20];
1227 : char backuplabel[MAXPGPATH];
1228 : char backuptime[128];
1229 : uint32 hi,
1230 : lo;
1231 :
1232 : /* suppress possible uninitialized-variable warnings */
1233 1896 : *checkPointLoc = InvalidXLogRecPtr;
1234 1896 : *backupLabelTLI = 0;
1235 1896 : *backupEndRequired = false;
1236 1896 : *backupFromStandby = false;
1237 :
1238 : /*
1239 : * See if label file is present
1240 : */
1241 1896 : lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1242 1896 : if (!lfp)
1243 : {
1244 1754 : if (errno != ENOENT)
1245 0 : ereport(FATAL,
1246 : (errcode_for_file_access(),
1247 : errmsg("could not read file \"%s\": %m",
1248 : BACKUP_LABEL_FILE)));
1249 1754 : return false; /* it's not there, all is fine */
1250 : }
1251 :
1252 : /*
1253 : * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1254 : * is pretty crude, but we are not expecting any variability in the file
1255 : * format).
1256 : */
1257 142 : if (fscanf(lfp, "START WAL LOCATION: %X/%08X (file %08X%16s)%c",
1258 142 : &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1259 0 : ereport(FATAL,
1260 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1261 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1262 142 : RedoStartLSN = ((uint64) hi) << 32 | lo;
1263 142 : RedoStartTLI = tli_from_walseg;
1264 142 : if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%08X%c",
1265 142 : &hi, &lo, &ch) != 3 || ch != '\n')
1266 0 : ereport(FATAL,
1267 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1268 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1269 142 : *checkPointLoc = ((uint64) hi) << 32 | lo;
1270 142 : *backupLabelTLI = tli_from_walseg;
1271 :
1272 : /*
1273 : * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1274 : * which could mean either pg_basebackup or the pg_backup_start/stop
1275 : * method was used) or if this label came from somewhere else (the only
1276 : * other option today being from pg_rewind). If this was a streamed
1277 : * backup then we know that we need to play through until we get to the
1278 : * end of the WAL which was generated during the backup (at which point we
1279 : * will have reached consistency and backupEndRequired will be reset to be
1280 : * false).
1281 : */
1282 142 : if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1283 : {
1284 142 : if (strcmp(backuptype, "streamed") == 0)
1285 140 : *backupEndRequired = true;
1286 : }
1287 :
1288 : /*
1289 : * BACKUP FROM lets us know if this was from a primary or a standby. If
1290 : * it was from a standby, we'll double-check that the control file state
1291 : * matches that of a standby.
1292 : */
1293 142 : if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1294 : {
1295 142 : if (strcmp(backupfrom, "standby") == 0)
1296 8 : *backupFromStandby = true;
1297 : }
1298 :
1299 : /*
1300 : * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1301 : * but checking for their presence is useful for debugging and the next
1302 : * sanity checks. Cope also with the fact that the result buffers have a
1303 : * pre-allocated size, hence if the backup_label file has been generated
1304 : * with strings longer than the maximum assumed here an incorrect parsing
1305 : * happens. That's fine as only minor consistency checks are done
1306 : * afterwards.
1307 : */
1308 142 : if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1309 142 : ereport(DEBUG1,
1310 : (errmsg_internal("backup time %s in file \"%s\"",
1311 : backuptime, BACKUP_LABEL_FILE)));
1312 :
1313 142 : if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1314 140 : ereport(DEBUG1,
1315 : (errmsg_internal("backup label %s in file \"%s\"",
1316 : backuplabel, BACKUP_LABEL_FILE)));
1317 :
1318 : /*
1319 : * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1320 : * it as a sanity check if present.
1321 : */
1322 142 : if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1323 : {
1324 140 : if (tli_from_walseg != tli_from_file)
1325 0 : ereport(FATAL,
1326 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1327 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1328 : errdetail("Timeline ID parsed is %u, but expected %u.",
1329 : tli_from_file, tli_from_walseg)));
1330 :
1331 140 : ereport(DEBUG1,
1332 : (errmsg_internal("backup timeline %u in file \"%s\"",
1333 : tli_from_file, BACKUP_LABEL_FILE)));
1334 : }
1335 :
1336 142 : if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%08X\n", &hi, &lo) > 0)
1337 0 : ereport(FATAL,
1338 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1339 : errmsg("this is an incremental backup, not a data directory"),
1340 : errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1341 :
1342 142 : if (ferror(lfp) || FreeFile(lfp))
1343 0 : ereport(FATAL,
1344 : (errcode_for_file_access(),
1345 : errmsg("could not read file \"%s\": %m",
1346 : BACKUP_LABEL_FILE)));
1347 :
1348 142 : return true;
1349 : }
1350 :
1351 : /*
1352 : * read_tablespace_map: check to see if a tablespace_map file is present
1353 : *
1354 : * If we see a tablespace_map file during recovery, we assume that we are
1355 : * recovering from a backup dump file, and we therefore need to create symlinks
1356 : * as per the information present in tablespace_map file.
1357 : *
1358 : * Returns true if a tablespace_map file was found (and fills *tablespaces
1359 : * with a tablespaceinfo struct for each tablespace listed in the file);
1360 : * returns false if not.
1361 : */
1362 : static bool
1363 142 : read_tablespace_map(List **tablespaces)
1364 : {
1365 : tablespaceinfo *ti;
1366 : FILE *lfp;
1367 : char str[MAXPGPATH];
1368 : int ch,
1369 : i,
1370 : n;
1371 : bool was_backslash;
1372 :
1373 : /*
1374 : * See if tablespace_map file is present
1375 : */
1376 142 : lfp = AllocateFile(TABLESPACE_MAP, "r");
1377 142 : if (!lfp)
1378 : {
1379 138 : if (errno != ENOENT)
1380 0 : ereport(FATAL,
1381 : (errcode_for_file_access(),
1382 : errmsg("could not read file \"%s\": %m",
1383 : TABLESPACE_MAP)));
1384 138 : return false; /* it's not there, all is fine */
1385 : }
1386 :
1387 : /*
1388 : * Read and parse the link name and path lines from tablespace_map file
1389 : * (this code is pretty crude, but we are not expecting any variability in
1390 : * the file format). De-escape any backslashes that were inserted.
1391 : */
1392 4 : i = 0;
1393 4 : was_backslash = false;
1394 154 : while ((ch = fgetc(lfp)) != EOF)
1395 : {
1396 150 : if (!was_backslash && (ch == '\n' || ch == '\r'))
1397 4 : {
1398 : char *endp;
1399 :
1400 4 : if (i == 0)
1401 0 : continue; /* \r immediately followed by \n */
1402 :
1403 : /*
1404 : * The de-escaped line should contain an OID followed by exactly
1405 : * one space followed by a path. The path might start with
1406 : * spaces, so don't be too liberal about parsing.
1407 : */
1408 4 : str[i] = '\0';
1409 4 : n = 0;
1410 24 : while (str[n] && str[n] != ' ')
1411 20 : n++;
1412 4 : if (n < 1 || n >= i - 1)
1413 0 : ereport(FATAL,
1414 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1415 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1416 4 : str[n++] = '\0';
1417 :
1418 4 : ti = palloc0(sizeof(tablespaceinfo));
1419 4 : errno = 0;
1420 4 : ti->oid = strtoul(str, &endp, 10);
1421 4 : if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
1422 0 : ereport(FATAL,
1423 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1424 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1425 4 : ti->path = pstrdup(str + n);
1426 4 : *tablespaces = lappend(*tablespaces, ti);
1427 :
1428 4 : i = 0;
1429 4 : continue;
1430 : }
1431 146 : else if (!was_backslash && ch == '\\')
1432 0 : was_backslash = true;
1433 : else
1434 : {
1435 146 : if (i < sizeof(str) - 1)
1436 146 : str[i++] = ch;
1437 146 : was_backslash = false;
1438 : }
1439 : }
1440 :
1441 4 : if (i != 0 || was_backslash) /* last line not terminated? */
1442 0 : ereport(FATAL,
1443 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1444 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1445 :
1446 4 : if (ferror(lfp) || FreeFile(lfp))
1447 0 : ereport(FATAL,
1448 : (errcode_for_file_access(),
1449 : errmsg("could not read file \"%s\": %m",
1450 : TABLESPACE_MAP)));
1451 :
1452 4 : return true;
1453 : }
1454 :
1455 : /*
1456 : * Finish WAL recovery.
1457 : *
1458 : * This does not close the 'xlogreader' yet, because in some cases the caller
1459 : * still wants to re-read the last checkpoint record by calling
1460 : * ReadCheckpointRecord().
1461 : *
1462 : * Returns the position of the last valid or applied record, after which new
1463 : * WAL should be appended, information about why recovery was ended, and some
1464 : * other things. See the EndOfWalRecoveryInfo struct for details.
1465 : */
1466 : EndOfWalRecoveryInfo *
1467 1778 : FinishWalRecovery(void)
1468 : {
1469 1778 : EndOfWalRecoveryInfo *result = palloc(sizeof(EndOfWalRecoveryInfo));
1470 : XLogRecPtr lastRec;
1471 : TimeLineID lastRecTLI;
1472 : XLogRecPtr endOfLog;
1473 :
1474 : /*
1475 : * Kill WAL receiver, if it's still running, before we continue to write
1476 : * the startup checkpoint and aborted-contrecord records. It will trump
1477 : * over these records and subsequent ones if it's still alive when we
1478 : * start writing WAL.
1479 : */
1480 1778 : XLogShutdownWalRcv();
1481 :
1482 : /*
1483 : * Shutdown the slot sync worker to drop any temporary slots acquired by
1484 : * it and to prevent it from keep trying to fetch the failover slots.
1485 : *
1486 : * We do not update the 'synced' column in 'pg_replication_slots' system
1487 : * view from true to false here, as any failed update could leave 'synced'
1488 : * column false for some slots. This could cause issues during slot sync
1489 : * after restarting the server as a standby. While updating the 'synced'
1490 : * column after switching to the new timeline is an option, it does not
1491 : * simplify the handling for the 'synced' column. Therefore, we retain the
1492 : * 'synced' column as true after promotion as it may provide useful
1493 : * information about the slot origin.
1494 : */
1495 1778 : ShutDownSlotSync();
1496 :
1497 : /*
1498 : * We are now done reading the xlog from stream. Turn off streaming
1499 : * recovery to force fetching the files (which would be required at end of
1500 : * recovery, e.g., timeline history file) from archive or pg_wal.
1501 : *
1502 : * Note that standby mode must be turned off after killing WAL receiver,
1503 : * i.e., calling XLogShutdownWalRcv().
1504 : */
1505 : Assert(!WalRcvStreaming());
1506 1778 : StandbyMode = false;
1507 :
1508 : /*
1509 : * Determine where to start writing WAL next.
1510 : *
1511 : * Re-fetch the last valid or last applied record, so we can identify the
1512 : * exact endpoint of what we consider the valid portion of WAL. There may
1513 : * be an incomplete continuation record after that, in which case
1514 : * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1515 : * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1516 : * it is intentionally missing. See CreateOverwriteContrecordRecord().
1517 : *
1518 : * An important side-effect of this is to load the last page into
1519 : * xlogreader. The caller uses it to initialize the WAL for writing.
1520 : */
1521 1778 : if (!InRecovery)
1522 : {
1523 1468 : lastRec = CheckPointLoc;
1524 1468 : lastRecTLI = CheckPointTLI;
1525 : }
1526 : else
1527 : {
1528 310 : lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr;
1529 310 : lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1530 : }
1531 1778 : XLogPrefetcherBeginRead(xlogprefetcher, lastRec);
1532 1778 : (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1533 1778 : endOfLog = xlogreader->EndRecPtr;
1534 :
1535 : /*
1536 : * Remember the TLI in the filename of the XLOG segment containing the
1537 : * end-of-log. It could be different from the timeline that endOfLog
1538 : * nominally belongs to, if there was a timeline switch in that segment,
1539 : * and we were reading the old WAL from a segment belonging to a higher
1540 : * timeline.
1541 : */
1542 1778 : result->endOfLogTLI = xlogreader->seg.ws_tli;
1543 :
1544 1778 : if (ArchiveRecoveryRequested)
1545 : {
1546 : /*
1547 : * We are no longer in archive recovery state.
1548 : *
1549 : * We are now done reading the old WAL. Turn off archive fetching if
1550 : * it was active.
1551 : */
1552 : Assert(InArchiveRecovery);
1553 98 : InArchiveRecovery = false;
1554 :
1555 : /*
1556 : * If the ending log segment is still open, close it (to avoid
1557 : * problems on Windows with trying to rename or delete an open file).
1558 : */
1559 98 : if (readFile >= 0)
1560 : {
1561 98 : close(readFile);
1562 98 : readFile = -1;
1563 : }
1564 : }
1565 :
1566 : /*
1567 : * Copy the last partial block to the caller, for initializing the WAL
1568 : * buffer for appending new WAL.
1569 : */
1570 1778 : if (endOfLog % XLOG_BLCKSZ != 0)
1571 : {
1572 : char *page;
1573 : int len;
1574 : XLogRecPtr pageBeginPtr;
1575 :
1576 1748 : pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1577 : Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
1578 :
1579 : /* Copy the valid part of the last block */
1580 1748 : len = endOfLog % XLOG_BLCKSZ;
1581 1748 : page = palloc(len);
1582 1748 : memcpy(page, xlogreader->readBuf, len);
1583 :
1584 1748 : result->lastPageBeginPtr = pageBeginPtr;
1585 1748 : result->lastPage = page;
1586 : }
1587 : else
1588 : {
1589 : /* There is no partial block to copy. */
1590 30 : result->lastPageBeginPtr = endOfLog;
1591 30 : result->lastPage = NULL;
1592 : }
1593 :
1594 : /*
1595 : * Create a comment for the history file to explain why and where timeline
1596 : * changed.
1597 : */
1598 1778 : result->recoveryStopReason = getRecoveryStopReason();
1599 :
1600 1778 : result->lastRec = lastRec;
1601 1778 : result->lastRecTLI = lastRecTLI;
1602 1778 : result->endOfLog = endOfLog;
1603 :
1604 1778 : result->abortedRecPtr = abortedRecPtr;
1605 1778 : result->missingContrecPtr = missingContrecPtr;
1606 :
1607 1778 : result->standby_signal_file_found = standby_signal_file_found;
1608 1778 : result->recovery_signal_file_found = recovery_signal_file_found;
1609 :
1610 1778 : return result;
1611 : }
1612 :
1613 : /*
1614 : * Clean up the WAL reader and leftovers from restoring WAL from archive
1615 : */
1616 : void
1617 1778 : ShutdownWalRecovery(void)
1618 : {
1619 : char recoveryPath[MAXPGPATH];
1620 :
1621 : /* Final update of pg_stat_recovery_prefetch. */
1622 1778 : XLogPrefetcherComputeStats(xlogprefetcher);
1623 :
1624 : /* Shut down xlogreader */
1625 1778 : if (readFile >= 0)
1626 : {
1627 1680 : close(readFile);
1628 1680 : readFile = -1;
1629 : }
1630 1778 : pfree(xlogreader->private_data);
1631 1778 : XLogReaderFree(xlogreader);
1632 1778 : XLogPrefetcherFree(xlogprefetcher);
1633 :
1634 1778 : if (ArchiveRecoveryRequested)
1635 : {
1636 : /*
1637 : * Since there might be a partial WAL segment named RECOVERYXLOG, get
1638 : * rid of it.
1639 : */
1640 98 : snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1641 98 : unlink(recoveryPath); /* ignore any error */
1642 :
1643 : /* Get rid of any remaining recovered timeline-history file, too */
1644 98 : snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1645 98 : unlink(recoveryPath); /* ignore any error */
1646 : }
1647 :
1648 : /*
1649 : * We don't need the latch anymore. It's not strictly necessary to disown
1650 : * it, but let's do it for the sake of tidiness.
1651 : */
1652 1778 : if (ArchiveRecoveryRequested)
1653 98 : DisownLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
1654 1778 : }
1655 :
1656 : /*
1657 : * Perform WAL recovery.
1658 : *
1659 : * If the system was shut down cleanly, this is never called.
1660 : */
1661 : void
1662 426 : PerformWalRecovery(void)
1663 : {
1664 : XLogRecord *record;
1665 426 : bool reachedRecoveryTarget = false;
1666 : TimeLineID replayTLI;
1667 :
1668 : /*
1669 : * Initialize shared variables for tracking progress of WAL replay, as if
1670 : * we had just replayed the record before the REDO location (or the
1671 : * checkpoint record itself, if it's a shutdown checkpoint).
1672 : */
1673 426 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1674 426 : if (RedoStartLSN < CheckPointLoc)
1675 : {
1676 224 : XLogRecoveryCtl->lastReplayedReadRecPtr = InvalidXLogRecPtr;
1677 224 : XLogRecoveryCtl->lastReplayedEndRecPtr = RedoStartLSN;
1678 224 : XLogRecoveryCtl->lastReplayedTLI = RedoStartTLI;
1679 : }
1680 : else
1681 : {
1682 202 : XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
1683 202 : XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
1684 202 : XLogRecoveryCtl->lastReplayedTLI = CheckPointTLI;
1685 : }
1686 426 : XLogRecoveryCtl->replayEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
1687 426 : XLogRecoveryCtl->replayEndTLI = XLogRecoveryCtl->lastReplayedTLI;
1688 426 : XLogRecoveryCtl->recoveryLastXTime = 0;
1689 426 : XLogRecoveryCtl->currentChunkStartTime = 0;
1690 426 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
1691 426 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
1692 :
1693 : /* Also ensure XLogReceiptTime has a sane value */
1694 426 : XLogReceiptTime = GetCurrentTimestamp();
1695 :
1696 : /*
1697 : * Let postmaster know we've started redo now, so that it can launch the
1698 : * archiver if necessary.
1699 : */
1700 426 : if (IsUnderPostmaster)
1701 408 : SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
1702 :
1703 : /*
1704 : * Allow read-only connections immediately if we're consistent already.
1705 : */
1706 426 : CheckRecoveryConsistency();
1707 :
1708 : /*
1709 : * Find the first record that logically follows the checkpoint --- it
1710 : * might physically precede it, though.
1711 : */
1712 426 : if (RedoStartLSN < CheckPointLoc)
1713 : {
1714 : /* back up to find the record */
1715 224 : replayTLI = RedoStartTLI;
1716 224 : XLogPrefetcherBeginRead(xlogprefetcher, RedoStartLSN);
1717 224 : record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1718 :
1719 : /*
1720 : * If a checkpoint record's redo pointer points back to an earlier
1721 : * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1722 : * record.
1723 : */
1724 224 : if (record->xl_rmid != RM_XLOG_ID ||
1725 224 : (record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO)
1726 0 : ereport(FATAL,
1727 : errmsg("unexpected record type found at redo point %X/%08X",
1728 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)));
1729 : }
1730 : else
1731 : {
1732 : /* just have to read next record after CheckPoint */
1733 : Assert(xlogreader->ReadRecPtr == CheckPointLoc);
1734 202 : replayTLI = CheckPointTLI;
1735 202 : record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1736 : }
1737 :
1738 426 : if (record != NULL)
1739 : {
1740 : TimestampTz xtime;
1741 : PGRUsage ru0;
1742 :
1743 408 : pg_rusage_init(&ru0);
1744 :
1745 408 : InRedo = true;
1746 :
1747 408 : RmgrStartup();
1748 :
1749 408 : ereport(LOG,
1750 : errmsg("redo starts at %X/%08X",
1751 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)));
1752 :
1753 : /* Prepare to report progress of the redo phase. */
1754 408 : if (!StandbyMode)
1755 214 : begin_startup_progress_phase();
1756 :
1757 : /*
1758 : * main redo apply loop
1759 : */
1760 : do
1761 : {
1762 5495882 : if (!StandbyMode)
1763 527340 : ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%08X",
1764 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr));
1765 :
1766 : #ifdef WAL_DEBUG
1767 : if (XLOG_DEBUG)
1768 : {
1769 : StringInfoData buf;
1770 :
1771 : initStringInfo(&buf);
1772 : appendStringInfo(&buf, "REDO @ %X/%08X; LSN %X/%08X: ",
1773 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
1774 : LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
1775 : xlog_outrec(&buf, xlogreader);
1776 : appendStringInfoString(&buf, " - ");
1777 : xlog_outdesc(&buf, xlogreader);
1778 : elog(LOG, "%s", buf.data);
1779 : pfree(buf.data);
1780 : }
1781 : #endif
1782 :
1783 : /* Handle interrupt signals of startup process */
1784 5495882 : ProcessStartupProcInterrupts();
1785 :
1786 : /*
1787 : * Pause WAL replay, if requested by a hot-standby session via
1788 : * SetRecoveryPause().
1789 : *
1790 : * Note that we intentionally don't take the info_lck spinlock
1791 : * here. We might therefore read a slightly stale value of the
1792 : * recoveryPause flag, but it can't be very stale (no worse than
1793 : * the last spinlock we did acquire). Since a pause request is a
1794 : * pretty asynchronous thing anyway, possibly responding to it one
1795 : * WAL record later than we otherwise would is a minor issue, so
1796 : * it doesn't seem worth adding another spinlock cycle to prevent
1797 : * that.
1798 : */
1799 5495882 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1800 : RECOVERY_NOT_PAUSED)
1801 0 : recoveryPausesHere(false);
1802 :
1803 : /*
1804 : * Have we reached our recovery target?
1805 : */
1806 5495882 : if (recoveryStopsBefore(xlogreader))
1807 : {
1808 4 : reachedRecoveryTarget = true;
1809 4 : break;
1810 : }
1811 :
1812 : /*
1813 : * If we've been asked to lag the primary, wait on latch until
1814 : * enough time has passed.
1815 : */
1816 5495878 : if (recoveryApplyDelay(xlogreader))
1817 : {
1818 : /*
1819 : * We test for paused recovery again here. If user sets
1820 : * delayed apply, it may be because they expect to pause
1821 : * recovery in case of problems, so we must test again here
1822 : * otherwise pausing during the delay-wait wouldn't work.
1823 : */
1824 0 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1825 : RECOVERY_NOT_PAUSED)
1826 0 : recoveryPausesHere(false);
1827 : }
1828 :
1829 : /*
1830 : * Apply the record
1831 : */
1832 5495878 : ApplyWalRecord(xlogreader, record, &replayTLI);
1833 :
1834 : /* Exit loop if we reached inclusive recovery target */
1835 5495874 : if (recoveryStopsAfter(xlogreader))
1836 : {
1837 10 : reachedRecoveryTarget = true;
1838 10 : break;
1839 : }
1840 :
1841 : /* Else, try to fetch the next WAL record */
1842 5495864 : record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1843 5495754 : } while (record != NULL);
1844 :
1845 : /*
1846 : * end of main redo apply loop
1847 : */
1848 :
1849 294 : if (reachedRecoveryTarget)
1850 : {
1851 14 : if (!reachedConsistency)
1852 0 : ereport(FATAL,
1853 : (errmsg("requested recovery stop point is before consistent recovery point")));
1854 :
1855 : /*
1856 : * This is the last point where we can restart recovery with a new
1857 : * recovery target, if we shutdown and begin again. After this,
1858 : * Resource Managers may choose to do permanent corrective actions
1859 : * at end of recovery.
1860 : */
1861 14 : switch (recoveryTargetAction)
1862 : {
1863 0 : case RECOVERY_TARGET_ACTION_SHUTDOWN:
1864 :
1865 : /*
1866 : * exit with special return code to request shutdown of
1867 : * postmaster. Log messages issued from postmaster.
1868 : */
1869 0 : proc_exit(3);
1870 :
1871 2 : case RECOVERY_TARGET_ACTION_PAUSE:
1872 2 : SetRecoveryPause(true);
1873 2 : recoveryPausesHere(true);
1874 :
1875 : /* drop into promote */
1876 :
1877 14 : case RECOVERY_TARGET_ACTION_PROMOTE:
1878 14 : break;
1879 : }
1880 : }
1881 :
1882 294 : RmgrCleanup();
1883 :
1884 294 : ereport(LOG,
1885 : errmsg("redo done at %X/%08X system usage: %s",
1886 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
1887 : pg_rusage_show(&ru0)));
1888 294 : xtime = GetLatestXTime();
1889 294 : if (xtime)
1890 70 : ereport(LOG,
1891 : (errmsg("last completed transaction was at log time %s",
1892 : timestamptz_to_str(xtime))));
1893 :
1894 294 : InRedo = false;
1895 : }
1896 : else
1897 : {
1898 : /* there are no WAL records following the checkpoint */
1899 18 : ereport(LOG,
1900 : (errmsg("redo is not required")));
1901 : }
1902 :
1903 : /*
1904 : * This check is intentionally after the above log messages that indicate
1905 : * how far recovery went.
1906 : */
1907 312 : if (ArchiveRecoveryRequested &&
1908 100 : recoveryTarget != RECOVERY_TARGET_UNSET &&
1909 16 : !reachedRecoveryTarget)
1910 2 : ereport(FATAL,
1911 : (errcode(ERRCODE_CONFIG_FILE_ERROR),
1912 : errmsg("recovery ended before configured recovery target was reached")));
1913 310 : }
1914 :
1915 : /*
1916 : * Subroutine of PerformWalRecovery, to apply one WAL record.
1917 : */
1918 : static void
1919 5495878 : ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
1920 : {
1921 : ErrorContextCallback errcallback;
1922 5495878 : bool switchedTLI = false;
1923 :
1924 : /* Setup error traceback support for ereport() */
1925 5495878 : errcallback.callback = rm_redo_error_callback;
1926 5495878 : errcallback.arg = xlogreader;
1927 5495878 : errcallback.previous = error_context_stack;
1928 5495878 : error_context_stack = &errcallback;
1929 :
1930 : /*
1931 : * TransamVariables->nextXid must be beyond record's xid.
1932 : */
1933 5495878 : AdvanceNextFullTransactionIdPastXid(record->xl_xid);
1934 :
1935 : /*
1936 : * Before replaying this record, check if this record causes the current
1937 : * timeline to change. The record is already considered to be part of the
1938 : * new timeline, so we update replayTLI before replaying it. That's
1939 : * important so that replayEndTLI, which is recorded as the minimum
1940 : * recovery point's TLI if recovery stops after this record, is set
1941 : * correctly.
1942 : */
1943 5495878 : if (record->xl_rmid == RM_XLOG_ID)
1944 : {
1945 85974 : TimeLineID newReplayTLI = *replayTLI;
1946 85974 : TimeLineID prevReplayTLI = *replayTLI;
1947 85974 : uint8 info = record->xl_info & ~XLR_INFO_MASK;
1948 :
1949 85974 : if (info == XLOG_CHECKPOINT_SHUTDOWN)
1950 : {
1951 : CheckPoint checkPoint;
1952 :
1953 68 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1954 68 : newReplayTLI = checkPoint.ThisTimeLineID;
1955 68 : prevReplayTLI = checkPoint.PrevTimeLineID;
1956 : }
1957 85906 : else if (info == XLOG_END_OF_RECOVERY)
1958 : {
1959 : xl_end_of_recovery xlrec;
1960 :
1961 20 : memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1962 20 : newReplayTLI = xlrec.ThisTimeLineID;
1963 20 : prevReplayTLI = xlrec.PrevTimeLineID;
1964 : }
1965 :
1966 85974 : if (newReplayTLI != *replayTLI)
1967 : {
1968 : /* Check that it's OK to switch to this TLI */
1969 22 : checkTimeLineSwitch(xlogreader->EndRecPtr,
1970 : newReplayTLI, prevReplayTLI, *replayTLI);
1971 :
1972 : /* Following WAL records should be run with new TLI */
1973 22 : *replayTLI = newReplayTLI;
1974 22 : switchedTLI = true;
1975 : }
1976 : }
1977 :
1978 : /*
1979 : * Update shared replayEndRecPtr before replaying this record, so that
1980 : * XLogFlush will update minRecoveryPoint correctly.
1981 : */
1982 5495878 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1983 5495878 : XLogRecoveryCtl->replayEndRecPtr = xlogreader->EndRecPtr;
1984 5495878 : XLogRecoveryCtl->replayEndTLI = *replayTLI;
1985 5495878 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
1986 :
1987 : /*
1988 : * If we are attempting to enter Hot Standby mode, process XIDs we see
1989 : */
1990 5495878 : if (standbyState >= STANDBY_INITIALIZED &&
1991 5008248 : TransactionIdIsValid(record->xl_xid))
1992 4901994 : RecordKnownAssignedTransactionIds(record->xl_xid);
1993 :
1994 : /*
1995 : * Some XLOG record types that are related to recovery are processed
1996 : * directly here, rather than in xlog_redo()
1997 : */
1998 5495878 : if (record->xl_rmid == RM_XLOG_ID)
1999 85974 : xlogrecovery_redo(xlogreader, *replayTLI);
2000 :
2001 : /* Now apply the WAL record itself */
2002 5495878 : GetRmgr(record->xl_rmid).rm_redo(xlogreader);
2003 :
2004 : /*
2005 : * After redo, check whether the backup pages associated with the WAL
2006 : * record are consistent with the existing pages. This check is done only
2007 : * if consistency check is enabled for this record.
2008 : */
2009 5495874 : if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
2010 4334724 : verifyBackupPageConsistency(xlogreader);
2011 :
2012 : /* Pop the error context stack */
2013 5495874 : error_context_stack = errcallback.previous;
2014 :
2015 : /*
2016 : * Update lastReplayedEndRecPtr after this record has been successfully
2017 : * replayed.
2018 : */
2019 5495874 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
2020 5495874 : XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
2021 5495874 : XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
2022 5495874 : XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
2023 5495874 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
2024 :
2025 : /* ------
2026 : * Wakeup walsenders:
2027 : *
2028 : * On the standby, the WAL is flushed first (which will only wake up
2029 : * physical walsenders) and then applied, which will only wake up logical
2030 : * walsenders.
2031 : *
2032 : * Indeed, logical walsenders on standby can't decode and send data until
2033 : * it's been applied.
2034 : *
2035 : * Physical walsenders don't need to be woken up during replay unless
2036 : * cascading replication is allowed and time line change occurred (so that
2037 : * they can notice that they are on a new time line).
2038 : *
2039 : * That's why the wake up conditions are for:
2040 : *
2041 : * - physical walsenders in case of new time line and cascade
2042 : * replication is allowed
2043 : * - logical walsenders in case cascade replication is allowed (could not
2044 : * be created otherwise)
2045 : * ------
2046 : */
2047 5495874 : if (AllowCascadeReplication())
2048 5117510 : WalSndWakeup(switchedTLI, true);
2049 :
2050 : /*
2051 : * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2052 : * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2053 : * a reply to the primary.
2054 : */
2055 5495874 : if (doRequestWalReceiverReply)
2056 : {
2057 4 : doRequestWalReceiverReply = false;
2058 4 : WalRcvForceReply();
2059 : }
2060 :
2061 : /* Allow read-only connections if we're consistent now */
2062 5495874 : CheckRecoveryConsistency();
2063 :
2064 : /* Is this a timeline switch? */
2065 5495874 : if (switchedTLI)
2066 : {
2067 : /*
2068 : * Before we continue on the new timeline, clean up any (possibly
2069 : * bogus) future WAL segments on the old timeline.
2070 : */
2071 22 : RemoveNonParentXlogFiles(xlogreader->EndRecPtr, *replayTLI);
2072 :
2073 : /* Reset the prefetcher. */
2074 22 : XLogPrefetchReconfigure();
2075 : }
2076 5495874 : }
2077 :
2078 : /*
2079 : * Some XLOG RM record types that are directly related to WAL recovery are
2080 : * handled here rather than in the xlog_redo()
2081 : */
2082 : static void
2083 85974 : xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
2084 : {
2085 85974 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2086 85974 : XLogRecPtr lsn = record->EndRecPtr;
2087 :
2088 : Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2089 :
2090 85974 : if (info == XLOG_OVERWRITE_CONTRECORD)
2091 : {
2092 : /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2093 : xl_overwrite_contrecord xlrec;
2094 :
2095 2 : memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
2096 2 : if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2097 0 : elog(FATAL, "mismatching overwritten LSN %X/%08X -> %X/%08X",
2098 : LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2099 : LSN_FORMAT_ARGS(record->overwrittenRecPtr));
2100 :
2101 : /* We have safely skipped the aborted record */
2102 2 : abortedRecPtr = InvalidXLogRecPtr;
2103 2 : missingContrecPtr = InvalidXLogRecPtr;
2104 :
2105 2 : ereport(LOG,
2106 : errmsg("successfully skipped missing contrecord at %X/%08X, overwritten at %s",
2107 : LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2108 : timestamptz_to_str(xlrec.overwrite_time)));
2109 :
2110 : /* Verifying the record should only happen once */
2111 2 : record->overwrittenRecPtr = InvalidXLogRecPtr;
2112 : }
2113 85972 : else if (info == XLOG_BACKUP_END)
2114 : {
2115 : XLogRecPtr startpoint;
2116 :
2117 170 : memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2118 :
2119 170 : if (backupStartPoint == startpoint)
2120 : {
2121 : /*
2122 : * We have reached the end of base backup, the point where
2123 : * pg_backup_stop() was done. The data on disk is now consistent
2124 : * (assuming we have also reached minRecoveryPoint). Set
2125 : * backupEndPoint to the current LSN, so that the next call to
2126 : * CheckRecoveryConsistency() will notice it and do the
2127 : * end-of-backup processing.
2128 : */
2129 138 : elog(DEBUG1, "end of backup record reached");
2130 :
2131 138 : backupEndPoint = lsn;
2132 : }
2133 : else
2134 32 : elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%08X, waiting for %X/%08X",
2135 : LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint));
2136 : }
2137 85974 : }
2138 :
2139 : /*
2140 : * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2141 : * directories.
2142 : *
2143 : * Replay of database creation XLOG records for databases that were later
2144 : * dropped can create fake directories in pg_tblspc. By the time consistency
2145 : * is reached these directories should have been removed; here we verify
2146 : * that this did indeed happen. This is to be called at the point where
2147 : * consistent state is reached.
2148 : *
2149 : * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2150 : * useful for testing purposes, and also allows for an escape hatch in case
2151 : * things go south.
2152 : */
2153 : static void
2154 226 : CheckTablespaceDirectory(void)
2155 : {
2156 : DIR *dir;
2157 : struct dirent *de;
2158 :
2159 226 : dir = AllocateDir(PG_TBLSPC_DIR);
2160 692 : while ((de = ReadDir(dir, PG_TBLSPC_DIR)) != NULL)
2161 : {
2162 : char path[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
2163 :
2164 : /* Skip entries of non-oid names */
2165 466 : if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2166 452 : continue;
2167 :
2168 14 : snprintf(path, sizeof(path), "%s/%s", PG_TBLSPC_DIR, de->d_name);
2169 :
2170 14 : if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2171 8 : ereport(allow_in_place_tablespaces ? WARNING : PANIC,
2172 : (errcode(ERRCODE_DATA_CORRUPTED),
2173 : errmsg("unexpected directory entry \"%s\" found in %s",
2174 : de->d_name, PG_TBLSPC_DIR),
2175 : errdetail("All directory entries in %s/ should be symbolic links.",
2176 : PG_TBLSPC_DIR),
2177 : errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
2178 : }
2179 226 : }
2180 :
2181 : /*
2182 : * Checks if recovery has reached a consistent state. When consistency is
2183 : * reached and we have a valid starting standby snapshot, tell postmaster
2184 : * that it can start accepting read-only connections.
2185 : */
2186 : static void
2187 5496304 : CheckRecoveryConsistency(void)
2188 : {
2189 : XLogRecPtr lastReplayedEndRecPtr;
2190 : TimeLineID lastReplayedTLI;
2191 :
2192 : /*
2193 : * During crash recovery, we don't reach a consistent state until we've
2194 : * replayed all the WAL.
2195 : */
2196 5496304 : if (XLogRecPtrIsInvalid(minRecoveryPoint))
2197 517122 : return;
2198 :
2199 : Assert(InArchiveRecovery);
2200 :
2201 : /*
2202 : * assume that we are called in the startup process, and hence don't need
2203 : * a lock to read lastReplayedEndRecPtr
2204 : */
2205 4979182 : lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2206 4979182 : lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2207 :
2208 : /*
2209 : * Have we reached the point where our base backup was completed?
2210 : */
2211 4979182 : if (!XLogRecPtrIsInvalid(backupEndPoint) &&
2212 202 : backupEndPoint <= lastReplayedEndRecPtr)
2213 : {
2214 142 : XLogRecPtr saveBackupStartPoint = backupStartPoint;
2215 142 : XLogRecPtr saveBackupEndPoint = backupEndPoint;
2216 :
2217 142 : elog(DEBUG1, "end of backup reached");
2218 :
2219 : /*
2220 : * We have reached the end of base backup, as indicated by pg_control.
2221 : * Update the control file accordingly.
2222 : */
2223 142 : ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
2224 142 : backupStartPoint = InvalidXLogRecPtr;
2225 142 : backupEndPoint = InvalidXLogRecPtr;
2226 142 : backupEndRequired = false;
2227 :
2228 142 : ereport(LOG,
2229 : errmsg("completed backup recovery with redo LSN %X/%08X and end LSN %X/%08X",
2230 : LSN_FORMAT_ARGS(saveBackupStartPoint),
2231 : LSN_FORMAT_ARGS(saveBackupEndPoint)));
2232 : }
2233 :
2234 : /*
2235 : * Have we passed our safe starting point? Note that minRecoveryPoint is
2236 : * known to be incorrectly set if recovering from a backup, until the
2237 : * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2238 : * All we know prior to that is that we're not consistent yet.
2239 : */
2240 4979182 : if (!reachedConsistency && !backupEndRequired &&
2241 15240 : minRecoveryPoint <= lastReplayedEndRecPtr)
2242 : {
2243 : /*
2244 : * Check to see if the XLOG sequence contained any unresolved
2245 : * references to uninitialized pages.
2246 : */
2247 226 : XLogCheckInvalidPages();
2248 :
2249 : /*
2250 : * Check that pg_tblspc doesn't contain any real directories. Replay
2251 : * of Database/CREATE_* records may have created fictitious tablespace
2252 : * directories that should have been removed by the time consistency
2253 : * was reached.
2254 : */
2255 226 : CheckTablespaceDirectory();
2256 :
2257 226 : reachedConsistency = true;
2258 226 : SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
2259 226 : ereport(LOG,
2260 : errmsg("consistent recovery state reached at %X/%08X",
2261 : LSN_FORMAT_ARGS(lastReplayedEndRecPtr)));
2262 : }
2263 :
2264 : /*
2265 : * Have we got a valid starting snapshot that will allow queries to be
2266 : * run? If so, we can tell postmaster that the database is consistent now,
2267 : * enabling connections.
2268 : */
2269 4979182 : if (standbyState == STANDBY_SNAPSHOT_READY &&
2270 4978740 : !LocalHotStandbyActive &&
2271 210 : reachedConsistency &&
2272 : IsUnderPostmaster)
2273 : {
2274 210 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
2275 210 : XLogRecoveryCtl->SharedHotStandbyActive = true;
2276 210 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
2277 :
2278 210 : LocalHotStandbyActive = true;
2279 :
2280 210 : SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
2281 : }
2282 : }
2283 :
2284 : /*
2285 : * Error context callback for errors occurring during rm_redo().
2286 : */
2287 : static void
2288 196 : rm_redo_error_callback(void *arg)
2289 : {
2290 196 : XLogReaderState *record = (XLogReaderState *) arg;
2291 : StringInfoData buf;
2292 :
2293 196 : initStringInfo(&buf);
2294 196 : xlog_outdesc(&buf, record);
2295 196 : xlog_block_info(&buf, record);
2296 :
2297 : /* translator: %s is a WAL record description */
2298 196 : errcontext("WAL redo at %X/%08X for %s",
2299 196 : LSN_FORMAT_ARGS(record->ReadRecPtr),
2300 : buf.data);
2301 :
2302 196 : pfree(buf.data);
2303 196 : }
2304 :
2305 : /*
2306 : * Returns a string describing an XLogRecord, consisting of its identity
2307 : * optionally followed by a colon, a space, and a further description.
2308 : */
2309 : void
2310 196 : xlog_outdesc(StringInfo buf, XLogReaderState *record)
2311 : {
2312 196 : RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2313 196 : uint8 info = XLogRecGetInfo(record);
2314 : const char *id;
2315 :
2316 196 : appendStringInfoString(buf, rmgr.rm_name);
2317 196 : appendStringInfoChar(buf, '/');
2318 :
2319 196 : id = rmgr.rm_identify(info);
2320 196 : if (id == NULL)
2321 0 : appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2322 : else
2323 196 : appendStringInfo(buf, "%s: ", id);
2324 :
2325 196 : rmgr.rm_desc(buf, record);
2326 196 : }
2327 :
#ifdef WAL_DEBUG

/*
 * Append the record's prev pointer, xid and data length to 'buf', followed
 * by its block references.  Only compiled when WAL_DEBUG is defined.
 */
static void
xlog_outrec(StringInfo buf, XLogReaderState *record)
{
	XLogRecPtr	prev = XLogRecGetPrev(record);

	appendStringInfo(buf, "prev %X/%08X; xid %u",
					 LSN_FORMAT_ARGS(prev),
					 XLogRecGetXid(record));
	appendStringInfo(buf, "; len %u", XLogRecGetDataLen(record));

	xlog_block_info(buf, record);
}
#endif							/* WAL_DEBUG */
2343 :
2344 : /*
2345 : * Returns a string giving information about all the blocks in an
2346 : * XLogRecord.
2347 : */
2348 : static void
2349 196 : xlog_block_info(StringInfo buf, XLogReaderState *record)
2350 : {
2351 : int block_id;
2352 :
2353 : /* decode block references */
2354 280 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2355 : {
2356 : RelFileLocator rlocator;
2357 : ForkNumber forknum;
2358 : BlockNumber blk;
2359 :
2360 84 : if (!XLogRecGetBlockTagExtended(record, block_id,
2361 : &rlocator, &forknum, &blk, NULL))
2362 0 : continue;
2363 :
2364 84 : if (forknum != MAIN_FORKNUM)
2365 10 : appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2366 : block_id,
2367 : rlocator.spcOid, rlocator.dbOid,
2368 : rlocator.relNumber,
2369 : forknum,
2370 : blk);
2371 : else
2372 74 : appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2373 : block_id,
2374 : rlocator.spcOid, rlocator.dbOid,
2375 : rlocator.relNumber,
2376 : blk);
2377 84 : if (XLogRecHasBlockImage(record, block_id))
2378 46 : appendStringInfoString(buf, " FPW");
2379 : }
2380 196 : }
2381 :
2382 :
2383 : /*
2384 : * Check that it's OK to switch to new timeline during recovery.
2385 : *
2386 : * 'lsn' is the address of the shutdown checkpoint record we're about to
2387 : * replay. (Currently, timeline can only change at a shutdown checkpoint).
2388 : */
2389 : static void
2390 22 : checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI,
2391 : TimeLineID replayTLI)
2392 : {
2393 : /* Check that the record agrees on what the current (old) timeline is */
2394 22 : if (prevTLI != replayTLI)
2395 0 : ereport(PANIC,
2396 : (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2397 : prevTLI, replayTLI)));
2398 :
2399 : /*
2400 : * The new timeline better be in the list of timelines we expect to see,
2401 : * according to the timeline history. It should also not decrease.
2402 : */
2403 22 : if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2404 0 : ereport(PANIC,
2405 : (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2406 : newTLI, replayTLI)));
2407 :
2408 : /*
2409 : * If we have not yet reached min recovery point, and we're about to
2410 : * switch to a timeline greater than the timeline of the min recovery
2411 : * point: trouble. After switching to the new timeline, we could not
2412 : * possibly visit the min recovery point on the correct timeline anymore.
2413 : * This can happen if there is a newer timeline in the archive that
2414 : * branched before the timeline the min recovery point is on, and you
2415 : * attempt to do PITR to the new timeline.
2416 : */
2417 22 : if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
2418 18 : lsn < minRecoveryPoint &&
2419 2 : newTLI > minRecoveryPointTLI)
2420 0 : ereport(PANIC,
2421 : errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%08X on timeline %u",
2422 : newTLI,
2423 : LSN_FORMAT_ARGS(minRecoveryPoint),
2424 : minRecoveryPointTLI));
2425 :
2426 : /* Looks good */
2427 22 : }
2428 :
2429 :
2430 : /*
2431 : * Extract timestamp from WAL record.
2432 : *
2433 : * If the record contains a timestamp, returns true, and saves the timestamp
2434 : * in *recordXtime. If the record type has no timestamp, returns false.
2435 : * Currently, only transaction commit/abort records and restore points contain
2436 : * timestamps.
2437 : */
2438 : static bool
2439 85684 : getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
2440 : {
2441 85684 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2442 85684 : uint8 xact_info = info & XLOG_XACT_OPMASK;
2443 85684 : uint8 rmid = XLogRecGetRmid(record);
2444 :
2445 85684 : if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2446 : {
2447 4 : *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2448 4 : return true;
2449 : }
2450 85680 : if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2451 : xact_info == XLOG_XACT_COMMIT_PREPARED))
2452 : {
2453 78556 : *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2454 78556 : return true;
2455 : }
2456 7124 : if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2457 : xact_info == XLOG_XACT_ABORT_PREPARED))
2458 : {
2459 7124 : *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2460 7124 : return true;
2461 : }
2462 0 : return false;
2463 : }
2464 :
2465 : /*
2466 : * Checks whether the current buffer page and backup page stored in the
2467 : * WAL record are consistent or not. Before comparing the two pages, a
2468 : * masking can be applied to the pages to ignore certain areas like hint bits,
2469 : * unused space between pd_lower and pd_upper among other things. This
2470 : * function should be called once WAL replay has been completed for a
2471 : * given record.
2472 : */
2473 : static void
2474 4334724 : verifyBackupPageConsistency(XLogReaderState *record)
2475 : {
2476 4334724 : RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2477 : RelFileLocator rlocator;
2478 : ForkNumber forknum;
2479 : BlockNumber blkno;
2480 : int block_id;
2481 :
2482 : /* Records with no backup blocks have no need for consistency checks. */
2483 4334724 : if (!XLogRecHasAnyBlockRefs(record))
2484 114 : return;
2485 :
2486 : Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
2487 :
2488 9004500 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2489 : {
2490 : Buffer buf;
2491 : Page page;
2492 :
2493 4669890 : if (!XLogRecGetBlockTagExtended(record, block_id,
2494 : &rlocator, &forknum, &blkno, NULL))
2495 : {
2496 : /*
2497 : * WAL record doesn't contain a block reference with the given id.
2498 : * Do nothing.
2499 : */
2500 3962 : continue;
2501 : }
2502 :
2503 : Assert(XLogRecHasBlockImage(record, block_id));
2504 :
2505 4665928 : if (XLogRecBlockImageApply(record, block_id))
2506 : {
2507 : /*
2508 : * WAL record has already applied the page, so bypass the
2509 : * consistency check as that would result in comparing the full
2510 : * page stored in the record with itself.
2511 : */
2512 49714 : continue;
2513 : }
2514 :
2515 : /*
2516 : * Read the contents from the current buffer and store it in a
2517 : * temporary page.
2518 : */
2519 4616214 : buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2520 : RBM_NORMAL_NO_LOG,
2521 : InvalidBuffer);
2522 4616214 : if (!BufferIsValid(buf))
2523 0 : continue;
2524 :
2525 4616214 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2526 4616214 : page = BufferGetPage(buf);
2527 :
2528 : /*
2529 : * Take a copy of the local page where WAL has been applied to have a
2530 : * comparison base before masking it...
2531 : */
2532 4616214 : memcpy(replay_image_masked, page, BLCKSZ);
2533 :
2534 : /* No need for this page anymore now that a copy is in. */
2535 4616214 : UnlockReleaseBuffer(buf);
2536 :
2537 : /*
2538 : * If the block LSN is already ahead of this WAL record, we can't
2539 : * expect contents to match. This can happen if recovery is
2540 : * restarted.
2541 : */
2542 4616214 : if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
2543 0 : continue;
2544 :
2545 : /*
2546 : * Read the contents from the backup copy, stored in WAL record and
2547 : * store it in a temporary page. There is no need to allocate a new
2548 : * page here, a local buffer is fine to hold its contents and a mask
2549 : * can be directly applied on it.
2550 : */
2551 4616214 : if (!RestoreBlockImage(record, block_id, primary_image_masked))
2552 0 : ereport(ERROR,
2553 : (errcode(ERRCODE_INTERNAL_ERROR),
2554 : errmsg_internal("%s", record->errormsg_buf)));
2555 :
2556 : /*
2557 : * If masking function is defined, mask both the primary and replay
2558 : * images
2559 : */
2560 4616214 : if (rmgr.rm_mask != NULL)
2561 : {
2562 4616214 : rmgr.rm_mask(replay_image_masked, blkno);
2563 4616214 : rmgr.rm_mask(primary_image_masked, blkno);
2564 : }
2565 :
2566 : /* Time to compare the primary and replay images. */
2567 4616214 : if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2568 : {
2569 0 : elog(FATAL,
2570 : "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2571 : rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2572 : forknum, blkno);
2573 : }
2574 : }
2575 : }
2576 :
2577 : /*
2578 : * For point-in-time recovery, this function decides whether we want to
2579 : * stop applying the XLOG before the current record.
2580 : *
2581 : * Returns true if we are stopping, false otherwise. If stopping, some
2582 : * information is saved in recoveryStopXid et al for use in annotating the
2583 : * new timeline's history file.
2584 : */
2585 : static bool
2586 5495882 : recoveryStopsBefore(XLogReaderState *record)
2587 : {
2588 5495882 : bool stopsHere = false;
2589 : uint8 xact_info;
2590 : bool isCommit;
2591 5495882 : TimestampTz recordXtime = 0;
2592 : TransactionId recordXid;
2593 :
2594 : /*
2595 : * Ignore recovery target settings when not in archive recovery (meaning
2596 : * we are in crash recovery).
2597 : */
2598 5495882 : if (!ArchiveRecoveryRequested)
2599 487602 : return false;
2600 :
2601 : /* Check if we should stop as soon as reaching consistency */
2602 5008280 : if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2603 : {
2604 0 : ereport(LOG,
2605 : (errmsg("recovery stopping after reaching consistency")));
2606 :
2607 0 : recoveryStopAfter = false;
2608 0 : recoveryStopXid = InvalidTransactionId;
2609 0 : recoveryStopLSN = InvalidXLogRecPtr;
2610 0 : recoveryStopTime = 0;
2611 0 : recoveryStopName[0] = '\0';
2612 0 : return true;
2613 : }
2614 :
2615 : /* Check if target LSN has been reached */
2616 5008280 : if (recoveryTarget == RECOVERY_TARGET_LSN &&
2617 16888 : !recoveryTargetInclusive &&
2618 826 : record->ReadRecPtr >= recoveryTargetLSN)
2619 : {
2620 4 : recoveryStopAfter = false;
2621 4 : recoveryStopXid = InvalidTransactionId;
2622 4 : recoveryStopLSN = record->ReadRecPtr;
2623 4 : recoveryStopTime = 0;
2624 4 : recoveryStopName[0] = '\0';
2625 4 : ereport(LOG,
2626 : errmsg("recovery stopping before WAL location (LSN) \"%X/%08X\"",
2627 : LSN_FORMAT_ARGS(recoveryStopLSN)));
2628 4 : return true;
2629 : }
2630 :
2631 : /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2632 5008276 : if (XLogRecGetRmid(record) != RM_XACT_ID)
2633 4964860 : return false;
2634 :
2635 43416 : xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2636 :
2637 43416 : if (xact_info == XLOG_XACT_COMMIT)
2638 : {
2639 39232 : isCommit = true;
2640 39232 : recordXid = XLogRecGetXid(record);
2641 : }
2642 4184 : else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2643 : {
2644 48 : xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2645 : xl_xact_parsed_commit parsed;
2646 :
2647 48 : isCommit = true;
2648 48 : ParseCommitRecord(XLogRecGetInfo(record),
2649 : xlrec,
2650 : &parsed);
2651 48 : recordXid = parsed.twophase_xid;
2652 : }
2653 4136 : else if (xact_info == XLOG_XACT_ABORT)
2654 : {
2655 3540 : isCommit = false;
2656 3540 : recordXid = XLogRecGetXid(record);
2657 : }
2658 596 : else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2659 : {
2660 22 : xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2661 : xl_xact_parsed_abort parsed;
2662 :
2663 22 : isCommit = false;
2664 22 : ParseAbortRecord(XLogRecGetInfo(record),
2665 : xlrec,
2666 : &parsed);
2667 22 : recordXid = parsed.twophase_xid;
2668 : }
2669 : else
2670 574 : return false;
2671 :
2672 42842 : if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
2673 : {
2674 : /*
2675 : * There can be only one transaction end record with this exact
2676 : * transactionid
2677 : *
2678 : * when testing for an xid, we MUST test for equality only, since
2679 : * transactions are numbered in the order they start, not the order
2680 : * they complete. A higher numbered xid will complete before you about
2681 : * 50% of the time...
2682 : */
2683 0 : stopsHere = (recordXid == recoveryTargetXid);
2684 : }
2685 :
2686 : /*
2687 : * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2688 : * We don't expect getRecordTimestamp ever to fail, since we already know
2689 : * this is a commit or abort record; but test its result anyway.
2690 : */
2691 42842 : if (getRecordTimestamp(record, &recordXtime) &&
2692 42842 : recoveryTarget == RECOVERY_TARGET_TIME)
2693 : {
2694 : /*
2695 : * There can be many transactions that share the same commit time, so
2696 : * we stop after the last one, if we are inclusive, or stop at the
2697 : * first one if we are exclusive
2698 : */
2699 0 : if (recoveryTargetInclusive)
2700 0 : stopsHere = (recordXtime > recoveryTargetTime);
2701 : else
2702 0 : stopsHere = (recordXtime >= recoveryTargetTime);
2703 : }
2704 :
2705 42842 : if (stopsHere)
2706 : {
2707 0 : recoveryStopAfter = false;
2708 0 : recoveryStopXid = recordXid;
2709 0 : recoveryStopTime = recordXtime;
2710 0 : recoveryStopLSN = InvalidXLogRecPtr;
2711 0 : recoveryStopName[0] = '\0';
2712 :
2713 0 : if (isCommit)
2714 : {
2715 0 : ereport(LOG,
2716 : (errmsg("recovery stopping before commit of transaction %u, time %s",
2717 : recoveryStopXid,
2718 : timestamptz_to_str(recoveryStopTime))));
2719 : }
2720 : else
2721 : {
2722 0 : ereport(LOG,
2723 : (errmsg("recovery stopping before abort of transaction %u, time %s",
2724 : recoveryStopXid,
2725 : timestamptz_to_str(recoveryStopTime))));
2726 : }
2727 : }
2728 :
2729 42842 : return stopsHere;
2730 : }
2731 :
2732 : /*
2733 : * Same as recoveryStopsBefore, but called after applying the record.
2734 : *
2735 : * We also track the timestamp of the latest applied COMMIT/ABORT
2736 : * record in XLogRecoveryCtl->recoveryLastXTime.
2737 : */
2738 : static bool
2739 5495874 : recoveryStopsAfter(XLogReaderState *record)
2740 : {
2741 : uint8 info;
2742 : uint8 xact_info;
2743 : uint8 rmid;
2744 5495874 : TimestampTz recordXtime = 0;
2745 :
2746 : /*
2747 : * Ignore recovery target settings when not in archive recovery (meaning
2748 : * we are in crash recovery).
2749 : */
2750 5495874 : if (!ArchiveRecoveryRequested)
2751 487602 : return false;
2752 :
2753 5008272 : info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2754 5008272 : rmid = XLogRecGetRmid(record);
2755 :
2756 : /*
2757 : * There can be many restore points that share the same name; we stop at
2758 : * the first one.
2759 : */
2760 5008272 : if (recoveryTarget == RECOVERY_TARGET_NAME &&
2761 40 : rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2762 : {
2763 : xl_restore_point *recordRestorePointData;
2764 :
2765 6 : recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2766 :
2767 6 : if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2768 : {
2769 4 : recoveryStopAfter = true;
2770 4 : recoveryStopXid = InvalidTransactionId;
2771 4 : recoveryStopLSN = InvalidXLogRecPtr;
2772 4 : (void) getRecordTimestamp(record, &recoveryStopTime);
2773 4 : strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2774 :
2775 4 : ereport(LOG,
2776 : (errmsg("recovery stopping at restore point \"%s\", time %s",
2777 : recoveryStopName,
2778 : timestamptz_to_str(recoveryStopTime))));
2779 4 : return true;
2780 : }
2781 : }
2782 :
2783 : /* Check if the target LSN has been reached */
2784 5008268 : if (recoveryTarget == RECOVERY_TARGET_LSN &&
2785 16062 : recoveryTargetInclusive &&
2786 16062 : record->ReadRecPtr >= recoveryTargetLSN)
2787 : {
2788 6 : recoveryStopAfter = true;
2789 6 : recoveryStopXid = InvalidTransactionId;
2790 6 : recoveryStopLSN = record->ReadRecPtr;
2791 6 : recoveryStopTime = 0;
2792 6 : recoveryStopName[0] = '\0';
2793 6 : ereport(LOG,
2794 : errmsg("recovery stopping after WAL location (LSN) \"%X/%08X\"",
2795 : LSN_FORMAT_ARGS(recoveryStopLSN)));
2796 6 : return true;
2797 : }
2798 :
2799 5008262 : if (rmid != RM_XACT_ID)
2800 4964850 : return false;
2801 :
2802 43412 : xact_info = info & XLOG_XACT_OPMASK;
2803 :
2804 43412 : if (xact_info == XLOG_XACT_COMMIT ||
2805 4136 : xact_info == XLOG_XACT_COMMIT_PREPARED ||
2806 596 : xact_info == XLOG_XACT_ABORT ||
2807 : xact_info == XLOG_XACT_ABORT_PREPARED)
2808 : {
2809 : TransactionId recordXid;
2810 :
2811 : /* Update the last applied transaction timestamp */
2812 42838 : if (getRecordTimestamp(record, &recordXtime))
2813 42838 : SetLatestXTime(recordXtime);
2814 :
2815 : /* Extract the XID of the committed/aborted transaction */
2816 42838 : if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2817 : {
2818 48 : xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2819 : xl_xact_parsed_commit parsed;
2820 :
2821 48 : ParseCommitRecord(XLogRecGetInfo(record),
2822 : xlrec,
2823 : &parsed);
2824 48 : recordXid = parsed.twophase_xid;
2825 : }
2826 42790 : else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2827 : {
2828 22 : xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2829 : xl_xact_parsed_abort parsed;
2830 :
2831 22 : ParseAbortRecord(XLogRecGetInfo(record),
2832 : xlrec,
2833 : &parsed);
2834 22 : recordXid = parsed.twophase_xid;
2835 : }
2836 : else
2837 42768 : recordXid = XLogRecGetXid(record);
2838 :
2839 : /*
2840 : * There can be only one transaction end record with this exact
2841 : * transactionid
2842 : *
2843 : * when testing for an xid, we MUST test for equality only, since
2844 : * transactions are numbered in the order they start, not the order
2845 : * they complete. A higher numbered xid will complete before you about
2846 : * 50% of the time...
2847 : */
2848 42838 : if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
2849 0 : recordXid == recoveryTargetXid)
2850 : {
2851 0 : recoveryStopAfter = true;
2852 0 : recoveryStopXid = recordXid;
2853 0 : recoveryStopTime = recordXtime;
2854 0 : recoveryStopLSN = InvalidXLogRecPtr;
2855 0 : recoveryStopName[0] = '\0';
2856 :
2857 0 : if (xact_info == XLOG_XACT_COMMIT ||
2858 : xact_info == XLOG_XACT_COMMIT_PREPARED)
2859 : {
2860 0 : ereport(LOG,
2861 : (errmsg("recovery stopping after commit of transaction %u, time %s",
2862 : recoveryStopXid,
2863 : timestamptz_to_str(recoveryStopTime))));
2864 : }
2865 0 : else if (xact_info == XLOG_XACT_ABORT ||
2866 : xact_info == XLOG_XACT_ABORT_PREPARED)
2867 : {
2868 0 : ereport(LOG,
2869 : (errmsg("recovery stopping after abort of transaction %u, time %s",
2870 : recoveryStopXid,
2871 : timestamptz_to_str(recoveryStopTime))));
2872 : }
2873 0 : return true;
2874 : }
2875 : }
2876 :
2877 : /* Check if we should stop as soon as reaching consistency */
2878 43412 : if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2879 : {
2880 0 : ereport(LOG,
2881 : (errmsg("recovery stopping after reaching consistency")));
2882 :
2883 0 : recoveryStopAfter = true;
2884 0 : recoveryStopXid = InvalidTransactionId;
2885 0 : recoveryStopTime = 0;
2886 0 : recoveryStopLSN = InvalidXLogRecPtr;
2887 0 : recoveryStopName[0] = '\0';
2888 0 : return true;
2889 : }
2890 :
2891 43412 : return false;
2892 : }
2893 :
2894 : /*
2895 : * Create a comment for the history file to explain why and where
2896 : * timeline changed.
2897 : */
2898 : static char *
2899 1778 : getRecoveryStopReason(void)
2900 : {
2901 : char reason[200];
2902 :
2903 1778 : if (recoveryTarget == RECOVERY_TARGET_XID)
2904 0 : snprintf(reason, sizeof(reason),
2905 : "%s transaction %u",
2906 0 : recoveryStopAfter ? "after" : "before",
2907 : recoveryStopXid);
2908 1778 : else if (recoveryTarget == RECOVERY_TARGET_TIME)
2909 0 : snprintf(reason, sizeof(reason),
2910 : "%s %s\n",
2911 0 : recoveryStopAfter ? "after" : "before",
2912 : timestamptz_to_str(recoveryStopTime));
2913 1778 : else if (recoveryTarget == RECOVERY_TARGET_LSN)
2914 14 : snprintf(reason, sizeof(reason),
2915 : "%s LSN %X/%08X\n",
2916 14 : recoveryStopAfter ? "after" : "before",
2917 14 : LSN_FORMAT_ARGS(recoveryStopLSN));
2918 1764 : else if (recoveryTarget == RECOVERY_TARGET_NAME)
2919 6 : snprintf(reason, sizeof(reason),
2920 : "at restore point \"%s\"",
2921 : recoveryStopName);
2922 1758 : else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
2923 0 : snprintf(reason, sizeof(reason), "reached consistency");
2924 : else
2925 1758 : snprintf(reason, sizeof(reason), "no recovery target specified");
2926 :
2927 1778 : return pstrdup(reason);
2928 : }
2929 :
2930 : /*
2931 : * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2932 : *
2933 : * endOfRecovery is true if the recovery target is reached and
2934 : * the paused state starts at the end of recovery because of
2935 : * recovery_target_action=pause, and false otherwise.
2936 : */
2937 : static void
2938 6 : recoveryPausesHere(bool endOfRecovery)
2939 : {
2940 : /* Don't pause unless users can connect! */
2941 6 : if (!LocalHotStandbyActive)
2942 0 : return;
2943 :
2944 : /* Don't pause after standby promotion has been triggered */
2945 6 : if (LocalPromoteIsTriggered)
2946 0 : return;
2947 :
2948 6 : if (endOfRecovery)
2949 2 : ereport(LOG,
2950 : (errmsg("pausing at the end of recovery"),
2951 : errhint("Execute pg_wal_replay_resume() to promote.")));
2952 : else
2953 4 : ereport(LOG,
2954 : (errmsg("recovery has paused"),
2955 : errhint("Execute pg_wal_replay_resume() to continue.")));
2956 :
2957 : /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2958 18 : while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
2959 : {
2960 16 : ProcessStartupProcInterrupts();
2961 16 : if (CheckForStandbyTrigger())
2962 4 : return;
2963 :
2964 : /*
2965 : * If recovery pause is requested then set it paused. While we are in
2966 : * the loop, user might resume and pause again so set this every time.
2967 : */
2968 12 : ConfirmRecoveryPaused();
2969 :
2970 : /*
2971 : * We wait on a condition variable that will wake us as soon as the
2972 : * pause ends, but we use a timeout so we can check the above exit
2973 : * condition periodically too.
2974 : */
2975 12 : ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
2976 : WAIT_EVENT_RECOVERY_PAUSE);
2977 : }
2978 2 : ConditionVariableCancelSleep();
2979 : }
2980 :
2981 : /*
2982 : * When recovery_min_apply_delay is set, we wait long enough to make sure
2983 : * certain record types are applied at least that interval behind the primary.
2984 : *
2985 : * Returns true if we waited.
2986 : *
2987 : * Note that the delay is calculated between the WAL record log time and
2988 : * the current time on standby. We would prefer to keep track of when this
2989 : * standby received each WAL record, which would allow a more consistent
2990 : * approach and one not affected by time synchronisation issues, but that
2991 : * is significantly more effort and complexity for little actual gain in
2992 : * usability.
2993 : */
2994 : static bool
2995 5495878 : recoveryApplyDelay(XLogReaderState *record)
2996 : {
2997 : uint8 xact_info;
2998 : TimestampTz xtime;
2999 : TimestampTz delayUntil;
3000 : long msecs;
3001 :
3002 : /* nothing to do if no delay configured */
3003 5495878 : if (recovery_min_apply_delay <= 0)
3004 5495878 : return false;
3005 :
3006 : /* no delay is applied on a database not yet consistent */
3007 0 : if (!reachedConsistency)
3008 0 : return false;
3009 :
3010 : /* nothing to do if crash recovery is requested */
3011 0 : if (!ArchiveRecoveryRequested)
3012 0 : return false;
3013 :
3014 : /*
3015 : * Is it a COMMIT record?
3016 : *
3017 : * We deliberately choose not to delay aborts since they have no effect on
3018 : * MVCC. We already allow replay of records that don't have a timestamp,
3019 : * so there is already opportunity for issues caused by early conflicts on
3020 : * standbys.
3021 : */
3022 0 : if (XLogRecGetRmid(record) != RM_XACT_ID)
3023 0 : return false;
3024 :
3025 0 : xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
3026 :
3027 0 : if (xact_info != XLOG_XACT_COMMIT &&
3028 : xact_info != XLOG_XACT_COMMIT_PREPARED)
3029 0 : return false;
3030 :
3031 0 : if (!getRecordTimestamp(record, &xtime))
3032 0 : return false;
3033 :
3034 0 : delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
3035 :
3036 : /*
3037 : * Exit without arming the latch if it's already past time to apply this
3038 : * record
3039 : */
3040 0 : msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil);
3041 0 : if (msecs <= 0)
3042 0 : return false;
3043 :
3044 : while (true)
3045 : {
3046 0 : ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
3047 :
3048 : /* This might change recovery_min_apply_delay. */
3049 0 : ProcessStartupProcInterrupts();
3050 :
3051 0 : if (CheckForStandbyTrigger())
3052 0 : break;
3053 :
3054 : /*
3055 : * Recalculate delayUntil as recovery_min_apply_delay could have
3056 : * changed while waiting in this loop.
3057 : */
3058 0 : delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
3059 :
3060 : /*
3061 : * Wait for difference between GetCurrentTimestamp() and delayUntil.
3062 : */
3063 0 : msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
3064 : delayUntil);
3065 :
3066 0 : if (msecs <= 0)
3067 0 : break;
3068 :
3069 0 : elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
3070 :
3071 0 : (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3072 : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
3073 : msecs,
3074 : WAIT_EVENT_RECOVERY_APPLY_DELAY);
3075 : }
3076 0 : return true;
3077 : }
3078 :
3079 : /*
3080 : * Get the current state of the recovery pause request.
3081 : */
3082 : RecoveryPauseState
3083 28 : GetRecoveryPauseState(void)
3084 : {
3085 : RecoveryPauseState state;
3086 :
3087 28 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3088 28 : state = XLogRecoveryCtl->recoveryPauseState;
3089 28 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3090 :
3091 28 : return state;
3092 : }
3093 :
3094 : /*
3095 : * Set the recovery pause state.
3096 : *
3097 : * If recovery pause is requested then sets the recovery pause state to
3098 : * 'pause requested' if it is not already 'paused'. Otherwise, sets it
3099 : * to 'not paused' to resume the recovery. The recovery pause will be
3100 : * confirmed by the ConfirmRecoveryPaused.
3101 : */
3102 : void
3103 92 : SetRecoveryPause(bool recoveryPause)
3104 : {
3105 92 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3106 :
3107 92 : if (!recoveryPause)
3108 86 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
3109 6 : else if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_NOT_PAUSED)
3110 6 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED;
3111 :
3112 92 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3113 :
3114 92 : if (!recoveryPause)
3115 86 : ConditionVariableBroadcast(&XLogRecoveryCtl->recoveryNotPausedCV);
3116 92 : }
3117 :
3118 : /*
3119 : * Confirm the recovery pause by setting the recovery pause state to
3120 : * RECOVERY_PAUSED.
3121 : */
3122 : static void
3123 12 : ConfirmRecoveryPaused(void)
3124 : {
3125 : /* If recovery pause is requested then set it paused */
3126 12 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3127 12 : if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED)
3128 6 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSED;
3129 12 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3130 12 : }
3131 :
3132 :
3133 : /*
3134 : * Attempt to read the next XLOG record.
3135 : *
3136 : * Before first call, the reader needs to be positioned to the first record
3137 : * by calling XLogPrefetcherBeginRead().
3138 : *
3139 : * If no valid record is available, returns NULL, or fails if emode is PANIC.
3140 : * (emode must be either PANIC, LOG). In standby mode, retries until a valid
3141 : * record is available.
3142 : */
3143 : static XLogRecord *
3144 5500106 : ReadRecord(XLogPrefetcher *xlogprefetcher, int emode,
3145 : bool fetching_ckpt, TimeLineID replayTLI)
3146 : {
3147 : XLogRecord *record;
3148 5500106 : XLogReaderState *xlogreader = XLogPrefetcherGetReader(xlogprefetcher);
3149 5500106 : XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
3150 :
3151 : /* Pass through parameters to XLogPageRead */
3152 5500106 : private->fetching_ckpt = fetching_ckpt;
3153 5500106 : private->emode = emode;
3154 5500106 : private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
3155 5500106 : private->replayTLI = replayTLI;
3156 :
3157 : /* This is the first attempt to read this page. */
3158 5500106 : lastSourceFailed = false;
3159 :
3160 : for (;;)
3161 236 : {
3162 : char *errormsg;
3163 :
3164 5500342 : record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3165 5500232 : if (record == NULL)
3166 : {
3167 : /*
3168 : * When we find that WAL ends in an incomplete record, keep track
3169 : * of that record. After recovery is done, we'll write a record
3170 : * to indicate to downstream WAL readers that that portion is to
3171 : * be ignored.
3172 : *
3173 : * However, when ArchiveRecoveryRequested = true, we're going to
3174 : * switch to a new timeline at the end of recovery. We will only
3175 : * copy WAL over to the new timeline up to the end of the last
3176 : * complete record, so if we did this, we would later create an
3177 : * overwrite contrecord in the wrong place, breaking everything.
3178 : */
3179 534 : if (!ArchiveRecoveryRequested &&
3180 212 : !XLogRecPtrIsInvalid(xlogreader->abortedRecPtr))
3181 : {
3182 24 : abortedRecPtr = xlogreader->abortedRecPtr;
3183 24 : missingContrecPtr = xlogreader->missingContrecPtr;
3184 : }
3185 :
3186 534 : if (readFile >= 0)
3187 : {
3188 498 : close(readFile);
3189 498 : readFile = -1;
3190 : }
3191 :
3192 : /*
3193 : * We only end up here without a message when XLogPageRead()
3194 : * failed - in that case we already logged something. In
3195 : * StandbyMode that only happens if we have been triggered, so we
3196 : * shouldn't loop anymore in that case.
3197 : */
3198 534 : if (errormsg)
3199 498 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3200 : (errmsg_internal("%s", errormsg) /* already translated */ ));
3201 : }
3202 :
3203 : /*
3204 : * Check page TLI is one of the expected values.
3205 : */
3206 5499698 : else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
3207 : {
3208 : char fname[MAXFNAMELEN];
3209 : XLogSegNo segno;
3210 : int32 offset;
3211 :
3212 0 : XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
3213 0 : offset = XLogSegmentOffset(xlogreader->latestPagePtr,
3214 : wal_segment_size);
3215 0 : XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3216 : wal_segment_size);
3217 0 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3218 : errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%08X, offset %u",
3219 : xlogreader->latestPageTLI,
3220 : fname,
3221 : LSN_FORMAT_ARGS(xlogreader->latestPagePtr),
3222 : offset));
3223 0 : record = NULL;
3224 : }
3225 :
3226 5500232 : if (record)
3227 : {
3228 : /* Great, got a record */
3229 5499996 : return record;
3230 : }
3231 : else
3232 : {
3233 : /* No valid record available from this source */
3234 534 : lastSourceFailed = true;
3235 :
3236 : /*
3237 : * If archive recovery was requested, but we were still doing
3238 : * crash recovery, switch to archive recovery and retry using the
3239 : * offline archive. We have now replayed all the valid WAL in
3240 : * pg_wal, so we are presumably now consistent.
3241 : *
3242 : * We require that there's at least some valid WAL present in
3243 : * pg_wal, however (!fetching_ckpt). We could recover using the
3244 : * WAL from the archive, even if pg_wal is completely empty, but
3245 : * we'd have no idea how far we'd have to replay to reach
3246 : * consistency. So err on the safe side and give up.
3247 : */
3248 534 : if (!InArchiveRecovery && ArchiveRecoveryRequested &&
3249 4 : !fetching_ckpt)
3250 : {
3251 4 : ereport(DEBUG1,
3252 : (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3253 4 : InArchiveRecovery = true;
3254 4 : if (StandbyModeRequested)
3255 4 : EnableStandbyMode();
3256 :
3257 4 : SwitchIntoArchiveRecovery(xlogreader->EndRecPtr, replayTLI);
3258 4 : minRecoveryPoint = xlogreader->EndRecPtr;
3259 4 : minRecoveryPointTLI = replayTLI;
3260 :
3261 4 : CheckRecoveryConsistency();
3262 :
3263 : /*
3264 : * Before we retry, reset lastSourceFailed and currentSource
3265 : * so that we will check the archive next.
3266 : */
3267 4 : lastSourceFailed = false;
3268 4 : currentSource = XLOG_FROM_ANY;
3269 :
3270 236 : continue;
3271 : }
3272 :
3273 : /* In standby mode, loop back to retry. Otherwise, give up. */
3274 530 : if (StandbyMode && !CheckForStandbyTrigger())
3275 232 : continue;
3276 : else
3277 298 : return NULL;
3278 : }
3279 : }
3280 : }
3281 :
3282 : /*
3283 : * Read the XLOG page containing targetPagePtr into readBuf (if not read
3284 : * already). Returns number of bytes read, if the page is read successfully,
3285 : * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3286 : * but only if they have not been previously reported.
3287 : *
3288 : * See XLogReaderRoutine.page_read for more details.
3289 : *
3290 : * While prefetching, xlogreader->nonblocking may be set. In that case,
3291 : * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3292 : *
3293 : * This is responsible for restoring files from archive as needed, as well
3294 : * as for waiting for the requested WAL record to arrive in standby mode.
3295 : *
3296 : * xlogreader->private_data->emode specifies the log level used for reporting
3297 : * "file not found" or "end of WAL" situations in archive recovery, or in
3298 : * standby mode when promotion is triggered. If set to WARNING or below,
3299 : * XLogPageRead() returns XLREAD_FAIL in those situations, on higher log
3300 : * levels the ereport() won't return.
3301 : *
3302 : * In standby mode, if after a successful return of XLogPageRead() the
3303 : * caller finds the record it's interested in to be broken, it should
3304 : * ereport the error with the level determined by
3305 : * emode_for_corrupt_record(), and then set lastSourceFailed
3306 : * and call XLogPageRead() again with the same arguments. This lets
3307 : * XLogPageRead() try fetching the record from another source, or
3308 : * sleep and retry.
3309 : */
3310 : static int
3311 2825196 : XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
3312 : XLogRecPtr targetRecPtr, char *readBuf)
3313 : {
3314 2825196 : XLogPageReadPrivate *private =
3315 : (XLogPageReadPrivate *) xlogreader->private_data;
3316 2825196 : int emode = private->emode;
3317 : uint32 targetPageOff;
3318 : XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
3319 : int r;
3320 : instr_time io_start;
3321 :
3322 2825196 : XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3323 2825196 : targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3324 :
3325 : /*
3326 : * See if we need to switch to a new segment because the requested record
3327 : * is not in the currently open one.
3328 : */
3329 2825196 : if (readFile >= 0 &&
3330 2821802 : !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3331 : {
3332 : /*
3333 : * Request a restartpoint if we've replayed too much xlog since the
3334 : * last one.
3335 : */
3336 2998 : if (ArchiveRecoveryRequested && IsUnderPostmaster)
3337 : {
3338 2968 : if (XLogCheckpointNeeded(readSegNo))
3339 : {
3340 2758 : (void) GetRedoRecPtr();
3341 2758 : if (XLogCheckpointNeeded(readSegNo))
3342 2750 : RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
3343 : }
3344 : }
3345 :
3346 2998 : close(readFile);
3347 2998 : readFile = -1;
3348 2998 : readSource = XLOG_FROM_ANY;
3349 : }
3350 :
3351 2825196 : XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3352 :
3353 2825204 : retry:
3354 : /* See if we need to retrieve more data */
3355 2825204 : if (readFile < 0 ||
3356 2818804 : (readSource == XLOG_FROM_STREAM &&
3357 2794902 : flushedUpto < targetPagePtr + reqLen))
3358 : {
3359 16048 : if (readFile >= 0 &&
3360 9648 : xlogreader->nonblocking &&
3361 4686 : readSource == XLOG_FROM_STREAM &&
3362 4686 : flushedUpto < targetPagePtr + reqLen)
3363 4686 : return XLREAD_WOULDBLOCK;
3364 :
3365 11252 : switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3366 11362 : private->randAccess,
3367 11362 : private->fetching_ckpt,
3368 : targetRecPtr,
3369 : private->replayTLI,
3370 : xlogreader->EndRecPtr,
3371 11362 : xlogreader->nonblocking))
3372 : {
3373 934 : case XLREAD_WOULDBLOCK:
3374 934 : return XLREAD_WOULDBLOCK;
3375 68 : case XLREAD_FAIL:
3376 68 : if (readFile >= 0)
3377 0 : close(readFile);
3378 68 : readFile = -1;
3379 68 : readLen = 0;
3380 68 : readSource = XLOG_FROM_ANY;
3381 68 : return XLREAD_FAIL;
3382 10250 : case XLREAD_SUCCESS:
3383 10250 : break;
3384 : }
3385 : }
3386 :
3387 : /*
3388 : * At this point, we have the right segment open and if we're streaming we
3389 : * know the requested record is in it.
3390 : */
3391 : Assert(readFile != -1);
3392 :
3393 : /*
3394 : * If the current segment is being streamed from the primary, calculate
3395 : * how much of the current page we have received already. We know the
3396 : * requested record has been received, but this is for the benefit of
3397 : * future calls, to allow quick exit at the top of this function.
3398 : */
3399 2819406 : if (readSource == XLOG_FROM_STREAM)
3400 : {
3401 2792378 : if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3402 2789758 : readLen = XLOG_BLCKSZ;
3403 : else
3404 2620 : readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
3405 : targetPageOff;
3406 : }
3407 : else
3408 27028 : readLen = XLOG_BLCKSZ;
3409 :
3410 : /* Read the requested page */
3411 2819406 : readOff = targetPageOff;
3412 :
3413 : /* Measure I/O timing when reading segment */
3414 2819406 : io_start = pgstat_prepare_io_time(track_wal_io_timing);
3415 :
3416 2819406 : pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
3417 2819406 : r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
3418 2819406 : if (r != XLOG_BLCKSZ)
3419 : {
3420 : char fname[MAXFNAMELEN];
3421 0 : int save_errno = errno;
3422 :
3423 0 : pgstat_report_wait_end();
3424 :
3425 0 : pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
3426 : io_start, 1, r);
3427 :
3428 0 : XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
3429 0 : if (r < 0)
3430 : {
3431 0 : errno = save_errno;
3432 0 : ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3433 : (errcode_for_file_access(),
3434 : errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: %m",
3435 : fname, LSN_FORMAT_ARGS(targetPagePtr),
3436 : readOff)));
3437 : }
3438 : else
3439 0 : ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3440 : (errcode(ERRCODE_DATA_CORRUPTED),
3441 : errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: read %d of %zu",
3442 : fname, LSN_FORMAT_ARGS(targetPagePtr),
3443 : readOff, r, (Size) XLOG_BLCKSZ)));
3444 0 : goto next_record_is_invalid;
3445 : }
3446 2819406 : pgstat_report_wait_end();
3447 :
3448 2819406 : pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
3449 : io_start, 1, r);
3450 :
3451 : Assert(targetSegNo == readSegNo);
3452 : Assert(targetPageOff == readOff);
3453 : Assert(reqLen <= readLen);
3454 :
3455 2819406 : xlogreader->seg.ws_tli = curFileTLI;
3456 :
3457 : /*
3458 : * Check the page header immediately, so that we can retry immediately if
3459 : * it's not valid. This may seem unnecessary, because ReadPageInternal()
3460 : * validates the page header anyway, and would propagate the failure up to
3461 : * ReadRecord(), which would retry. However, there's a corner case with
3462 : * continuation records, if a record is split across two pages such that
3463 : * we would need to read the two pages from different sources across two
3464 : * WAL segments.
3465 : *
3466 : * The first page is only available locally, in pg_wal, because it's
3467 : * already been recycled on the primary. The second page, however, is not
3468 : * present in pg_wal, and we should stream it from the primary. There is a
3469 : * recycled WAL segment present in pg_wal, with garbage contents, however.
3470 : * We would read the first page from the local WAL segment, but when
3471 : * reading the second page, we would read the bogus, recycled, WAL
3472 : * segment. If we didn't catch that case here, we would never recover,
3473 : * because ReadRecord() would retry reading the whole record from the
3474 : * beginning.
3475 : *
3476 : * Of course, this only catches errors in the page header, which is what
	 * happens in the case of a recycled WAL segment. Other kinds of errors or
	 * corruption still have the same problem. But this at least fixes the
3479 : * common case, which can happen as part of normal operation.
3480 : *
3481 : * Validating the page header is cheap enough that doing it twice
3482 : * shouldn't be a big deal from a performance point of view.
3483 : *
3484 : * When not in standby mode, an invalid page header should cause recovery
3485 : * to end, not retry reading the page, so we don't need to validate the
3486 : * page header here for the retry. Instead, ReadPageInternal() is
3487 : * responsible for the validation.
3488 : */
3489 2819406 : if (StandbyMode &&
3490 2799708 : (targetPagePtr % wal_segment_size) == 0 &&
3491 2696 : !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3492 : {
3493 : /*
3494 : * Emit this error right now then retry this page immediately. Use
3495 : * errmsg_internal() because the message was already translated.
3496 : */
3497 10 : if (xlogreader->errormsg_buf[0])
3498 10 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3499 : (errmsg_internal("%s", xlogreader->errormsg_buf)));
3500 :
3501 : /* reset any error XLogReaderValidatePageHeader() might have set */
3502 10 : XLogReaderResetError(xlogreader);
3503 10 : goto next_record_is_invalid;
3504 : }
3505 :
3506 2819396 : return readLen;
3507 :
3508 10 : next_record_is_invalid:
3509 :
3510 : /*
3511 : * If we're reading ahead, give up fast. Retries and error reporting will
3512 : * be handled by a later read when recovery catches up to this point.
3513 : */
3514 10 : if (xlogreader->nonblocking)
3515 2 : return XLREAD_WOULDBLOCK;
3516 :
3517 8 : lastSourceFailed = true;
3518 :
3519 8 : if (readFile >= 0)
3520 8 : close(readFile);
3521 8 : readFile = -1;
3522 8 : readLen = 0;
3523 8 : readSource = XLOG_FROM_ANY;
3524 :
3525 : /* In standby-mode, keep trying */
3526 8 : if (StandbyMode)
3527 8 : goto retry;
3528 : else
3529 0 : return XLREAD_FAIL;
3530 : }
3531 :
/*
 * Open the WAL segment containing WAL location 'RecPtr'.
 *
 * The segment can be fetched via restore_command, or via walreceiver having
 * streamed the record, or it can already be present in pg_wal.  Checking
 * pg_wal is mainly for crash recovery, but it will be polled in standby mode
 * too, in case someone copies a new segment directly to pg_wal.  That is not
 * documented or recommended, though.
 *
 * If 'randAccess' is true, curFileTLI is reset to 0 before each attempt to
 * read the segment from the archive or pg_wal.
 *
 * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
 * prepare to read WAL starting from RedoStartLSN after this.
 *
 * 'RecPtr' might not point to the beginning of the record we're interested
 * in, it might also point to the page or segment header.  In that case,
 * 'tliRecPtr' is the position of the WAL record we're interested in.  It is
 * used to decide which timeline to stream the requested WAL from.
 *
 * 'replayTLI' and 'replayLSN' are the current replay timeline and replay
 * LSN.  They are handed to rescanLatestTimeLine() so that, if we scan for
 * new timelines, we can reject a switch to a timeline that branched off
 * before this point.
 *
 * If the record is not immediately available, the function returns
 * XLREAD_FAIL if we're not in standby mode.  In standby mode, waits for it
 * to become available.
 *
 * When the requested record becomes available, the function opens the file
 * containing it (if not open already), and returns XLREAD_SUCCESS.  When end
 * of standby mode is triggered by the user, and there is no more WAL
 * available, returns XLREAD_FAIL.
 *
 * If nonblocking is true, then give up immediately if we can't satisfy the
 * request, returning XLREAD_WOULDBLOCK instead of waiting.
 */
static XLogPageReadResult
WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
							bool fetching_ckpt, XLogRecPtr tliRecPtr,
							TimeLineID replayTLI, XLogRecPtr replayLSN,
							bool nonblocking)
{
	/* static: remembered across calls to rate-limit the retry cycle */
	static TimestampTz last_fail_time = 0;
	TimestampTz now;

	/* ensures WalRcvForceReply() is issued at most once per call */
	bool		streaming_reply_sent = false;

	/*-------
	 * Standby mode is implemented by a state machine:
	 *
	 * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
	 *	  pg_wal (XLOG_FROM_PG_WAL)
	 * 2. Check for promotion trigger request
	 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
	 * 4. Rescan timelines
	 * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
	 *
	 * Failure to read from the current source advances the state machine to
	 * the next state.
	 *
	 * 'currentSource' indicates the current state. There are no currentSource
	 * values for "check trigger", "rescan timelines", and "sleep" states,
	 * those actions are taken when reading from the previous source fails, as
	 * part of advancing to the next state.
	 *
	 * If standby mode is turned off while reading WAL from stream, we move
	 * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
	 * the files (which would be required at end of recovery, e.g., timeline
	 * history file) from archive or pg_wal. We don't need to kill WAL receiver
	 * here because it's already stopped when standby mode is turned off at
	 * the end of recovery.
	 *-------
	 */
	if (!InArchiveRecovery)
		currentSource = XLOG_FROM_PG_WAL;
	else if (currentSource == XLOG_FROM_ANY ||
			 (!StandbyMode && currentSource == XLOG_FROM_STREAM))
	{
		lastSourceFailed = false;
		currentSource = XLOG_FROM_ARCHIVE;
	}

	for (;;)
	{
		/* remembered only so that a source switch can be logged below */
		XLogSource	oldSource = currentSource;

		/* set when this iteration decides a walreceiver must be launched */
		bool		startWalReceiver = false;

		/*
		 * First check if we failed to read from the current source, and
		 * advance the state machine if so. The failure to read might've
		 * happened outside this function, e.g when a CRC check fails on a
		 * record, or within this loop.
		 */
		if (lastSourceFailed)
		{
			/*
			 * Don't allow any retry loops to occur during nonblocking
			 * readahead.  Let the caller process everything that has been
			 * decoded already first.
			 */
			if (nonblocking)
				return XLREAD_WOULDBLOCK;

			switch (currentSource)
			{
				case XLOG_FROM_ARCHIVE:
				case XLOG_FROM_PG_WAL:

					/*
					 * Check to see if promotion is requested. Note that we do
					 * this only after failure, so when you promote, we still
					 * finish replaying as much as we can from archive and
					 * pg_wal before failover.
					 */
					if (StandbyMode && CheckForStandbyTrigger())
					{
						XLogShutdownWalRcv();
						return XLREAD_FAIL;
					}

					/*
					 * Not in standby mode, and we've now tried the archive
					 * and pg_wal.
					 */
					if (!StandbyMode)
						return XLREAD_FAIL;

					/*
					 * Move to XLOG_FROM_STREAM state, and set to start a
					 * walreceiver if necessary.
					 */
					currentSource = XLOG_FROM_STREAM;
					startWalReceiver = true;
					break;

				case XLOG_FROM_STREAM:

					/*
					 * Failure while streaming. Most likely, we got here
					 * because streaming replication was terminated, or
					 * promotion was triggered. But we also get here if we
					 * find an invalid record in the WAL streamed from the
					 * primary, in which case something is seriously wrong.
					 * There's little chance that the problem will just go
					 * away, but PANIC is not good for availability either,
					 * especially in hot standby mode. So, we treat that the
					 * same as disconnection, and retry from archive/pg_wal
					 * again. The WAL in the archive should be identical to
					 * what was streamed, so it's unlikely that it helps, but
					 * one can hope...
					 */

					/*
					 * We should be able to move to XLOG_FROM_STREAM only in
					 * standby mode.
					 */
					Assert(StandbyMode);

					/*
					 * Before we leave XLOG_FROM_STREAM state, make sure that
					 * walreceiver is not active, so that it won't overwrite
					 * WAL that we restore from archive.
					 */
					XLogShutdownWalRcv();

					/*
					 * Before we sleep, re-scan for possible new timelines if
					 * we were requested to recover to the latest timeline.
					 * If a new target timeline was adopted, go straight back
					 * to the archive state without sleeping.
					 */
					if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
					{
						if (rescanLatestTimeLine(replayTLI, replayLSN))
						{
							currentSource = XLOG_FROM_ARCHIVE;
							break;
						}
					}

					/*
					 * XLOG_FROM_STREAM is the last state in our state
					 * machine, so we've exhausted all the options for
					 * obtaining the requested WAL. We're going to loop back
					 * and retry from the archive, but if it hasn't been long
					 * since last attempt, sleep wal_retrieve_retry_interval
					 * milliseconds to avoid busy-waiting.
					 */
					now = GetCurrentTimestamp();
					if (!TimestampDifferenceExceeds(last_fail_time, now,
													wal_retrieve_retry_interval))
					{
						long		wait_time;

						/* sleep only for the remainder of the interval */
						wait_time = wal_retrieve_retry_interval -
							TimestampDifferenceMilliseconds(last_fail_time, now);

						elog(LOG, "waiting for WAL to become available at %X/%08X",
							 LSN_FORMAT_ARGS(RecPtr));

						/* Do background tasks that might benefit us later. */
						KnownAssignedTransactionIdsIdleMaintenance();

						(void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
										 WL_LATCH_SET | WL_TIMEOUT |
										 WL_EXIT_ON_PM_DEATH,
										 wait_time,
										 WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
						ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);

						/* re-read the clock so last_fail_time reflects the sleep */
						now = GetCurrentTimestamp();

						/* Handle interrupt signals of startup process */
						ProcessStartupProcInterrupts();
					}
					last_fail_time = now;
					currentSource = XLOG_FROM_ARCHIVE;
					break;

				default:
					elog(ERROR, "unexpected WAL source %d", currentSource);
			}
		}
		else if (currentSource == XLOG_FROM_PG_WAL)
		{
			/*
			 * We just successfully read a file in pg_wal. We prefer files in
			 * the archive over ones in pg_wal, so try the next file again
			 * from the archive first.
			 */
			if (InArchiveRecovery)
				currentSource = XLOG_FROM_ARCHIVE;
		}

		/*
		 * lastSourceFailed still holds the outcome of the previous attempt
		 * here, so it tells us why the source changed.
		 */
		if (currentSource != oldSource)
			elog(DEBUG2, "switched WAL source from %s to %s after %s",
				 xlogSourceNames[oldSource], xlogSourceNames[currentSource],
				 lastSourceFailed ? "failure" : "success");

		/*
		 * We've now handled possible failure. Try to read from the chosen
		 * source.
		 */
		lastSourceFailed = false;

		switch (currentSource)
		{
			case XLOG_FROM_ARCHIVE:
			case XLOG_FROM_PG_WAL:

				/*
				 * WAL receiver must not be running when reading WAL from
				 * archive or pg_wal.
				 */
				Assert(!WalRcvStreaming());

				/* Close any old file we might have open. */
				if (readFile >= 0)
				{
					close(readFile);
					readFile = -1;
				}
				/* Reset curFileTLI if random fetch. */
				if (randAccess)
					curFileTLI = 0;

				/*
				 * Try to restore the file from archive, or read an existing
				 * file from pg_wal.  In the XLOG_FROM_ARCHIVE state we pass
				 * XLOG_FROM_ANY, so that both locations are tried.
				 */
				readFile = XLogFileReadAnyTLI(readSegNo,
											  currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
											  currentSource);
				if (readFile >= 0)
					return XLREAD_SUCCESS;	/* success! */

				/*
				 * Nope, not found in archive or pg_wal.
				 */
				lastSourceFailed = true;
				break;

			case XLOG_FROM_STREAM:
				{
					bool		havedata;

					/*
					 * We should be able to move to XLOG_FROM_STREAM only in
					 * standby mode.
					 */
					Assert(StandbyMode);

					/*
					 * First, shutdown walreceiver if its restart has been
					 * requested -- but no point if we're already slated for
					 * starting it.
					 */
					if (pendingWalRcvRestart && !startWalReceiver)
					{
						XLogShutdownWalRcv();

						/*
						 * Re-scan for possible new timelines if we were
						 * requested to recover to the latest timeline.
						 */
						if (recoveryTargetTimeLineGoal ==
							RECOVERY_TARGET_TIMELINE_LATEST)
							rescanLatestTimeLine(replayTLI, replayLSN);

						startWalReceiver = true;
					}
					pendingWalRcvRestart = false;

					/*
					 * Launch walreceiver if needed.
					 *
					 * If fetching_ckpt is true, RecPtr points to the initial
					 * checkpoint location. In that case, we use RedoStartLSN
					 * as the streaming start position instead of RecPtr, so
					 * that when we later jump backwards to start redo at
					 * RedoStartLSN, we will have the logs streamed already.
					 *
					 * Nothing is launched when PrimaryConnInfo is unset or
					 * empty.
					 */
					if (startWalReceiver &&
						PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
					{
						XLogRecPtr	ptr;
						TimeLineID	tli;

						if (fetching_ckpt)
						{
							ptr = RedoStartLSN;
							tli = RedoStartTLI;
						}
						else
						{
							ptr = RecPtr;

							/*
							 * Use the record begin position to determine the
							 * TLI, rather than the position we're reading.
							 */
							tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);

							/* never step back to an older timeline */
							if (curFileTLI > 0 && tli < curFileTLI)
								elog(ERROR, "according to history file, WAL location %X/%08X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
									 LSN_FORMAT_ARGS(tliRecPtr),
									 tli, curFileTLI);
						}
						curFileTLI = tli;
						SetInstallXLogFileSegmentActive();
						RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
											 PrimarySlotName,
											 wal_receiver_create_temp_slot);

						/* forget the cached flush position of any earlier stream */
						flushedUpto = 0;
					}

					/*
					 * Check if WAL receiver is active or wait to start up.
					 */
					if (!WalRcvStreaming())
					{
						lastSourceFailed = true;
						break;
					}

					/*
					 * Walreceiver is active, so see if new data has arrived.
					 *
					 * We only advance XLogReceiptTime when we obtain fresh
					 * WAL from walreceiver and observe that we had already
					 * processed everything before the most recent "chunk"
					 * that it flushed to disk.  In steady state where we are
					 * keeping up with the incoming data, XLogReceiptTime will
					 * be updated on each cycle. When we are behind,
					 * XLogReceiptTime will not advance, so the grace time
					 * allotted to conflicting queries will decrease.
					 */
					if (RecPtr < flushedUpto)
						havedata = true;	/* cached position already covers it */
					else
					{
						XLogRecPtr	latestChunkStart;

						/* refresh the cached flush position from walreceiver */
						flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
						if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
						{
							havedata = true;
							if (latestChunkStart <= RecPtr)
							{
								XLogReceiptTime = GetCurrentTimestamp();
								SetCurrentChunkStartTime(XLogReceiptTime);
							}
						}
						else
							havedata = false;
					}
					if (havedata)
					{
						/*
						 * Great, streamed far enough.  Open the file if it's
						 * not open already.  Also read the timeline history
						 * file if we haven't initialized timeline history
						 * yet; it should be streamed over and present in
						 * pg_wal by now.  Use XLOG_FROM_STREAM so that source
						 * info is set correctly and XLogReceiptTime isn't
						 * changed.
						 *
						 * NB: We must set readTimeLineHistory based on
						 * recoveryTargetTLI, not receiveTLI. Normally they'll
						 * be the same, but if recovery_target_timeline is
						 * 'latest' and archiving is configured, then it's
						 * possible that we managed to retrieve one or more
						 * new timeline history files from the archive,
						 * updating recoveryTargetTLI.
						 */
						if (readFile < 0)
						{
							if (!expectedTLEs)
								expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
							readFile = XLogFileRead(readSegNo, receiveTLI,
													XLOG_FROM_STREAM, false);
							Assert(readFile >= 0);
						}
						else
						{
							/* just make sure source info is correct... */
							readSource = XLOG_FROM_STREAM;
							XLogReceiptSource = XLOG_FROM_STREAM;
							return XLREAD_SUCCESS;
						}

						/*
						 * The file was just opened: leave the switch and go
						 * around the loop again rather than returning here.
						 */
						break;
					}

					/* In nonblocking mode, return rather than sleeping. */
					if (nonblocking)
						return XLREAD_WOULDBLOCK;

					/*
					 * Data not here yet. Check for trigger, then wait for
					 * walreceiver to wake us up when new WAL arrives.
					 */
					if (CheckForStandbyTrigger())
					{
						/*
						 * Note that we don't return XLREAD_FAIL immediately
						 * here. After being triggered, we still want to
						 * replay all the WAL that was already streamed. It's
						 * in pg_wal now, so we just treat this as a failure,
						 * and the state machine will move on to replay the
						 * streamed WAL from pg_wal, and then recheck the
						 * trigger and exit replay.
						 */
						lastSourceFailed = true;
						break;
					}

					/*
					 * Since we have replayed everything we have received so
					 * far and are about to start waiting for more WAL, let's
					 * tell the upstream server our replay location now so
					 * that pg_stat_replication doesn't show stale
					 * information.
					 */
					if (!streaming_reply_sent)
					{
						WalRcvForceReply();
						streaming_reply_sent = true;
					}

					/* Do any background tasks that might benefit us later. */
					KnownAssignedTransactionIdsIdleMaintenance();

					/* Update pg_stat_recovery_prefetch before sleeping. */
					XLogPrefetcherComputeStats(xlogprefetcher);

					/*
					 * Wait for more WAL to arrive, when we will be woken
					 * immediately by the WAL receiver.  No timeout is used
					 * here (-1L).
					 */
					(void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
									 WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
									 -1L,
									 WAIT_EVENT_RECOVERY_WAL_STREAM);
					ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
					break;
				}

			default:
				elog(ERROR, "unexpected WAL source %d", currentSource);
		}

		/*
		 * Check for recovery pause here so that we can confirm more quickly
		 * that a requested pause has actually taken effect.
		 */
		if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
			RECOVERY_NOT_PAUSED)
			recoveryPausesHere(false);

		/*
		 * This possibly-long loop needs to handle interrupts of startup
		 * process.
		 */
		ProcessStartupProcInterrupts();
	}

	return XLREAD_FAIL;			/* not reached */
}
4033 :
4034 :
4035 : /*
4036 : * Determine what log level should be used to report a corrupt WAL record
4037 : * in the current WAL page, previously read by XLogPageRead().
4038 : *
4039 : * 'emode' is the error mode that would be used to report a file-not-found
4040 : * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
4041 : * we're retrying the exact same record that we've tried previously, only
4042 : * complain the first time to keep the noise down. However, we only do when
4043 : * reading from pg_wal, because we don't expect any invalid records in archive
4044 : * or in records streamed from the primary. Files in the archive should be complete,
4045 : * and we should never hit the end of WAL because we stop and wait for more WAL
4046 : * to arrive before replaying it.
4047 : *
4048 : * NOTE: This function remembers the RecPtr value it was last called with,
4049 : * to suppress repeated messages about the same record. Only call this when
4050 : * you are about to ereport(), or you might cause a later message to be
4051 : * erroneously suppressed.
4052 : */
4053 : static int
4054 508 : emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
4055 : {
4056 : static XLogRecPtr lastComplaint = 0;
4057 :
4058 508 : if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
4059 : {
4060 500 : if (RecPtr == lastComplaint)
4061 116 : emode = DEBUG1;
4062 : else
4063 384 : lastComplaint = RecPtr;
4064 : }
4065 508 : return emode;
4066 : }
4067 :
4068 :
4069 : /*
4070 : * Subroutine to try to fetch and validate a prior checkpoint record.
4071 : */
4072 : static XLogRecord *
4073 1896 : ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
4074 : TimeLineID replayTLI)
4075 : {
4076 : XLogRecord *record;
4077 : uint8 info;
4078 :
4079 : Assert(xlogreader != NULL);
4080 :
4081 1896 : if (!XRecOffIsValid(RecPtr))
4082 : {
4083 0 : ereport(LOG,
4084 : (errmsg("invalid checkpoint location")));
4085 0 : return NULL;
4086 : }
4087 :
4088 1896 : XLogPrefetcherBeginRead(xlogprefetcher, RecPtr);
4089 1896 : record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
4090 :
4091 1896 : if (record == NULL)
4092 : {
4093 0 : ereport(LOG,
4094 : (errmsg("invalid checkpoint record")));
4095 0 : return NULL;
4096 : }
4097 1896 : if (record->xl_rmid != RM_XLOG_ID)
4098 : {
4099 0 : ereport(LOG,
4100 : (errmsg("invalid resource manager ID in checkpoint record")));
4101 0 : return NULL;
4102 : }
4103 1896 : info = record->xl_info & ~XLR_INFO_MASK;
4104 1896 : if (info != XLOG_CHECKPOINT_SHUTDOWN &&
4105 : info != XLOG_CHECKPOINT_ONLINE)
4106 : {
4107 0 : ereport(LOG,
4108 : (errmsg("invalid xl_info in checkpoint record")));
4109 0 : return NULL;
4110 : }
4111 1896 : if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
4112 : {
4113 0 : ereport(LOG,
4114 : (errmsg("invalid length of checkpoint record")));
4115 0 : return NULL;
4116 : }
4117 1896 : return record;
4118 : }
4119 :
4120 : /*
4121 : * Scan for new timelines that might have appeared in the archive since we
4122 : * started recovery.
4123 : *
4124 : * If there are any, the function changes recovery target TLI to the latest
4125 : * one and returns 'true'.
4126 : */
4127 : static bool
4128 336 : rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
4129 : {
4130 : List *newExpectedTLEs;
4131 : bool found;
4132 : ListCell *cell;
4133 : TimeLineID newtarget;
4134 336 : TimeLineID oldtarget = recoveryTargetTLI;
4135 336 : TimeLineHistoryEntry *currentTle = NULL;
4136 :
4137 336 : newtarget = findNewestTimeLine(recoveryTargetTLI);
4138 336 : if (newtarget == recoveryTargetTLI)
4139 : {
4140 : /* No new timelines found */
4141 324 : return false;
4142 : }
4143 :
4144 : /*
4145 : * Determine the list of expected TLIs for the new TLI
4146 : */
4147 :
4148 12 : newExpectedTLEs = readTimeLineHistory(newtarget);
4149 :
4150 : /*
4151 : * If the current timeline is not part of the history of the new timeline,
4152 : * we cannot proceed to it.
4153 : */
4154 12 : found = false;
4155 24 : foreach(cell, newExpectedTLEs)
4156 : {
4157 24 : currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4158 :
4159 24 : if (currentTle->tli == recoveryTargetTLI)
4160 : {
4161 12 : found = true;
4162 12 : break;
4163 : }
4164 : }
4165 12 : if (!found)
4166 : {
4167 0 : ereport(LOG,
4168 : (errmsg("new timeline %u is not a child of database system timeline %u",
4169 : newtarget,
4170 : replayTLI)));
4171 0 : return false;
4172 : }
4173 :
4174 : /*
4175 : * The current timeline was found in the history file, but check that the
4176 : * next timeline was forked off from it *after* the current recovery
4177 : * location.
4178 : */
4179 12 : if (currentTle->end < replayLSN)
4180 : {
4181 0 : ereport(LOG,
4182 : errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%08X",
4183 : newtarget,
4184 : replayTLI,
4185 : LSN_FORMAT_ARGS(replayLSN)));
4186 0 : return false;
4187 : }
4188 :
4189 : /* The new timeline history seems valid. Switch target */
4190 12 : recoveryTargetTLI = newtarget;
4191 12 : list_free_deep(expectedTLEs);
4192 12 : expectedTLEs = newExpectedTLEs;
4193 :
4194 : /*
4195 : * As in StartupXLOG(), try to ensure we have all the history files
4196 : * between the old target and new target in pg_wal.
4197 : */
4198 12 : restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4199 :
4200 12 : ereport(LOG,
4201 : (errmsg("new target timeline is %u",
4202 : recoveryTargetTLI)));
4203 :
4204 12 : return true;
4205 : }
4206 :
4207 :
4208 : /*
4209 : * Open a logfile segment for reading (during recovery).
4210 : *
4211 : * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4212 : * Otherwise, it's assumed to be already available in pg_wal.
4213 : */
4214 : static int
4215 6726 : XLogFileRead(XLogSegNo segno, TimeLineID tli,
4216 : XLogSource source, bool notfoundOk)
4217 : {
4218 : char xlogfname[MAXFNAMELEN];
4219 : char activitymsg[MAXFNAMELEN + 16];
4220 : char path[MAXPGPATH];
4221 : int fd;
4222 :
4223 6726 : XLogFileName(xlogfname, tli, segno, wal_segment_size);
4224 :
4225 6726 : switch (source)
4226 : {
4227 1592 : case XLOG_FROM_ARCHIVE:
4228 : /* Report recovery progress in PS display */
4229 1592 : snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4230 : xlogfname);
4231 1592 : set_ps_display(activitymsg);
4232 :
4233 1592 : if (!RestoreArchivedFile(path, xlogfname,
4234 : "RECOVERYXLOG",
4235 : wal_segment_size,
4236 : InRedo))
4237 868 : return -1;
4238 724 : break;
4239 :
4240 5134 : case XLOG_FROM_PG_WAL:
4241 : case XLOG_FROM_STREAM:
4242 5134 : XLogFilePath(path, tli, segno, wal_segment_size);
4243 5134 : break;
4244 :
4245 0 : default:
4246 0 : elog(ERROR, "invalid XLogFileRead source %d", source);
4247 : }
4248 :
4249 : /*
4250 : * If the segment was fetched from archival storage, replace the existing
4251 : * xlog segment (if any) with the archival version.
4252 : */
4253 5858 : if (source == XLOG_FROM_ARCHIVE)
4254 : {
4255 : Assert(!IsInstallXLogFileSegmentActive());
4256 724 : KeepFileRestoredFromArchive(path, xlogfname);
4257 :
4258 : /*
4259 : * Set path to point at the new file in pg_wal.
4260 : */
4261 724 : snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4262 : }
4263 :
4264 5858 : fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4265 5858 : if (fd >= 0)
4266 : {
4267 : /* Success! */
4268 5494 : curFileTLI = tli;
4269 :
4270 : /* Report recovery progress in PS display */
4271 5494 : snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4272 : xlogfname);
4273 5494 : set_ps_display(activitymsg);
4274 :
4275 : /* Track source of data in assorted state variables */
4276 5494 : readSource = source;
4277 5494 : XLogReceiptSource = source;
4278 : /* In FROM_STREAM case, caller tracks receipt time, not me */
4279 5494 : if (source != XLOG_FROM_STREAM)
4280 3126 : XLogReceiptTime = GetCurrentTimestamp();
4281 :
4282 5494 : return fd;
4283 : }
4284 364 : if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4285 0 : ereport(PANIC,
4286 : (errcode_for_file_access(),
4287 : errmsg("could not open file \"%s\": %m", path)));
4288 364 : return -1;
4289 : }
4290 :
4291 : /*
4292 : * Open a logfile segment for reading (during recovery).
4293 : *
4294 : * This version searches for the segment with any TLI listed in expectedTLEs.
4295 : */
4296 : static int
4297 3474 : XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
4298 : {
4299 : char path[MAXPGPATH];
4300 : ListCell *cell;
4301 : int fd;
4302 : List *tles;
4303 :
4304 : /*
4305 : * Loop looking for a suitable timeline ID: we might need to read any of
4306 : * the timelines listed in expectedTLEs.
4307 : *
4308 : * We expect curFileTLI on entry to be the TLI of the preceding file in
4309 : * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4310 : * to go backwards; this prevents us from picking up the wrong file when a
4311 : * parent timeline extends to higher segment numbers than the child we
4312 : * want to read.
4313 : *
4314 : * If we haven't read the timeline history file yet, read it now, so that
4315 : * we know which TLIs to scan. We don't save the list in expectedTLEs,
4316 : * however, unless we actually find a valid segment. That way if there is
4317 : * neither a timeline history file nor a WAL segment in the archive, and
4318 : * streaming replication is set up, we'll read the timeline history file
4319 : * streamed from the primary when we start streaming, instead of
4320 : * recovering with a dummy history generated here.
4321 : */
4322 3474 : if (expectedTLEs)
4323 1578 : tles = expectedTLEs;
4324 : else
4325 1896 : tles = readTimeLineHistory(recoveryTargetTLI);
4326 :
4327 3852 : foreach(cell, tles)
4328 : {
4329 3512 : TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
4330 3512 : TimeLineID tli = hent->tli;
4331 :
4332 3512 : if (tli < curFileTLI)
4333 8 : break; /* don't bother looking at too-old TLIs */
4334 :
4335 : /*
4336 : * Skip scanning the timeline ID that the logfile segment to read
4337 : * doesn't belong to
4338 : */
4339 3504 : if (hent->begin != InvalidXLogRecPtr)
4340 : {
4341 140 : XLogSegNo beginseg = 0;
4342 :
4343 140 : XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4344 :
4345 : /*
4346 : * The logfile segment that doesn't belong to the timeline is
4347 : * older or newer than the segment that the timeline started or
4348 : * ended at, respectively. It's sufficient to check only the
4349 : * starting segment of the timeline here. Since the timelines are
4350 : * scanned in descending order in this loop, any segments newer
4351 : * than the ending segment should belong to newer timeline and
4352 : * have already been read before. So it's not necessary to check
4353 : * the ending segment of the timeline here.
4354 : */
4355 140 : if (segno < beginseg)
4356 14 : continue;
4357 : }
4358 :
4359 3490 : if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
4360 : {
4361 1592 : fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true);
4362 1592 : if (fd != -1)
4363 : {
4364 724 : elog(DEBUG1, "got WAL segment from archive");
4365 724 : if (!expectedTLEs)
4366 36 : expectedTLEs = tles;
4367 3126 : return fd;
4368 : }
4369 : }
4370 :
4371 2766 : if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
4372 : {
4373 2766 : fd = XLogFileRead(segno, tli, XLOG_FROM_PG_WAL, true);
4374 2766 : if (fd != -1)
4375 : {
4376 2402 : if (!expectedTLEs)
4377 1860 : expectedTLEs = tles;
4378 2402 : return fd;
4379 : }
4380 : }
4381 : }
4382 :
4383 : /* Couldn't find it. For simplicity, complain about front timeline */
4384 348 : XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
4385 348 : errno = ENOENT;
4386 348 : ereport(DEBUG2,
4387 : (errcode_for_file_access(),
4388 : errmsg("could not open file \"%s\": %m", path)));
4389 348 : return -1;
4390 : }
4391 :
4392 : /*
4393 : * Set flag to signal the walreceiver to restart. (The startup process calls
4394 : * this on noticing a relevant configuration change.)
4395 : */
4396 : void
4397 10 : StartupRequestWalReceiverRestart(void)
4398 : {
4399 10 : if (currentSource == XLOG_FROM_STREAM && WalRcvRunning())
4400 : {
4401 8 : ereport(LOG,
4402 : (errmsg("WAL receiver process shutdown requested")));
4403 :
4404 8 : pendingWalRcvRestart = true;
4405 : }
4406 10 : }
4407 :
4408 :
4409 : /*
4410 : * Has a standby promotion already been triggered?
4411 : *
4412 : * Unlike CheckForStandbyTrigger(), this works in any process
4413 : * that's connected to shared memory.
4414 : */
4415 : bool
4416 104 : PromoteIsTriggered(void)
4417 : {
4418 : /*
4419 : * We check shared state each time only until a standby promotion is
4420 : * triggered. We can't trigger a promotion again, so there's no need to
4421 : * keep checking after the shared variable has once been seen true.
4422 : */
4423 104 : if (LocalPromoteIsTriggered)
4424 84 : return true;
4425 :
4426 20 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4427 20 : LocalPromoteIsTriggered = XLogRecoveryCtl->SharedPromoteIsTriggered;
4428 20 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4429 :
4430 20 : return LocalPromoteIsTriggered;
4431 : }
4432 :
4433 : static void
4434 84 : SetPromoteIsTriggered(void)
4435 : {
4436 84 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4437 84 : XLogRecoveryCtl->SharedPromoteIsTriggered = true;
4438 84 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4439 :
4440 : /*
4441 : * Mark the recovery pause state as 'not paused' because the paused state
4442 : * ends and promotion continues if a promotion is triggered while recovery
4443 : * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4444 : * return 'paused' while a promotion is ongoing.
4445 : */
4446 84 : SetRecoveryPause(false);
4447 :
4448 84 : LocalPromoteIsTriggered = true;
4449 84 : }
4450 :
4451 : /*
4452 : * Check whether a promote request has arrived.
4453 : */
4454 : static bool
4455 6242 : CheckForStandbyTrigger(void)
4456 : {
4457 6242 : if (LocalPromoteIsTriggered)
4458 84 : return true;
4459 :
4460 6158 : if (IsPromoteSignaled() && CheckPromoteSignal())
4461 : {
4462 84 : ereport(LOG, (errmsg("received promote request")));
4463 84 : RemovePromoteSignalFiles();
4464 84 : ResetPromoteSignaled();
4465 84 : SetPromoteIsTriggered();
4466 84 : return true;
4467 : }
4468 :
4469 6074 : return false;
4470 : }
4471 :
4472 : /*
4473 : * Remove the files signaling a standby promotion request.
4474 : */
4475 : void
4476 1816 : RemovePromoteSignalFiles(void)
4477 : {
4478 1816 : unlink(PROMOTE_SIGNAL_FILE);
4479 1816 : }
4480 :
4481 : /*
4482 : * Check to see if a promote request has arrived.
4483 : */
4484 : bool
4485 1624 : CheckPromoteSignal(void)
4486 : {
4487 : struct stat stat_buf;
4488 :
4489 1624 : if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4490 168 : return true;
4491 :
4492 1456 : return false;
4493 : }
4494 :
4495 : /*
4496 : * Wake up startup process to replay newly arrived WAL, or to notice that
4497 : * failover has been requested.
4498 : */
4499 : void
4500 43706 : WakeupRecovery(void)
4501 : {
4502 43706 : SetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
4503 43706 : }
4504 :
4505 : /*
4506 : * Schedule a walreceiver wakeup in the main recovery loop.
4507 : */
4508 : void
4509 4 : XLogRequestWalReceiverReply(void)
4510 : {
4511 4 : doRequestWalReceiverReply = true;
4512 4 : }
4513 :
4514 : /*
4515 : * Is HotStandby active yet? This is only important in special backends
4516 : * since normal backends won't ever be able to connect until this returns
4517 : * true. Postmaster knows this by way of signal, not via shared memory.
4518 : *
4519 : * Unlike testing standbyState, this works in any process that's connected to
4520 : * shared memory. (And note that standbyState alone doesn't tell the truth
4521 : * anyway.)
4522 : */
4523 : bool
4524 312 : HotStandbyActive(void)
4525 : {
4526 : /*
4527 : * We check shared state each time only until Hot Standby is active. We
4528 : * can't de-activate Hot Standby, so there's no need to keep checking
4529 : * after the shared variable has once been seen true.
4530 : */
4531 312 : if (LocalHotStandbyActive)
4532 44 : return true;
4533 : else
4534 : {
4535 : /* spinlock is essential on machines with weak memory ordering! */
4536 268 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4537 268 : LocalHotStandbyActive = XLogRecoveryCtl->SharedHotStandbyActive;
4538 268 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4539 :
4540 268 : return LocalHotStandbyActive;
4541 : }
4542 : }
4543 :
4544 : /*
4545 : * Like HotStandbyActive(), but to be used only in WAL replay code,
4546 : * where we don't need to ask any other process what the state is.
4547 : */
4548 : static bool
4549 0 : HotStandbyActiveInReplay(void)
4550 : {
4551 : Assert(AmStartupProcess() || !IsPostmasterEnvironment);
4552 0 : return LocalHotStandbyActive;
4553 : }
4554 :
4555 : /*
4556 : * Get latest redo apply position.
4557 : *
4558 : * Exported to allow WALReceiver to read the pointer directly.
4559 : */
4560 : XLogRecPtr
4561 109956 : GetXLogReplayRecPtr(TimeLineID *replayTLI)
4562 : {
4563 : XLogRecPtr recptr;
4564 : TimeLineID tli;
4565 :
4566 109956 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4567 109956 : recptr = XLogRecoveryCtl->lastReplayedEndRecPtr;
4568 109956 : tli = XLogRecoveryCtl->lastReplayedTLI;
4569 109956 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4570 :
4571 109956 : if (replayTLI)
4572 4164 : *replayTLI = tli;
4573 109956 : return recptr;
4574 : }
4575 :
4576 :
4577 : /*
4578 : * Get position of last applied, or the record being applied.
4579 : *
4580 : * This is different from GetXLogReplayRecPtr() in that if a WAL
4581 : * record is currently being applied, this includes that record.
4582 : */
4583 : XLogRecPtr
4584 12018 : GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
4585 : {
4586 : XLogRecPtr recptr;
4587 : TimeLineID tli;
4588 :
4589 12018 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4590 12018 : recptr = XLogRecoveryCtl->replayEndRecPtr;
4591 12018 : tli = XLogRecoveryCtl->replayEndTLI;
4592 12018 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4593 :
4594 12018 : if (replayEndTLI)
4595 12018 : *replayEndTLI = tli;
4596 12018 : return recptr;
4597 : }
4598 :
4599 : /*
4600 : * Save timestamp of latest processed commit/abort record.
4601 : *
4602 : * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4603 : * seen by processes other than the startup process. Note in particular
4604 : * that CreateRestartPoint is executed in the checkpointer.
4605 : */
4606 : static void
4607 42838 : SetLatestXTime(TimestampTz xtime)
4608 : {
4609 42838 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4610 42838 : XLogRecoveryCtl->recoveryLastXTime = xtime;
4611 42838 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4612 42838 : }
4613 :
4614 : /*
4615 : * Fetch timestamp of latest processed commit/abort record.
4616 : */
4617 : TimestampTz
4618 674 : GetLatestXTime(void)
4619 : {
4620 : TimestampTz xtime;
4621 :
4622 674 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4623 674 : xtime = XLogRecoveryCtl->recoveryLastXTime;
4624 674 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4625 :
4626 674 : return xtime;
4627 : }
4628 :
4629 : /*
4630 : * Save timestamp of the next chunk of WAL records to apply.
4631 : *
4632 : * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4633 : * seen by all backends.
4634 : */
4635 : static void
4636 3976 : SetCurrentChunkStartTime(TimestampTz xtime)
4637 : {
4638 3976 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4639 3976 : XLogRecoveryCtl->currentChunkStartTime = xtime;
4640 3976 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4641 3976 : }
4642 :
/*
 * Fetch the start timestamp of the current chunk of WAL records, as last
 * saved by SetCurrentChunkStartTime().
 * Startup process maintains an accurate local copy in XLogReceiptTime
 */
4647 : TimestampTz
4648 560 : GetCurrentChunkReplayStartTime(void)
4649 : {
4650 : TimestampTz xtime;
4651 :
4652 560 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4653 560 : xtime = XLogRecoveryCtl->currentChunkStartTime;
4654 560 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4655 :
4656 560 : return xtime;
4657 : }
4658 :
4659 : /*
4660 : * Returns time of receipt of current chunk of XLOG data, as well as
4661 : * whether it was received from streaming replication or from archives.
4662 : */
4663 : void
4664 58 : GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4665 : {
4666 : /*
4667 : * This must be executed in the startup process, since we don't export the
4668 : * relevant state to shared memory.
4669 : */
4670 : Assert(InRecovery);
4671 :
4672 58 : *rtime = XLogReceiptTime;
4673 58 : *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4674 58 : }
4675 :
4676 : /*
4677 : * Note that text field supplied is a parameter name and does not require
4678 : * translation
4679 : */
4680 : void
4681 1220 : RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4682 : {
4683 1220 : if (currValue < minValue)
4684 : {
4685 0 : if (HotStandbyActiveInReplay())
4686 : {
4687 0 : bool warned_for_promote = false;
4688 :
4689 0 : ereport(WARNING,
4690 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4691 : errmsg("hot standby is not possible because of insufficient parameter settings"),
4692 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4693 : param_name,
4694 : currValue,
4695 : minValue)));
4696 :
4697 0 : SetRecoveryPause(true);
4698 :
4699 0 : ereport(LOG,
4700 : (errmsg("recovery has paused"),
4701 : errdetail("If recovery is unpaused, the server will shut down."),
4702 : errhint("You can then restart the server after making the necessary configuration changes.")));
4703 :
4704 0 : while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
4705 : {
4706 0 : ProcessStartupProcInterrupts();
4707 :
4708 0 : if (CheckForStandbyTrigger())
4709 : {
4710 0 : if (!warned_for_promote)
4711 0 : ereport(WARNING,
4712 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4713 : errmsg("promotion is not possible because of insufficient parameter settings"),
4714 :
4715 : /*
4716 : * Repeat the detail from above so it's easy to find
4717 : * in the log.
4718 : */
4719 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4720 : param_name,
4721 : currValue,
4722 : minValue),
4723 : errhint("Restart the server after making the necessary configuration changes.")));
4724 0 : warned_for_promote = true;
4725 : }
4726 :
4727 : /*
4728 : * If recovery pause is requested then set it paused. While
4729 : * we are in the loop, user might resume and pause again so
4730 : * set this every time.
4731 : */
4732 0 : ConfirmRecoveryPaused();
4733 :
4734 : /*
4735 : * We wait on a condition variable that will wake us as soon
4736 : * as the pause ends, but we use a timeout so we can check the
4737 : * above conditions periodically too.
4738 : */
4739 0 : ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
4740 : WAIT_EVENT_RECOVERY_PAUSE);
4741 : }
4742 0 : ConditionVariableCancelSleep();
4743 : }
4744 :
4745 0 : ereport(FATAL,
4746 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4747 : errmsg("recovery aborted because of insufficient parameter settings"),
4748 : /* Repeat the detail from above so it's easy to find in the log. */
4749 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4750 : param_name,
4751 : currValue,
4752 : minValue),
4753 : errhint("You can restart the server after making the necessary configuration changes.")));
4754 : }
4755 1220 : }
4756 :
4757 :
4758 : /*
4759 : * GUC check_hook for primary_slot_name
4760 : */
4761 : bool
4762 2648 : check_primary_slot_name(char **newval, void **extra, GucSource source)
4763 : {
4764 2648 : if (*newval && strcmp(*newval, "") != 0 &&
4765 390 : !ReplicationSlotValidateName(*newval, false, WARNING))
4766 0 : return false;
4767 :
4768 2648 : return true;
4769 : }
4770 :
4771 : /*
4772 : * Recovery target settings: Only one of the several recovery_target* settings
4773 : * may be set. Setting a second one results in an error. The global variable
4774 : * recoveryTarget tracks which kind of recovery target was chosen. Other
4775 : * variables store the actual target value (for example a string or a xid).
4776 : * The assign functions of the parameters check whether a competing parameter
4777 : * was already set. But we want to allow setting the same parameter multiple
4778 : * times. We also want to allow unsetting a parameter and setting a different
4779 : * one, so we unset recoveryTarget when the parameter is set to an empty
4780 : * string.
4781 : *
4782 : * XXX this code is broken by design. Throwing an error from a GUC assign
4783 : * hook breaks fundamental assumptions of guc.c. So long as all the variables
4784 : * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4785 : * since we'd just abort postmaster startup anyway. Nonetheless it's likely
4786 : * that we have odd behaviors such as unexpected GUC ordering dependencies.
4787 : */
4788 :
4789 : pg_noreturn static void
4790 2 : error_multiple_recovery_targets(void)
4791 : {
4792 2 : ereport(ERROR,
4793 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4794 : errmsg("multiple recovery targets specified"),
4795 : errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
4796 : }
4797 :
4798 : /*
4799 : * GUC check_hook for recovery_target
4800 : */
4801 : bool
4802 2260 : check_recovery_target(char **newval, void **extra, GucSource source)
4803 : {
4804 2260 : if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4805 : {
4806 0 : GUC_check_errdetail("The only allowed value is \"immediate\".");
4807 0 : return false;
4808 : }
4809 2260 : return true;
4810 : }
4811 :
4812 : /*
4813 : * GUC assign_hook for recovery_target
4814 : */
4815 : void
4816 2260 : assign_recovery_target(const char *newval, void *extra)
4817 : {
4818 2260 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4819 0 : recoveryTarget != RECOVERY_TARGET_IMMEDIATE)
4820 0 : error_multiple_recovery_targets();
4821 :
4822 2260 : if (newval && strcmp(newval, "") != 0)
4823 2 : recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
4824 : else
4825 2258 : recoveryTarget = RECOVERY_TARGET_UNSET;
4826 2260 : }
4827 :
4828 : /*
4829 : * GUC check_hook for recovery_target_lsn
4830 : */
4831 : bool
4832 2272 : check_recovery_target_lsn(char **newval, void **extra, GucSource source)
4833 : {
4834 2272 : if (strcmp(*newval, "") != 0)
4835 : {
4836 : XLogRecPtr lsn;
4837 : XLogRecPtr *myextra;
4838 18 : ErrorSaveContext escontext = {T_ErrorSaveContext};
4839 :
4840 18 : lsn = pg_lsn_in_safe(*newval, (Node *) &escontext);
4841 18 : if (escontext.error_occurred)
4842 0 : return false;
4843 :
4844 18 : myextra = (XLogRecPtr *) guc_malloc(LOG, sizeof(XLogRecPtr));
4845 18 : if (!myextra)
4846 0 : return false;
4847 18 : *myextra = lsn;
4848 18 : *extra = myextra;
4849 : }
4850 2272 : return true;
4851 : }
4852 :
4853 : /*
4854 : * GUC assign_hook for recovery_target_lsn
4855 : */
4856 : void
4857 2272 : assign_recovery_target_lsn(const char *newval, void *extra)
4858 : {
4859 2272 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4860 0 : recoveryTarget != RECOVERY_TARGET_LSN)
4861 0 : error_multiple_recovery_targets();
4862 :
4863 2272 : if (newval && strcmp(newval, "") != 0)
4864 : {
4865 18 : recoveryTarget = RECOVERY_TARGET_LSN;
4866 18 : recoveryTargetLSN = *((XLogRecPtr *) extra);
4867 : }
4868 : else
4869 2254 : recoveryTarget = RECOVERY_TARGET_UNSET;
4870 2272 : }
4871 :
4872 : /*
4873 : * GUC check_hook for recovery_target_name
4874 : */
4875 : bool
4876 2272 : check_recovery_target_name(char **newval, void **extra, GucSource source)
4877 : {
4878 : /* Use the value of newval directly */
4879 2272 : if (strlen(*newval) >= MAXFNAMELEN)
4880 : {
4881 0 : GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
4882 : "recovery_target_name", MAXFNAMELEN - 1);
4883 0 : return false;
4884 : }
4885 2272 : return true;
4886 : }
4887 :
4888 : /*
4889 : * GUC assign_hook for recovery_target_name
4890 : */
4891 : void
4892 2272 : assign_recovery_target_name(const char *newval, void *extra)
4893 : {
4894 2272 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4895 0 : recoveryTarget != RECOVERY_TARGET_NAME)
4896 0 : error_multiple_recovery_targets();
4897 :
4898 2272 : if (newval && strcmp(newval, "") != 0)
4899 : {
4900 12 : recoveryTarget = RECOVERY_TARGET_NAME;
4901 12 : recoveryTargetName = newval;
4902 : }
4903 : else
4904 2260 : recoveryTarget = RECOVERY_TARGET_UNSET;
4905 2272 : }
4906 :
4907 : /*
4908 : * GUC check_hook for recovery_target_time
4909 : *
4910 : * The interpretation of the recovery_target_time string can depend on the
4911 : * time zone setting, so we need to wait until after all GUC processing is
4912 : * done before we can do the final parsing of the string. This check function
4913 : * only does a parsing pass to catch syntax errors, but we store the string
4914 : * and parse it again when we need to use it.
4915 : */
4916 : bool
4917 2264 : check_recovery_target_time(char **newval, void **extra, GucSource source)
4918 : {
4919 2264 : if (strcmp(*newval, "") != 0)
4920 : {
4921 : /* reject some special values */
4922 6 : if (strcmp(*newval, "now") == 0 ||
4923 6 : strcmp(*newval, "today") == 0 ||
4924 6 : strcmp(*newval, "tomorrow") == 0 ||
4925 6 : strcmp(*newval, "yesterday") == 0)
4926 : {
4927 0 : return false;
4928 : }
4929 :
4930 : /*
4931 : * parse timestamp value (see also timestamptz_in())
4932 : */
4933 : {
4934 6 : char *str = *newval;
4935 : fsec_t fsec;
4936 : struct pg_tm tt,
4937 6 : *tm = &tt;
4938 : int tz;
4939 : int dtype;
4940 : int nf;
4941 : int dterr;
4942 : char *field[MAXDATEFIELDS];
4943 : int ftype[MAXDATEFIELDS];
4944 : char workbuf[MAXDATELEN + MAXDATEFIELDS];
4945 : DateTimeErrorExtra dtextra;
4946 : TimestampTz timestamp;
4947 :
4948 6 : dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
4949 : field, ftype, MAXDATEFIELDS, &nf);
4950 6 : if (dterr == 0)
4951 6 : dterr = DecodeDateTime(field, ftype, nf,
4952 : &dtype, tm, &fsec, &tz, &dtextra);
4953 6 : if (dterr != 0)
4954 0 : return false;
4955 6 : if (dtype != DTK_DATE)
4956 0 : return false;
4957 :
4958 6 : if (tm2timestamp(tm, fsec, &tz, ×tamp) != 0)
4959 : {
4960 0 : GUC_check_errdetail("Timestamp out of range: \"%s\".", str);
4961 0 : return false;
4962 : }
4963 : }
4964 : }
4965 2264 : return true;
4966 : }
4967 :
4968 : /*
4969 : * GUC assign_hook for recovery_target_time
4970 : */
4971 : void
4972 2264 : assign_recovery_target_time(const char *newval, void *extra)
4973 : {
4974 2264 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4975 2 : recoveryTarget != RECOVERY_TARGET_TIME)
4976 2 : error_multiple_recovery_targets();
4977 :
4978 2262 : if (newval && strcmp(newval, "") != 0)
4979 4 : recoveryTarget = RECOVERY_TARGET_TIME;
4980 : else
4981 2258 : recoveryTarget = RECOVERY_TARGET_UNSET;
4982 2262 : }
4983 :
4984 : /*
4985 : * GUC check_hook for recovery_target_timeline
4986 : */
4987 : bool
4988 2266 : check_recovery_target_timeline(char **newval, void **extra, GucSource source)
4989 : {
4990 : RecoveryTargetTimeLineGoal rttg;
4991 : RecoveryTargetTimeLineGoal *myextra;
4992 :
4993 2266 : if (strcmp(*newval, "current") == 0)
4994 0 : rttg = RECOVERY_TARGET_TIMELINE_CONTROLFILE;
4995 2266 : else if (strcmp(*newval, "latest") == 0)
4996 2260 : rttg = RECOVERY_TARGET_TIMELINE_LATEST;
4997 : else
4998 : {
4999 : char *endp;
5000 : uint64 timeline;
5001 :
5002 6 : rttg = RECOVERY_TARGET_TIMELINE_NUMERIC;
5003 :
5004 6 : errno = 0;
5005 6 : timeline = strtou64(*newval, &endp, 0);
5006 :
5007 6 : if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
5008 : {
5009 2 : GUC_check_errdetail("\"%s\" is not a valid number.",
5010 : "recovery_target_timeline");
5011 6 : return false;
5012 : }
5013 :
5014 4 : if (timeline < 1 || timeline > PG_UINT32_MAX)
5015 : {
5016 4 : GUC_check_errdetail("\"%s\" must be between %u and %u.",
5017 : "recovery_target_timeline", 1, UINT_MAX);
5018 4 : return false;
5019 : }
5020 : }
5021 :
5022 2260 : myextra = (RecoveryTargetTimeLineGoal *) guc_malloc(LOG, sizeof(RecoveryTargetTimeLineGoal));
5023 2260 : if (!myextra)
5024 0 : return false;
5025 2260 : *myextra = rttg;
5026 2260 : *extra = myextra;
5027 :
5028 2260 : return true;
5029 : }
5030 :
5031 : /*
5032 : * GUC assign_hook for recovery_target_timeline
5033 : */
5034 : void
5035 2260 : assign_recovery_target_timeline(const char *newval, void *extra)
5036 : {
5037 2260 : recoveryTargetTimeLineGoal = *((RecoveryTargetTimeLineGoal *) extra);
5038 2260 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
5039 0 : recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
5040 : else
5041 2260 : recoveryTargetTLIRequested = 0;
5042 2260 : }
5043 :
5044 : /*
5045 : * GUC check_hook for recovery_target_xid
5046 : */
5047 : bool
5048 2260 : check_recovery_target_xid(char **newval, void **extra, GucSource source)
5049 : {
5050 2260 : if (strcmp(*newval, "") != 0)
5051 : {
5052 : TransactionId xid;
5053 : TransactionId *myextra;
5054 :
5055 2 : errno = 0;
5056 2 : xid = (TransactionId) strtou64(*newval, NULL, 0);
5057 2 : if (errno == EINVAL || errno == ERANGE)
5058 0 : return false;
5059 :
5060 2 : myextra = (TransactionId *) guc_malloc(LOG, sizeof(TransactionId));
5061 2 : if (!myextra)
5062 0 : return false;
5063 2 : *myextra = xid;
5064 2 : *extra = myextra;
5065 : }
5066 2260 : return true;
5067 : }
5068 :
5069 : /*
5070 : * GUC assign_hook for recovery_target_xid
5071 : */
5072 : void
5073 2260 : assign_recovery_target_xid(const char *newval, void *extra)
5074 : {
5075 2260 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
5076 0 : recoveryTarget != RECOVERY_TARGET_XID)
5077 0 : error_multiple_recovery_targets();
5078 :
5079 2260 : if (newval && strcmp(newval, "") != 0)
5080 : {
5081 2 : recoveryTarget = RECOVERY_TARGET_XID;
5082 2 : recoveryTargetXid = *((TransactionId *) extra);
5083 : }
5084 : else
5085 2258 : recoveryTarget = RECOVERY_TARGET_UNSET;
5086 2260 : }
|