Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * xlogrecovery.c
4 : * Functions for WAL recovery, standby mode
5 : *
6 : * This source file contains functions controlling WAL recovery.
7 : * InitWalRecovery() initializes the system for crash or archive recovery,
8 : * or standby mode, depending on configuration options and the state of
9 : * the control file and possible backup label file. PerformWalRecovery()
10 : * performs the actual WAL replay, calling the rmgr-specific redo routines.
11 : * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12 : * and prepares information needed to initialize the WAL for writes. In
13 : * addition to these three main functions, there are a bunch of functions
14 : * for interrogating recovery state and controlling the recovery process.
15 : *
16 : *
17 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
18 : * Portions Copyright (c) 1994, Regents of the University of California
19 : *
20 : * src/backend/access/transam/xlogrecovery.c
21 : *
22 : *-------------------------------------------------------------------------
23 : */
24 :
25 : #include "postgres.h"
26 :
27 : #include <ctype.h>
28 : #include <math.h>
29 : #include <time.h>
30 : #include <sys/stat.h>
31 : #include <sys/time.h>
32 : #include <unistd.h>
33 :
34 : #include "access/timeline.h"
35 : #include "access/transam.h"
36 : #include "access/xact.h"
37 : #include "access/xlog_internal.h"
38 : #include "access/xlogarchive.h"
39 : #include "access/xlogprefetcher.h"
40 : #include "access/xlogreader.h"
41 : #include "access/xlogrecovery.h"
42 : #include "access/xlogutils.h"
43 : #include "access/xlogwait.h"
44 : #include "backup/basebackup.h"
45 : #include "catalog/pg_control.h"
46 : #include "commands/tablespace.h"
47 : #include "common/file_utils.h"
48 : #include "miscadmin.h"
49 : #include "nodes/miscnodes.h"
50 : #include "pgstat.h"
51 : #include "postmaster/bgwriter.h"
52 : #include "postmaster/startup.h"
53 : #include "replication/slot.h"
54 : #include "replication/slotsync.h"
55 : #include "replication/walreceiver.h"
56 : #include "storage/fd.h"
57 : #include "storage/ipc.h"
58 : #include "storage/latch.h"
59 : #include "storage/pmsignal.h"
60 : #include "storage/procarray.h"
61 : #include "storage/spin.h"
62 : #include "utils/datetime.h"
63 : #include "utils/fmgrprotos.h"
64 : #include "utils/guc_hooks.h"
65 : #include "utils/pgstat_internal.h"
66 : #include "utils/pg_lsn.h"
67 : #include "utils/ps_status.h"
68 : #include "utils/pg_rusage.h"
69 :
70 : /* Unsupported old recovery command file names (relative to $PGDATA) */
71 : #define RECOVERY_COMMAND_FILE "recovery.conf"
72 : #define RECOVERY_COMMAND_DONE "recovery.done"
73 :
74 : /*
75 : * GUC support: table pairing each recovery target action name with its RECOVERY_TARGET_ACTION_* value
76 : */
77 : const struct config_enum_entry recovery_target_action_options[] = {
78 : {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
79 : {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
80 : {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
81 : {NULL, 0, false}
82 : };
83 :
84 : /* options formerly taken from recovery.conf for archive recovery */
85 : char *recoveryRestoreCommand = NULL;
86 : char *recoveryEndCommand = NULL;
87 : char *archiveCleanupCommand = NULL;
88 : RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
89 : bool recoveryTargetInclusive = true;
90 : int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
91 : TransactionId recoveryTargetXid;
92 : char *recovery_target_time_string;
93 : TimestampTz recoveryTargetTime;
94 : const char *recoveryTargetName;
95 : XLogRecPtr recoveryTargetLSN;
96 : int recovery_min_apply_delay = 0;
97 :
98 : /* options formerly taken from recovery.conf for XLOG streaming */
99 : char *PrimaryConnInfo = NULL;
100 : char *PrimarySlotName = NULL;
101 : bool wal_receiver_create_temp_slot = false;
102 :
103 : /*
104 : * recoveryTargetTimeLineGoal: what the user requested, if any
105 : *
106 : * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
107 : *
108 : * recoveryTargetTLI: the currently understood target timeline; this can change during recovery
109 : *
110 : * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
111 : * the timelines of its known parents, newest first (so recoveryTargetTLI is
112 : * always the first list member). Only these TLIs are expected to be seen in
113 : * the WAL segments we read, and indeed only these TLIs will be considered as
114 : * candidate WAL files to open at all.
115 : *
116 : * curFileTLI: the TLI appearing in the name of the current input WAL file.
117 : * (This is not necessarily the same as the timeline from which we are
118 : * replaying WAL, which StartupXLOG calls replayTLI, because we could be
119 : * scanning data that was copied from an ancestor timeline when the current
120 : * file was created.) During a sequential scan we do not allow this value
121 : * to decrease.
122 : */
123 : RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
124 : TimeLineID recoveryTargetTLIRequested = 0;
125 : TimeLineID recoveryTargetTLI = 0;
126 : static List *expectedTLEs;
127 : static TimeLineID curFileTLI;
128 :
129 : /*
130 : * When ArchiveRecoveryRequested is set, archive recovery was requested,
131 : * ie. signal files were present. When InArchiveRecovery is set, we are
132 : * currently recovering using offline XLOG archives. These variables are only
133 : * valid in the startup process.
134 : *
135 : * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
136 : * currently performing crash recovery using only XLOG files in pg_wal, but
137 : * will switch to using offline XLOG archives as soon as we reach the end of
138 : * WAL in pg_wal.
139 : */
140 : bool ArchiveRecoveryRequested = false;
141 : bool InArchiveRecovery = false;
142 :
143 : /*
144 : * When StandbyModeRequested is set, standby mode was requested, i.e.
145 : * standby.signal file was present. When StandbyMode is set, we are currently
146 : * in standby mode. These variables are only valid in the startup process.
147 : * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
148 : */
149 : static bool StandbyModeRequested = false;
150 : bool StandbyMode = false;
151 :
152 : /* was a signal file present at startup? */
153 : static bool standby_signal_file_found = false;
154 : static bool recovery_signal_file_found = false;
155 :
156 : /*
157 : * CheckPointLoc is the position of the checkpoint record that determines
158 : * where to start the replay. It comes from the backup label file or the
159 : * control file.
160 : *
161 : * RedoStartLSN is the checkpoint's REDO location, also from the backup label
162 : * file or the control file. In standby mode, XLOG streaming usually starts
163 : * from the position where an invalid record was found. But if we fail to
164 : * read even the initial checkpoint record, we use the REDO location instead
165 : * of the checkpoint location as the start position of XLOG streaming.
166 : * Otherwise we would have to jump backwards to the REDO location after
167 : * reading the checkpoint record, because the REDO record can precede the
168 : * checkpoint record.
169 : */
170 : static XLogRecPtr CheckPointLoc = InvalidXLogRecPtr;
171 : static TimeLineID CheckPointTLI = 0;
172 : static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
173 : static TimeLineID RedoStartTLI = 0;
174 :
175 : /*
176 : * Local copy of SharedHotStandbyActive variable. False actually means "not
177 : * known, need to check the shared state".
178 : */
179 : static bool LocalHotStandbyActive = false;
180 :
181 : /*
182 : * Local copy of SharedPromoteIsTriggered variable. False actually means "not
183 : * known, need to check the shared state".
184 : */
185 : static bool LocalPromoteIsTriggered = false;
186 :
187 : /* Has the recovery code requested a walreceiver wakeup? */
188 : static bool doRequestWalReceiverReply;
189 :
190 : /* XLogReader object used to parse the WAL records */
191 : static XLogReaderState *xlogreader = NULL;
192 :
193 : /* XLogPrefetcher object used to consume WAL records with read-ahead */
194 : static XLogPrefetcher *xlogprefetcher = NULL;
195 :
196 : /* Parameters passed down from ReadRecord to the XLogPageRead callback. */
197 : typedef struct XLogPageReadPrivate
198 : {
199 : int emode; /* message level given to ReadRecord (e.g. LOG); presumably used when reporting read problems -- confirm */
200 : bool fetching_ckpt; /* are we fetching a checkpoint record? */
201 : bool randAccess; /* NOTE(review): set outside this excerpt; presumably forwarded to WaitForWALToBecomeAvailable -- confirm */
202 : TimeLineID replayTLI; /* timeline given to ReadRecord */
203 : } XLogPageReadPrivate;
204 :
205 : /* flag to tell XLogPageRead that we have started replaying */
206 : static bool InRedo = false;
207 :
208 : /*
209 : * Codes indicating where we got a WAL file from during recovery, or where
210 : * to attempt to get one. Keep in step with xlogSourceNames[], which lists a name per code in this order.
211 : */
212 : typedef enum
213 : {
214 : XLOG_FROM_ANY = 0, /* request to read WAL from any source */
215 : XLOG_FROM_ARCHIVE, /* restored using restore_command */
216 : XLOG_FROM_PG_WAL, /* existing file in pg_wal */
217 : XLOG_FROM_STREAM, /* streamed from primary */
218 : } XLogSource;
219 :
220 : /* human-readable names for XLogSources, for debugging output */
221 : static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
222 :
223 : /*
224 : * readFile is -1 or a kernel FD for the log file segment that's currently
225 : * open for reading. readSegNo identifies the segment. readOff is the offset
226 : * of the page just read, readLen indicates how much of it has been read into
227 : * readBuf, and readSource indicates where we got the currently open file from.
228 : *
229 : * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
230 : * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
231 : * worthwhile, since the XLOG is not read by general-purpose sessions.
232 : */
233 : static int readFile = -1;
234 : static XLogSegNo readSegNo = 0;
235 : static uint32 readOff = 0;
236 : static uint32 readLen = 0;
237 : static XLogSource readSource = XLOG_FROM_ANY;
238 :
239 : /*
240 : * Keeps track of which source we're currently reading from. This is
241 : * different from readSource in that this is always set, even when we don't
242 : * currently have a WAL file open. If lastSourceFailed is set, our last
243 : * attempt to read from currentSource failed, and we should try another source
244 : * next.
245 : *
246 : * pendingWalRcvRestart is set when a config change occurs that requires a
247 : * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
248 : */
249 : static XLogSource currentSource = XLOG_FROM_ANY;
250 : static bool lastSourceFailed = false;
251 : static bool pendingWalRcvRestart = false;
252 :
253 : /*
254 : * These variables track when we last obtained some WAL data to process,
255 : * and where we got it from. (XLogReceiptSource is initially the same as
256 : * readSource, but readSource gets reset to zero when we don't have data
257 : * to process right now. It is also different from currentSource, which
258 : * also changes when we try to read from a source and fail, while
259 : * XLogReceiptSource tracks where we last successfully read some WAL.)
260 : */
261 : static TimestampTz XLogReceiptTime = 0;
262 : static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
263 :
264 : /* Local copy of WalRcv->flushedUpto */
265 : static XLogRecPtr flushedUpto = 0;
266 : static TimeLineID receiveTLI = 0;
267 :
268 : /*
269 : * Copy of minRecoveryPoint and backupEndPoint from the control file.
270 : *
271 : * In order to reach consistency, we must replay the WAL up to
272 : * minRecoveryPoint. If backupEndRequired is true, we must also reach
273 : * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
274 : * to backupStartPoint.
275 : *
276 : * Note: In archive recovery, after consistency has been reached, the
277 : * functions in xlog.c will start updating minRecoveryPoint in the control
278 : * file. But this copy of minRecoveryPoint variable reflects the value at the
279 : * beginning of recovery, and is *not* updated after consistency is reached.
280 : */
281 : static XLogRecPtr minRecoveryPoint;
282 : static TimeLineID minRecoveryPointTLI;
283 :
284 : static XLogRecPtr backupStartPoint;
285 : static XLogRecPtr backupEndPoint;
286 : static bool backupEndRequired = false;
287 :
288 : /*
289 : * Have we reached a consistent database state? In crash recovery, we have
290 : * to replay all the WAL, so reachedConsistency is never set. During archive
291 : * recovery, the database is consistent once minRecoveryPoint is reached.
292 : *
293 : * Consistent state means that the system is internally consistent, all
294 : * the WAL has been replayed up to a certain point, and importantly, there
295 : * is no trace of later actions on disk.
296 : *
297 : * This flag is used only by the startup process and postmaster. When
298 : * minRecoveryPoint is reached, the startup process sets it to true and
299 : * sends a PMSIGNAL_RECOVERY_CONSISTENT signal to the postmaster,
300 : * which then sets it to true upon receiving the signal.
301 : */
302 : bool reachedConsistency = false;
303 :
304 : /* Buffers dedicated to consistency checks of size BLCKSZ */
305 : static char *replay_image_masked = NULL;
306 : static char *primary_image_masked = NULL;
307 :
308 :
309 : /*
310 : * Shared-memory state for WAL recovery.
311 : */
312 : typedef struct XLogRecoveryCtlData
313 : {
314 : /*
315 : * SharedHotStandbyActive indicates if we allow hot standby queries to be
316 : * run. Protected by info_lck.
317 : */
318 : bool SharedHotStandbyActive;
319 :
320 : /*
321 : * SharedPromoteIsTriggered indicates if a standby promotion has been
322 : * triggered. Protected by info_lck.
323 : */
324 : bool SharedPromoteIsTriggered;
325 :
326 : /*
327 : * recoveryWakeupLatch is used to wake up the startup process to continue
328 : * WAL replay, if it is waiting for WAL to arrive or promotion to be
329 : * requested.
330 : *
331 : * Note that the startup process also uses another latch, its procLatch,
332 : * to wait for recovery conflict. We could get rid of recoveryWakeupLatch
333 : * and signal the startup process through its procLatch instead, which
334 : * would comport better with possible generic signal handlers using that
335 : * latch. But we should not do that, because the startup process doesn't
336 : * expect to be woken up by the walreceiver process or the SIGHUP signal
337 : * handler while it's waiting for recovery conflict. The separate latches,
338 : * recoveryWakeupLatch and procLatch, should be used for inter-process
339 : * communication for WAL replay and recovery conflict, respectively.
340 : */
341 : Latch recoveryWakeupLatch;
342 :
343 : /*
344 : * Last record successfully replayed.
345 : */
346 : XLogRecPtr lastReplayedReadRecPtr; /* start position */
347 : XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */
348 : TimeLineID lastReplayedTLI; /* timeline */
349 :
350 : /*
351 : * When we're currently replaying a record, i.e. in a redo function,
352 : * replayEndRecPtr points to the end+1 of the record being replayed,
353 : * otherwise it's equal to lastReplayedEndRecPtr.
354 : */
355 : XLogRecPtr replayEndRecPtr;
356 : TimeLineID replayEndTLI;
357 : /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
358 : TimestampTz recoveryLastXTime;
359 :
360 : /*
361 : * timestamp of when we started replaying the current chunk of WAL data,
362 : * only relevant for replication or archive recovery
363 : */
364 : TimestampTz currentChunkStartTime;
365 : /* Recovery pause state */
366 : RecoveryPauseState recoveryPauseState;
367 : ConditionVariable recoveryNotPausedCV;
368 :
369 : slock_t info_lck; /* locks shared variables shown above */
370 : } XLogRecoveryCtlData;
371 :
372 : static XLogRecoveryCtlData *XLogRecoveryCtl = NULL;
373 :
374 : /*
375 : * abortedRecPtr is the start pointer of a broken record at end of WAL when
376 : * recovery completes; missingContrecPtr is the location of the first
377 : * contrecord that went missing. See CreateOverwriteContrecordRecord for
378 : * details.
379 : */
380 : static XLogRecPtr abortedRecPtr;
381 : static XLogRecPtr missingContrecPtr;
382 :
383 : /*
384 : * If recoveryStopsBefore/After returns true, it saves information about the
385 : * stop point here.
386 : */
387 : static TransactionId recoveryStopXid;
388 : static TimestampTz recoveryStopTime;
389 : static XLogRecPtr recoveryStopLSN;
390 : static char recoveryStopName[MAXFNAMELEN];
391 : static bool recoveryStopAfter;
392 :
393 : /* prototypes for local functions */
394 : static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
395 :
396 : static void EnableStandbyMode(void);
397 : static void readRecoverySignalFile(void);
398 : static void validateRecoveryParameters(void);
399 : static bool read_backup_label(XLogRecPtr *checkPointLoc,
400 : TimeLineID *backupLabelTLI,
401 : bool *backupEndRequired, bool *backupFromStandby);
402 : static bool read_tablespace_map(List **tablespaces);
403 :
404 : static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
405 : static void CheckRecoveryConsistency(void);
406 : static void rm_redo_error_callback(void *arg);
407 : #ifdef WAL_DEBUG
408 : static void xlog_outrec(StringInfo buf, XLogReaderState *record);
409 : #endif
410 : static void xlog_block_info(StringInfo buf, XLogReaderState *record);
411 : static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
412 : TimeLineID prevTLI, TimeLineID replayTLI);
413 : static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
414 : static void verifyBackupPageConsistency(XLogReaderState *record);
415 :
416 : static bool recoveryStopsBefore(XLogReaderState *record);
417 : static bool recoveryStopsAfter(XLogReaderState *record);
418 : static char *getRecoveryStopReason(void);
419 : static void recoveryPausesHere(bool endOfRecovery);
420 : static bool recoveryApplyDelay(XLogReaderState *record);
421 : static void ConfirmRecoveryPaused(void);
422 :
423 : static XLogRecord *ReadRecord(XLogPrefetcher *xlogprefetcher,
424 : int emode, bool fetching_ckpt,
425 : TimeLineID replayTLI);
426 :
427 : static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
428 : int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
429 : static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,
430 : bool randAccess,
431 : bool fetching_ckpt,
432 : XLogRecPtr tliRecPtr,
433 : TimeLineID replayTLI,
434 : XLogRecPtr replayLSN,
435 : bool nonblocking);
436 : static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
437 : static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher,
438 : XLogRecPtr RecPtr, TimeLineID replayTLI);
439 : static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
440 : static int XLogFileRead(XLogSegNo segno, TimeLineID tli,
441 : XLogSource source, bool notfoundOk);
442 : static int XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source);
443 :
444 : static bool CheckForStandbyTrigger(void);
445 : static void SetPromoteIsTriggered(void);
446 : static bool HotStandbyActiveInReplay(void);
447 :
448 : static void SetCurrentChunkStartTime(TimestampTz xtime);
449 : static void SetLatestXTime(TimestampTz xtime);
450 :
451 : /*
452 : * Initialization of shared memory for WAL recovery
453 : */
454 : Size
455 6300 : XLogRecoveryShmemSize(void)
456 : {
457 : Size size;
458 :
459 : /* The XLogRecoveryCtl struct is all the shared memory this function accounts for */
460 6300 : size = sizeof(XLogRecoveryCtlData);
461 :
462 6300 : return size;
463 : }
464 :
465 : void
466 2200 : XLogRecoveryShmemInit(void)
467 : {
468 : bool found;
469 : /* Obtain the shared control struct; "found" is filled in by ShmemInitStruct(). */
470 2200 : XLogRecoveryCtl = (XLogRecoveryCtlData *)
471 2200 : ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
472 2200 : if (found)
473 0 : return; /* presumably the struct was already set up earlier -- leave it untouched */
474 2200 : memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
475 : /* Struct has just been zeroed; now initialize its spinlock, latch and condition variable. */
476 2200 : SpinLockInit(&XLogRecoveryCtl->info_lck);
477 2200 : InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
478 2200 : ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV);
479 : }
480 :
481 : /*
482 : * Switch the startup process into standby mode: set StandbyMode and turn
483 : * off the startup progress timeout, which is not wanted while in standby.
484 : */
485 : static void
486 210 : EnableStandbyMode(void)
487 : {
488 210 : StandbyMode = true;
489 :
490 : /*
491 : * To avoid server log bloat, we don't report recovery progress in a
492 : * standby as it will always be in recovery unless promoted. We disable
493 : * startup progress timeout in standby mode to avoid calling
494 : * startup_progress_timeout_handler() unnecessarily.
495 : */
496 210 : disable_startup_progress_timeout();
497 210 : }
498 :
499 : /*
500 : * Prepare the system for WAL recovery, if needed.
501 : *
502 : * This is called by StartupXLOG() which coordinates the server startup
503 : * sequence. This function analyzes the control file and the backup label
504 : * file, if any, and figures out whether we need to perform crash recovery or
505 : * archive recovery, and how far we need to replay the WAL to reach a
506 : * consistent state.
507 : *
508 : * This doesn't yet change the on-disk state, except for creating the symlinks
509 : * from table space map file if any, and for fetching WAL files needed to find
510 : * the checkpoint record. On entry, the caller has already read the control
511 : * file into memory, and passes it as argument. This function updates it to
512 : * reflect the recovery state, and the caller is expected to write it back to
513 : * disk after initializing other subsystems, but before calling
514 : * PerformWalRecovery().
515 : *
516 : * This initializes some global variables, such as ArchiveRecoveryRequested,
517 : * StandbyModeRequested and InRecovery.
518 : */
519 : void
520 1916 : InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
521 : bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
522 : {
523 : XLogPageReadPrivate *private;
524 : struct stat st;
525 : bool wasShutdown;
526 : XLogRecord *record;
527 : DBState dbstate_at_startup;
528 1916 : bool haveTblspcMap = false;
529 1916 : bool haveBackupLabel = false;
530 : CheckPoint checkPoint;
531 1916 : bool backupFromStandby = false;
532 :
533 1916 : dbstate_at_startup = ControlFile->state;
534 :
535 : /*
536 : * Initialize on the assumption we want to recover to the latest timeline
537 : * that's active according to pg_control.
538 : */
539 1916 : if (ControlFile->minRecoveryPointTLI >
540 1916 : ControlFile->checkPointCopy.ThisTimeLineID)
541 4 : recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
542 : else
543 1912 : recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
544 :
545 : /*
546 : * Check for signal files, and if so set up state for offline recovery
547 : */
548 1916 : readRecoverySignalFile();
549 1916 : validateRecoveryParameters();
550 :
551 : /*
552 : * Take ownership of the wakeup latch if archive recovery was requested,
553 : * as we may then need to sleep on it during recovery.
554 : */
555 1916 : if (ArchiveRecoveryRequested)
556 220 : OwnLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
557 :
558 : /*
559 : * Set the WAL reading processor now, as it will be needed when reading
560 : * the checkpoint record required (backup_label or not).
561 : */
562 1916 : private = palloc0(sizeof(XLogPageReadPrivate));
563 1916 : xlogreader =
564 1916 : XLogReaderAllocate(wal_segment_size, NULL,
565 1916 : XL_ROUTINE(.page_read = &XLogPageRead,
566 : .segment_open = NULL,
567 : .segment_close = wal_segment_close),
568 : private);
569 1916 : if (!xlogreader)
570 0 : ereport(ERROR,
571 : (errcode(ERRCODE_OUT_OF_MEMORY),
572 : errmsg("out of memory"),
573 : errdetail("Failed while allocating a WAL reading processor.")));
574 1916 : xlogreader->system_identifier = ControlFile->system_identifier;
575 :
576 : /*
577 : * Set the WAL decode buffer size. This limits how far ahead we can read
578 : * in the WAL.
579 : */
580 1916 : XLogReaderSetDecodeBuffer(xlogreader, NULL, wal_decode_buffer_size);
581 :
582 : /* Create a WAL prefetcher. */
583 1916 : xlogprefetcher = XLogPrefetcherAllocate(xlogreader);
584 :
585 : /*
586 : * Allocate two page buffers dedicated to WAL consistency checks. We do
587 : * it this way, rather than just making static arrays, for two reasons:
588 : * (1) no need to waste the storage in most instantiations of the backend;
589 : * (2) a static char array isn't guaranteed to have any particular
590 : * alignment, whereas palloc() will provide MAXALIGN'd storage.
591 : */
592 1916 : replay_image_masked = (char *) palloc(BLCKSZ);
593 1916 : primary_image_masked = (char *) palloc(BLCKSZ);
594 :
595 : /*
596 : * Read the backup_label file. We want to run this part of the recovery
597 : * process after checking for signal files and after performing validation
598 : * of the recovery parameters.
599 : */
600 1916 : if (read_backup_label(&CheckPointLoc, &CheckPointTLI, &backupEndRequired,
601 : &backupFromStandby))
602 : {
603 146 : List *tablespaces = NIL;
604 :
605 : /*
606 : * Archive recovery was requested, and thanks to the backup label
607 : * file, we know how far we need to replay to reach consistency. Enter
608 : * archive recovery directly.
609 : */
610 146 : InArchiveRecovery = true;
611 146 : if (StandbyModeRequested)
612 124 : EnableStandbyMode();
613 :
614 : /*
615 : * Omitting backup_label when creating a new replica, PITR node etc.
616 : * unfortunately is a common cause of corruption. Logging that
617 : * backup_label was used makes it a bit easier to exclude that as the
618 : * cause of observed corruption.
619 : *
620 : * Do so before we try to read the checkpoint record (which can fail),
621 : * as otherwise it can be hard to understand why a checkpoint other
622 : * than ControlFile->checkPoint is used.
623 : */
624 146 : ereport(LOG,
625 : errmsg("starting backup recovery with redo LSN %X/%08X, checkpoint LSN %X/%08X, on timeline ID %u",
626 : LSN_FORMAT_ARGS(RedoStartLSN),
627 : LSN_FORMAT_ARGS(CheckPointLoc),
628 : CheckPointTLI));
629 :
630 : /*
631 : * When a backup_label file is present, we want to roll forward from
632 : * the checkpoint it identifies, rather than using pg_control.
633 : */
634 146 : record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
635 : CheckPointTLI);
636 146 : if (record != NULL)
637 : {
638 146 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
639 146 : wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
640 146 : ereport(DEBUG1,
641 : errmsg_internal("checkpoint record is at %X/%08X",
642 : LSN_FORMAT_ARGS(CheckPointLoc)));
643 146 : InRecovery = true; /* force recovery even if SHUTDOWNED */
644 :
645 : /*
646 : * Make sure that REDO location exists. This may not be the case
647 : * if there was a crash during an online backup, which left a
648 : * backup_label around that references a WAL segment that's
649 : * already been archived.
650 : */
651 146 : if (checkPoint.redo < CheckPointLoc)
652 : {
653 146 : XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
654 146 : if (!ReadRecord(xlogprefetcher, LOG, false,
655 : checkPoint.ThisTimeLineID))
656 0 : ereport(FATAL,
657 : errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
658 : LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)),
659 : errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
660 : "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
661 : "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
662 : DataDir, DataDir, DataDir, DataDir));
663 : }
664 : }
665 : else
666 : {
667 0 : ereport(FATAL,
668 : errmsg("could not locate required checkpoint record at %X/%08X",
669 : LSN_FORMAT_ARGS(CheckPointLoc)),
670 : errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
671 : "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
672 : "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
673 : DataDir, DataDir, DataDir, DataDir));
674 : wasShutdown = false; /* keep compiler quiet */
675 : }
676 :
677 : /* Read the tablespace_map file if present and create symlinks. */
678 146 : if (read_tablespace_map(&tablespaces))
679 : {
680 : ListCell *lc;
681 :
682 8 : foreach(lc, tablespaces)
683 : {
684 4 : tablespaceinfo *ti = lfirst(lc);
685 : char *linkloc;
686 :
687 4 : linkloc = psprintf("%s/%u", PG_TBLSPC_DIR, ti->oid);
688 :
689 : /*
690 : * Remove the existing symlink, if any, and create the symlink
691 : * under PGDATA.
692 : */
693 4 : remove_tablespace_symlink(linkloc);
694 :
695 4 : if (symlink(ti->path, linkloc) < 0)
696 0 : ereport(ERROR,
697 : (errcode_for_file_access(),
698 : errmsg("could not create symbolic link \"%s\": %m",
699 : linkloc)));
700 :
701 4 : pfree(ti->path);
702 4 : pfree(ti);
703 : }
704 :
705 : /* tell the caller to delete it later */
706 4 : haveTblspcMap = true;
707 : }
708 :
709 : /* tell the caller to delete it later */
710 146 : haveBackupLabel = true;
711 : }
712 : else
713 : {
714 : /* No backup_label file has been found if we are here. */
715 :
716 : /*
717 : * If tablespace_map file is present without backup_label file, there
718 : * is no use of such file. There is no harm in retaining it, but it
719 : * is better to get rid of the map file so that we don't have any
720 : * redundant file in data directory and it will avoid any sort of
721 : * confusion. It seems prudent though to just rename the file out of
722 : * the way rather than delete it completely, also we ignore any error
723 : * that occurs in rename operation as even if map file is present
724 : * without backup_label file, it is harmless.
725 : */
726 1770 : if (stat(TABLESPACE_MAP, &st) == 0)
727 : {
728 2 : unlink(TABLESPACE_MAP_OLD);
729 2 : if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
730 2 : ereport(LOG,
731 : (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
732 : TABLESPACE_MAP, BACKUP_LABEL_FILE),
733 : errdetail("File \"%s\" was renamed to \"%s\".",
734 : TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
735 : else
736 0 : ereport(LOG,
737 : (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
738 : TABLESPACE_MAP, BACKUP_LABEL_FILE),
739 : errdetail("Could not rename file \"%s\" to \"%s\": %m.",
740 : TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
741 : }
742 :
743 : /*
744 : * It's possible that archive recovery was requested, but we don't
745 : * know how far we need to replay the WAL before we reach consistency.
746 : * This can happen for example if a base backup is taken from a
747 : * running server using an atomic filesystem snapshot, without calling
748 : * pg_backup_start/stop. Or if you just kill a running primary server
749 : * and put it into archive recovery by creating a recovery signal
750 : * file.
751 : *
752 : * Our strategy in that case is to perform crash recovery first,
753 : * replaying all the WAL present in pg_wal, and only enter archive
754 : * recovery after that.
755 : *
756 : * But usually we already know how far we need to replay the WAL (up
757 : * to minRecoveryPoint, up to backupEndPoint, or until we see an
758 : * end-of-backup record), and we can enter archive recovery directly.
759 : */
760 1770 : if (ArchiveRecoveryRequested &&
761 86 : (XLogRecPtrIsValid(ControlFile->minRecoveryPoint) ||
762 18 : ControlFile->backupEndRequired ||
763 18 : XLogRecPtrIsValid(ControlFile->backupEndPoint) ||
764 18 : ControlFile->state == DB_SHUTDOWNED))
765 : {
766 82 : InArchiveRecovery = true;
767 82 : if (StandbyModeRequested)
768 82 : EnableStandbyMode();
769 : }
770 :
771 : /*
772 : * For the same reason as when starting up with backup_label present,
773 : * emit a log message when we continue initializing from a base
774 : * backup.
775 : */
776 1770 : if (XLogRecPtrIsValid(ControlFile->backupStartPoint))
777 0 : ereport(LOG,
778 : errmsg("restarting backup recovery with redo LSN %X/%08X",
779 : LSN_FORMAT_ARGS(ControlFile->backupStartPoint)));
780 :
781 : /* Get the last valid checkpoint record. */
782 1770 : CheckPointLoc = ControlFile->checkPoint;
783 1770 : CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID;
784 1770 : RedoStartLSN = ControlFile->checkPointCopy.redo;
785 1770 : RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID;
786 1770 : record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
787 : CheckPointTLI);
788 1770 : if (record != NULL)
789 : {
790 1770 : ereport(DEBUG1,
791 : errmsg_internal("checkpoint record is at %X/%08X",
792 : LSN_FORMAT_ARGS(CheckPointLoc)));
793 : }
794 : else
795 : {
796 : /*
797 : * We used to attempt to go back to a secondary checkpoint record
798 : * here, but only when not in standby mode. We now just fail if we
799 : * can't read the last checkpoint because this allows us to
800 : * simplify processing around checkpoints.
801 : */
802 0 : ereport(PANIC,
803 : errmsg("could not locate a valid checkpoint record at %X/%08X",
804 : LSN_FORMAT_ARGS(CheckPointLoc)));
805 : }
806 1770 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
807 1770 : wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
808 : }
809 :
810 1916 : if (ArchiveRecoveryRequested)
811 : {
812 220 : if (StandbyModeRequested)
813 210 : ereport(LOG,
814 : (errmsg("entering standby mode")));
815 10 : else if (recoveryTarget == RECOVERY_TARGET_XID)
816 0 : ereport(LOG,
817 : (errmsg("starting point-in-time recovery to XID %u",
818 : recoveryTargetXid)));
819 10 : else if (recoveryTarget == RECOVERY_TARGET_TIME)
820 0 : ereport(LOG,
821 : (errmsg("starting point-in-time recovery to %s",
822 : timestamptz_to_str(recoveryTargetTime))));
823 10 : else if (recoveryTarget == RECOVERY_TARGET_NAME)
824 6 : ereport(LOG,
825 : (errmsg("starting point-in-time recovery to \"%s\"",
826 : recoveryTargetName)));
827 4 : else if (recoveryTarget == RECOVERY_TARGET_LSN)
828 0 : ereport(LOG,
829 : errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%08X\"",
830 : LSN_FORMAT_ARGS(recoveryTargetLSN)));
831 4 : else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
832 0 : ereport(LOG,
833 : (errmsg("starting point-in-time recovery to earliest consistent point")));
834 : else
835 4 : ereport(LOG,
836 : (errmsg("starting archive recovery")));
837 : }
838 :
839 : /*
840 : * If the location of the checkpoint record is not on the expected
841 : * timeline in the history of the requested timeline, we cannot proceed:
842 : * the backup is not part of the history of the requested timeline.
843 : */
844 : Assert(expectedTLEs); /* was initialized by reading checkpoint
845 : * record */
846 1916 : if (tliOfPointInHistory(CheckPointLoc, expectedTLEs) !=
847 : CheckPointTLI)
848 : {
849 : XLogRecPtr switchpoint;
850 :
851 : /*
852 : * tliSwitchPoint will throw an error if the checkpoint's timeline is
853 : * not in expectedTLEs at all.
854 : */
855 0 : switchpoint = tliSwitchPoint(CheckPointTLI, expectedTLEs, NULL);
856 0 : ereport(FATAL,
857 : (errmsg("requested timeline %u is not a child of this server's history",
858 : recoveryTargetTLI),
859 : /* translator: %s is a backup_label file or a pg_control file */
860 : errdetail("Latest checkpoint in file \"%s\" is at %X/%08X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%08X.",
861 : haveBackupLabel ? "backup_label" : "pg_control",
862 : LSN_FORMAT_ARGS(CheckPointLoc),
863 : CheckPointTLI,
864 : LSN_FORMAT_ARGS(switchpoint))));
865 : }
866 :
867 : /*
868 : * The min recovery point should be part of the requested timeline's
869 : * history, too.
870 : */
871 1916 : if (XLogRecPtrIsValid(ControlFile->minRecoveryPoint) &&
872 80 : tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
873 80 : ControlFile->minRecoveryPointTLI)
874 0 : ereport(FATAL,
875 : errmsg("requested timeline %u does not contain minimum recovery point %X/%08X on timeline %u",
876 : recoveryTargetTLI,
877 : LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
878 : ControlFile->minRecoveryPointTLI));
879 :
880 1916 : ereport(DEBUG1,
881 : errmsg_internal("redo record is at %X/%08X; shutdown %s",
882 : LSN_FORMAT_ARGS(checkPoint.redo),
883 : wasShutdown ? "true" : "false"));
884 1916 : ereport(DEBUG1,
885 : (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
886 : U64FromFullTransactionId(checkPoint.nextXid),
887 : checkPoint.nextOid)));
888 1916 : ereport(DEBUG1,
889 : (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
890 : checkPoint.nextMulti, checkPoint.nextMultiOffset)));
891 1916 : ereport(DEBUG1,
892 : (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
893 : checkPoint.oldestXid, checkPoint.oldestXidDB)));
894 1916 : ereport(DEBUG1,
895 : (errmsg_internal("oldest MultiXactId: %u, in database %u",
896 : checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
897 1916 : ereport(DEBUG1,
898 : (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
899 : checkPoint.oldestCommitTsXid,
900 : checkPoint.newestCommitTsXid)));
901 1916 : if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid)))
902 0 : ereport(PANIC,
903 : (errmsg("invalid next transaction ID")));
904 :
905 : /* sanity check */
906 1916 : if (checkPoint.redo > CheckPointLoc)
907 0 : ereport(PANIC,
908 : (errmsg("invalid redo in checkpoint record")));
909 :
910 : /*
911 : * Check whether we need to force recovery from WAL. If it appears to
912 : * have been a clean shutdown and we did not have a recovery signal file,
913 : * then assume no recovery needed.
914 : */
915 1916 : if (checkPoint.redo < CheckPointLoc)
916 : {
917 230 : if (wasShutdown)
918 0 : ereport(PANIC,
919 : (errmsg("invalid redo record in shutdown checkpoint")));
920 230 : InRecovery = true;
921 : }
922 1686 : else if (ControlFile->state != DB_SHUTDOWNED)
923 188 : InRecovery = true;
924 1498 : else if (ArchiveRecoveryRequested)
925 : {
926 : /* force recovery due to presence of recovery signal file */
927 14 : InRecovery = true;
928 : }
929 :
930 : /*
931 : * If recovery is needed, update our in-memory copy of pg_control to show
932 : * that we are recovering and to show the selected checkpoint as the place
933 : * we are starting from. We also mark pg_control with any minimum recovery
934 : * stop point obtained from a backup history file.
935 : *
936 : * We don't write the changes to disk yet, though. Only do that after
937 : * initializing various subsystems.
938 : */
939 1916 : if (InRecovery)
940 : {
941 432 : if (InArchiveRecovery)
942 : {
943 228 : ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
944 : }
945 : else
946 : {
947 204 : ereport(LOG,
948 : (errmsg("database system was not properly shut down; "
949 : "automatic recovery in progress")));
950 204 : if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
951 4 : ereport(LOG,
952 : (errmsg("crash recovery starts in timeline %u "
953 : "and has target timeline %u",
954 : ControlFile->checkPointCopy.ThisTimeLineID,
955 : recoveryTargetTLI)));
956 204 : ControlFile->state = DB_IN_CRASH_RECOVERY;
957 : }
958 432 : ControlFile->checkPoint = CheckPointLoc;
959 432 : ControlFile->checkPointCopy = checkPoint;
960 432 : if (InArchiveRecovery)
961 : {
962 : /* initialize minRecoveryPoint if not set yet */
963 228 : if (ControlFile->minRecoveryPoint < checkPoint.redo)
964 : {
965 152 : ControlFile->minRecoveryPoint = checkPoint.redo;
966 152 : ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
967 : }
968 : }
969 :
970 : /*
971 : * Set backupStartPoint if we're starting recovery from a base backup.
972 : *
973 : * Also set backupEndPoint and use minRecoveryPoint as the backup end
974 : * location if we're starting recovery from a base backup which was
975 : * taken from a standby. In this case, the database system status in
976 : * pg_control must indicate that the database was already in recovery.
977 : * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
978 : * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
979 : * before reaching this point; e.g. because restore_command or
980 : * primary_conninfo were faulty.
981 : *
982 : * Any other state indicates that the backup somehow became corrupted
983 : * and we can't sensibly continue with recovery.
984 : */
985 432 : if (haveBackupLabel)
986 : {
987 146 : ControlFile->backupStartPoint = checkPoint.redo;
988 146 : ControlFile->backupEndRequired = backupEndRequired;
989 :
990 146 : if (backupFromStandby)
991 : {
992 8 : if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
993 : dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
994 0 : ereport(FATAL,
995 : (errmsg("backup_label contains data inconsistent with control file"),
996 : errhint("This means that the backup is corrupted and you will "
997 : "have to use another backup for recovery.")));
998 8 : ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
999 : }
1000 : }
1001 : }
1002 :
1003 : /* remember these, so that we know when we have reached consistency */
1004 1916 : backupStartPoint = ControlFile->backupStartPoint;
1005 1916 : backupEndRequired = ControlFile->backupEndRequired;
1006 1916 : backupEndPoint = ControlFile->backupEndPoint;
1007 1916 : if (InArchiveRecovery)
1008 : {
1009 228 : minRecoveryPoint = ControlFile->minRecoveryPoint;
1010 228 : minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
1011 : }
1012 : else
1013 : {
1014 1688 : minRecoveryPoint = InvalidXLogRecPtr;
1015 1688 : minRecoveryPointTLI = 0;
1016 : }
1017 :
1018 : /*
1019 : * Start recovery assuming that the final record isn't lost.
1020 : */
1021 1916 : abortedRecPtr = InvalidXLogRecPtr;
1022 1916 : missingContrecPtr = InvalidXLogRecPtr;
1023 :
1024 1916 : *wasShutdown_ptr = wasShutdown;
1025 1916 : *haveBackupLabel_ptr = haveBackupLabel;
1026 1916 : *haveTblspcMap_ptr = haveTblspcMap;
1027 1916 : }
1028 :
1029 : /*
1030 : * See if there are any recovery signal files and if so, set state for
1031 : * recovery.
1032 : *
1033 : * See if there is a recovery command file (recovery.conf), and if so
1034 : * throw an ERROR since as of PG12 we no longer recognize that.
1035 : */
1036 : static void
1037 1916 : readRecoverySignalFile(void)
1038 : {
1039 : struct stat stat_buf;
1040 :
1041 1916 : if (IsBootstrapProcessingMode())
1042 1696 : return;
1043 :
1044 : /*
1045 : * Check for old recovery API file: recovery.conf
1046 : */
1047 1816 : if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
1048 0 : ereport(FATAL,
1049 : (errcode_for_file_access(),
1050 : errmsg("using recovery command file \"%s\" is not supported",
1051 : RECOVERY_COMMAND_FILE)));
1052 :
1053 : /*
1054 : * Remove unused .done file, if present. Ignore if absent.
1055 : */
1056 1816 : unlink(RECOVERY_COMMAND_DONE);
1057 :
1058 : /*
1059 : * Check for recovery signal files and if found, fsync them since they
1060 : * represent server state information. We don't sweat too much about the
1061 : * possibility of fsync failure, however.
1062 : *
1063 : * If present, standby signal file takes precedence. If neither is present
1064 : * then we won't enter archive recovery.
1065 : */
1066 1816 : if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1067 : {
1068 : int fd;
1069 :
1070 210 : fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1071 : S_IRUSR | S_IWUSR);
1072 210 : if (fd >= 0)
1073 : {
1074 210 : (void) pg_fsync(fd);
1075 210 : close(fd);
1076 : }
1077 210 : standby_signal_file_found = true;
1078 : }
1079 1606 : else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1080 : {
1081 : int fd;
1082 :
1083 10 : fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1084 : S_IRUSR | S_IWUSR);
1085 10 : if (fd >= 0)
1086 : {
1087 10 : (void) pg_fsync(fd);
1088 10 : close(fd);
1089 : }
1090 10 : recovery_signal_file_found = true;
1091 : }
1092 :
1093 1816 : StandbyModeRequested = false;
1094 1816 : ArchiveRecoveryRequested = false;
1095 1816 : if (standby_signal_file_found)
1096 : {
1097 210 : StandbyModeRequested = true;
1098 210 : ArchiveRecoveryRequested = true;
1099 : }
1100 1606 : else if (recovery_signal_file_found)
1101 : {
1102 10 : StandbyModeRequested = false;
1103 10 : ArchiveRecoveryRequested = true;
1104 : }
1105 : else
1106 1596 : return;
1107 :
1108 : /*
1109 : * We don't support standby mode in standalone backends; that requires
1110 : * other processes such as the WAL receiver to be alive.
1111 : */
1112 220 : if (StandbyModeRequested && !IsUnderPostmaster)
1113 0 : ereport(FATAL,
1114 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1115 : errmsg("standby mode is not supported by single-user servers")));
1116 : }
1117 :
1118 : static void
1119 1916 : validateRecoveryParameters(void)
1120 : {
1121 1916 : if (!ArchiveRecoveryRequested)
1122 1696 : return;
1123 :
1124 : /*
1125 : * Check for compulsory parameters
1126 : */
1127 220 : if (StandbyModeRequested)
1128 : {
1129 210 : if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1130 22 : (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1131 4 : ereport(WARNING,
1132 : (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
1133 : errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1134 : }
1135 : else
1136 : {
1137 10 : if (recoveryRestoreCommand == NULL ||
1138 10 : strcmp(recoveryRestoreCommand, "") == 0)
1139 0 : ereport(FATAL,
1140 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1141 : errmsg("must specify \"restore_command\" when standby mode is not enabled")));
1142 : }
1143 :
1144 : /*
1145 : * Override any inconsistent requests. Note that this is a change of
1146 : * behaviour in 9.5; prior to this we simply ignored a request to pause if
1147 : * hot_standby = off, which was surprising behaviour.
1148 : */
1149 220 : if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
1150 206 : !EnableHotStandby)
1151 6 : recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
1152 :
1153 : /*
1154 : * Final parsing of recovery_target_time string; see also
1155 : * check_recovery_target_time().
1156 : */
1157 220 : if (recoveryTarget == RECOVERY_TARGET_TIME)
1158 : {
1159 0 : recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
1160 : CStringGetDatum(recovery_target_time_string),
1161 : ObjectIdGetDatum(InvalidOid),
1162 : Int32GetDatum(-1)));
1163 : }
1164 :
1165 : /*
1166 : * If user specified recovery_target_timeline, validate it or compute the
1167 : * "latest" value. We can't do this until after we've gotten the restore
1168 : * command and set InArchiveRecovery, because we need to fetch timeline
1169 : * history files from the archive.
1170 : */
1171 220 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
1172 : {
1173 0 : TimeLineID rtli = recoveryTargetTLIRequested;
1174 :
1175 : /* Timeline 1 does not have a history file, all else should */
1176 0 : if (rtli != 1 && !existsTimeLineHistory(rtli))
1177 0 : ereport(FATAL,
1178 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1179 : errmsg("recovery target timeline %u does not exist",
1180 : rtli)));
1181 0 : recoveryTargetTLI = rtli;
1182 : }
1183 220 : else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
1184 : {
1185 : /* We start the "latest" search from pg_control's timeline */
1186 220 : recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
1187 : }
1188 : else
1189 : {
1190 : /*
1191 : * else we just use the recoveryTargetTLI as already read from
1192 : * ControlFile
1193 : */
1194 : Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
1195 : }
1196 : }
1197 :
1198 : /*
1199 : * read_backup_label: check to see if a backup_label file is present
1200 : *
1201 : * If we see a backup_label during recovery, we assume that we are recovering
1202 : * from a backup dump file, and we therefore roll forward from the checkpoint
1203 : * identified by the label file, NOT what pg_control says. This avoids the
1204 : * problem that pg_control might have been archived one or more checkpoints
1205 : * later than the start of the dump, and so if we rely on it as the start
1206 : * point, we will fail to restore a consistent database state.
1207 : *
1208 : * Returns true if a backup_label was found (and fills the checkpoint
1209 : * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1210 : * returns false if not. If this backup_label came from a streamed backup,
1211 : * *backupEndRequired is set to true. If this backup_label was created during
1212 : * recovery, *backupFromStandby is set to true.
1213 : *
1214 : * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1215 : * and TLI read from the backup file.
1216 : */
1217 : static bool
1218 1916 : read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1219 : bool *backupEndRequired, bool *backupFromStandby)
1220 : {
1221 : char startxlogfilename[MAXFNAMELEN];
1222 : TimeLineID tli_from_walseg,
1223 : tli_from_file;
1224 : FILE *lfp;
1225 : char ch;
1226 : char backuptype[20];
1227 : char backupfrom[20];
1228 : char backuplabel[MAXPGPATH];
1229 : char backuptime[128];
1230 : uint32 hi,
1231 : lo;
1232 :
1233 : /* suppress possible uninitialized-variable warnings */
1234 1916 : *checkPointLoc = InvalidXLogRecPtr;
1235 1916 : *backupLabelTLI = 0;
1236 1916 : *backupEndRequired = false;
1237 1916 : *backupFromStandby = false;
1238 :
1239 : /*
1240 : * See if label file is present
1241 : */
1242 1916 : lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1243 1916 : if (!lfp)
1244 : {
1245 1770 : if (errno != ENOENT)
1246 0 : ereport(FATAL,
1247 : (errcode_for_file_access(),
1248 : errmsg("could not read file \"%s\": %m",
1249 : BACKUP_LABEL_FILE)));
1250 1770 : return false; /* it's not there, all is fine */
1251 : }
1252 :
1253 : /*
1254 : * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1255 : * is pretty crude, but we are not expecting any variability in the file
1256 : * format).
1257 : */
1258 146 : if (fscanf(lfp, "START WAL LOCATION: %X/%08X (file %08X%16s)%c",
1259 146 : &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1260 0 : ereport(FATAL,
1261 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1262 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1263 146 : RedoStartLSN = ((uint64) hi) << 32 | lo;
1264 146 : RedoStartTLI = tli_from_walseg;
1265 146 : if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%08X%c",
1266 146 : &hi, &lo, &ch) != 3 || ch != '\n')
1267 0 : ereport(FATAL,
1268 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1269 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1270 146 : *checkPointLoc = ((uint64) hi) << 32 | lo;
1271 146 : *backupLabelTLI = tli_from_walseg;
1272 :
1273 : /*
1274 : * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1275 : * which could mean either pg_basebackup or the pg_backup_start/stop
1276 : * method was used) or if this label came from somewhere else (the only
1277 : * other option today being from pg_rewind). If this was a streamed
1278 : * backup then we know that we need to play through until we get to the
1279 : * end of the WAL which was generated during the backup (at which point we
1280 : * will have reached consistency and backupEndRequired will be reset to be
1281 : * false).
1282 : */
1283 146 : if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1284 : {
1285 146 : if (strcmp(backuptype, "streamed") == 0)
1286 144 : *backupEndRequired = true;
1287 : }
1288 :
1289 : /*
1290 : * BACKUP FROM lets us know if this was from a primary or a standby. If
1291 : * it was from a standby, we'll double-check that the control file state
1292 : * matches that of a standby.
1293 : */
1294 146 : if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1295 : {
1296 146 : if (strcmp(backupfrom, "standby") == 0)
1297 8 : *backupFromStandby = true;
1298 : }
1299 :
1300 : /*
1301 : * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1302 : * but checking for their presence is useful for debugging and the next
1303 : * sanity checks. Cope also with the fact that the result buffers have a
1304 : * pre-allocated size, hence if the backup_label file has been generated
1305 : * with strings longer than the maximum assumed here an incorrect parsing
1306 : * happens. That's fine as only minor consistency checks are done
1307 : * afterwards.
1308 : */
1309 146 : if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1310 146 : ereport(DEBUG1,
1311 : (errmsg_internal("backup time %s in file \"%s\"",
1312 : backuptime, BACKUP_LABEL_FILE)));
1313 :
1314 146 : if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1315 144 : ereport(DEBUG1,
1316 : (errmsg_internal("backup label %s in file \"%s\"",
1317 : backuplabel, BACKUP_LABEL_FILE)));
1318 :
1319 : /*
1320 : * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1321 : * it as a sanity check if present.
1322 : */
1323 146 : if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1324 : {
1325 144 : if (tli_from_walseg != tli_from_file)
1326 0 : ereport(FATAL,
1327 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1328 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1329 : errdetail("Timeline ID parsed is %u, but expected %u.",
1330 : tli_from_file, tli_from_walseg)));
1331 :
1332 144 : ereport(DEBUG1,
1333 : (errmsg_internal("backup timeline %u in file \"%s\"",
1334 : tli_from_file, BACKUP_LABEL_FILE)));
1335 : }
1336 :
1337 146 : if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%08X\n", &hi, &lo) > 0)
1338 0 : ereport(FATAL,
1339 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1340 : errmsg("this is an incremental backup, not a data directory"),
1341 : errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1342 :
1343 146 : if (ferror(lfp) || FreeFile(lfp))
1344 0 : ereport(FATAL,
1345 : (errcode_for_file_access(),
1346 : errmsg("could not read file \"%s\": %m",
1347 : BACKUP_LABEL_FILE)));
1348 :
1349 146 : return true;
1350 : }
1351 :
1352 : /*
1353 : * read_tablespace_map: check to see if a tablespace_map file is present
1354 : *
1355 : * If we see a tablespace_map file during recovery, we assume that we are
1356 : * recovering from a backup dump file, and we therefore need to create symlinks
1357 : * as per the information present in tablespace_map file.
1358 : *
1359 : * Returns true if a tablespace_map file was found (and fills *tablespaces
1360 : * with a tablespaceinfo struct for each tablespace listed in the file);
1361 : * returns false if not.
1362 : */
1363 : static bool
1364 146 : read_tablespace_map(List **tablespaces)
1365 : {
1366 : tablespaceinfo *ti;
1367 : FILE *lfp;
1368 : char str[MAXPGPATH];
1369 : int ch,
1370 : i,
1371 : n;
1372 : bool was_backslash;
1373 :
1374 : /*
1375 : * See if tablespace_map file is present
1376 : */
1377 146 : lfp = AllocateFile(TABLESPACE_MAP, "r");
1378 146 : if (!lfp)
1379 : {
1380 142 : if (errno != ENOENT)
1381 0 : ereport(FATAL,
1382 : (errcode_for_file_access(),
1383 : errmsg("could not read file \"%s\": %m",
1384 : TABLESPACE_MAP)));
1385 142 : return false; /* it's not there, all is fine */
1386 : }
1387 :
1388 : /*
1389 : * Read and parse the link name and path lines from tablespace_map file
1390 : * (this code is pretty crude, but we are not expecting any variability in
1391 : * the file format). De-escape any backslashes that were inserted.
1392 : */
1393 4 : i = 0;
1394 4 : was_backslash = false;
1395 154 : while ((ch = fgetc(lfp)) != EOF)
1396 : {
1397 150 : if (!was_backslash && (ch == '\n' || ch == '\r'))
1398 4 : {
1399 : char *endp;
1400 :
1401 4 : if (i == 0)
1402 0 : continue; /* \r immediately followed by \n */
1403 :
1404 : /*
1405 : * The de-escaped line should contain an OID followed by exactly
1406 : * one space followed by a path. The path might start with
1407 : * spaces, so don't be too liberal about parsing.
1408 : */
1409 4 : str[i] = '\0';
1410 4 : n = 0;
1411 24 : while (str[n] && str[n] != ' ')
1412 20 : n++;
1413 4 : if (n < 1 || n >= i - 1)
1414 0 : ereport(FATAL,
1415 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1416 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1417 4 : str[n++] = '\0';
1418 :
1419 4 : ti = palloc0(sizeof(tablespaceinfo));
1420 4 : errno = 0;
1421 4 : ti->oid = strtoul(str, &endp, 10);
1422 4 : if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
1423 0 : ereport(FATAL,
1424 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1425 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1426 4 : ti->path = pstrdup(str + n);
1427 4 : *tablespaces = lappend(*tablespaces, ti);
1428 :
1429 4 : i = 0;
1430 4 : continue;
1431 : }
1432 146 : else if (!was_backslash && ch == '\\')
1433 0 : was_backslash = true;
1434 : else
1435 : {
1436 146 : if (i < sizeof(str) - 1)
1437 146 : str[i++] = ch;
1438 146 : was_backslash = false;
1439 : }
1440 : }
1441 :
1442 4 : if (i != 0 || was_backslash) /* last line not terminated? */
1443 0 : ereport(FATAL,
1444 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1445 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1446 :
1447 4 : if (ferror(lfp) || FreeFile(lfp))
1448 0 : ereport(FATAL,
1449 : (errcode_for_file_access(),
1450 : errmsg("could not read file \"%s\": %m",
1451 : TABLESPACE_MAP)));
1452 :
1453 4 : return true;
1454 : }
1455 :
1456 : /*
1457 : * Finish WAL recovery.
1458 : *
1459 : * This does not close the 'xlogreader' yet, because in some cases the caller
1460 : * still wants to re-read the last checkpoint record by calling
1461 : * ReadCheckpointRecord().
1462 : *
1463 : * Returns the position of the last valid or applied record, after which new
1464 : * WAL should be appended, information about why recovery was ended, and some
1465 : * other things. See the EndOfWalRecoveryInfo struct for details.
1466 : */
1467 : EndOfWalRecoveryInfo *
1468 1798 : FinishWalRecovery(void)
1469 : {
1470 1798 : EndOfWalRecoveryInfo *result = palloc(sizeof(EndOfWalRecoveryInfo));
1471 : XLogRecPtr lastRec;
1472 : TimeLineID lastRecTLI;
1473 : XLogRecPtr endOfLog;
1474 :
1475 : /*
1476 : * Kill WAL receiver, if it's still running, before we continue to write
1477 : * the startup checkpoint and aborted-contrecord records. It will trump
1478 : * over these records and subsequent ones if it's still alive when we
1479 : * start writing WAL.
1480 : */
1481 1798 : XLogShutdownWalRcv();
1482 :
1483 : /*
1484 : * Shutdown the slot sync worker to drop any temporary slots acquired by
1485 : * it and to prevent it from keep trying to fetch the failover slots.
1486 : *
1487 : * We do not update the 'synced' column in 'pg_replication_slots' system
1488 : * view from true to false here, as any failed update could leave 'synced'
1489 : * column false for some slots. This could cause issues during slot sync
1490 : * after restarting the server as a standby. While updating the 'synced'
1491 : * column after switching to the new timeline is an option, it does not
1492 : * simplify the handling for the 'synced' column. Therefore, we retain the
1493 : * 'synced' column as true after promotion as it may provide useful
1494 : * information about the slot origin.
1495 : */
1496 1798 : ShutDownSlotSync();
1497 :
1498 : /*
1499 : * We are now done reading the xlog from stream. Turn off streaming
1500 : * recovery to force fetching the files (which would be required at end of
1501 : * recovery, e.g., timeline history file) from archive or pg_wal.
1502 : *
1503 : * Note that standby mode must be turned off after killing WAL receiver,
1504 : * i.e., calling XLogShutdownWalRcv().
1505 : */
1506 : Assert(!WalRcvStreaming());
1507 1798 : StandbyMode = false;
1508 :
1509 : /*
1510 : * Determine where to start writing WAL next.
1511 : *
1512 : * Re-fetch the last valid or last applied record, so we can identify the
1513 : * exact endpoint of what we consider the valid portion of WAL. There may
1514 : * be an incomplete continuation record after that, in which case
1515 : * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1516 : * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1517 : * it is intentionally missing. See CreateOverwriteContrecordRecord().
1518 : *
1519 : * An important side-effect of this is to load the last page into
1520 : * xlogreader. The caller uses it to initialize the WAL for writing.
1521 : */
1522 1798 : if (!InRecovery)
1523 : {
1524 1484 : lastRec = CheckPointLoc;
1525 1484 : lastRecTLI = CheckPointTLI;
1526 : }
1527 : else
1528 : {
1529 314 : lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr;
1530 314 : lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1531 : }
1532 1798 : XLogPrefetcherBeginRead(xlogprefetcher, lastRec);
1533 1798 : (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1534 1798 : endOfLog = xlogreader->EndRecPtr;
1535 :
1536 : /*
1537 : * Remember the TLI in the filename of the XLOG segment containing the
1538 : * end-of-log. It could be different from the timeline that endOfLog
1539 : * nominally belongs to, if there was a timeline switch in that segment,
1540 : * and we were reading the old WAL from a segment belonging to a higher
1541 : * timeline.
1542 : */
1543 1798 : result->endOfLogTLI = xlogreader->seg.ws_tli;
1544 :
1545 1798 : if (ArchiveRecoveryRequested)
1546 : {
1547 : /*
1548 : * We are no longer in archive recovery state.
1549 : *
1550 : * We are now done reading the old WAL. Turn off archive fetching if
1551 : * it was active.
1552 : */
1553 : Assert(InArchiveRecovery);
1554 102 : InArchiveRecovery = false;
1555 :
1556 : /*
1557 : * If the ending log segment is still open, close it (to avoid
1558 : * problems on Windows with trying to rename or delete an open file).
1559 : */
1560 102 : if (readFile >= 0)
1561 : {
1562 102 : close(readFile);
1563 102 : readFile = -1;
1564 : }
1565 : }
1566 :
1567 : /*
1568 : * Copy the last partial block to the caller, for initializing the WAL
1569 : * buffer for appending new WAL.
1570 : */
1571 1798 : if (endOfLog % XLOG_BLCKSZ != 0)
1572 : {
1573 : char *page;
1574 : int len;
1575 : XLogRecPtr pageBeginPtr;
1576 :
1577 1760 : pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1578 : Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
1579 :
1580 : /* Copy the valid part of the last block */
1581 1760 : len = endOfLog % XLOG_BLCKSZ;
1582 1760 : page = palloc(len);
1583 1760 : memcpy(page, xlogreader->readBuf, len);
1584 :
1585 1760 : result->lastPageBeginPtr = pageBeginPtr;
1586 1760 : result->lastPage = page;
1587 : }
1588 : else
1589 : {
1590 : /* There is no partial block to copy. */
1591 38 : result->lastPageBeginPtr = endOfLog;
1592 38 : result->lastPage = NULL;
1593 : }
1594 :
1595 : /*
1596 : * Create a comment for the history file to explain why and where timeline
1597 : * changed.
1598 : */
1599 1798 : result->recoveryStopReason = getRecoveryStopReason();
1600 :
1601 1798 : result->lastRec = lastRec;
1602 1798 : result->lastRecTLI = lastRecTLI;
1603 1798 : result->endOfLog = endOfLog;
1604 :
1605 1798 : result->abortedRecPtr = abortedRecPtr;
1606 1798 : result->missingContrecPtr = missingContrecPtr;
1607 :
1608 1798 : result->standby_signal_file_found = standby_signal_file_found;
1609 1798 : result->recovery_signal_file_found = recovery_signal_file_found;
1610 :
1611 1798 : return result;
1612 : }
1613 :
1614 : /*
1615 : * Clean up the WAL reader and leftovers from restoring WAL from archive.
       *
       * In order, this function: computes the final prefetcher statistics,
       * closes the WAL segment file that is currently open for reading (if
       * any), frees the xlogreader's private_data, the xlogreader itself and
       * the prefetcher, and -- only when archive recovery was requested --
       * unlinks the RECOVERYXLOG and RECOVERYHISTORY files under XLOGDIR and
       * disowns the recovery wakeup latch.
       *
       * Takes no arguments and returns nothing; it operates on the file-level
       * recovery state (xlogreader, xlogprefetcher, readFile).
1616 : */
1617 : void
1618 1798 : ShutdownWalRecovery(void)
1619 : {
1620 : char recoveryPath[MAXPGPATH];
1621 :
1622 : /* Final update of pg_stat_recovery_prefetch. */
1623 1798 : XLogPrefetcherComputeStats(xlogprefetcher);
1624 :
1625 : /* Shut down xlogreader; readFile is reset to -1 to mark "no file open" */
1626 1798 : if (readFile >= 0)
1627 : {
1628 1696 : close(readFile);
1629 1696 : readFile = -1;
1630 : }
1631 1798 : pfree(xlogreader->private_data);
1632 1798 : XLogReaderFree(xlogreader);
1633 1798 : XLogPrefetcherFree(xlogprefetcher);
1634 :
1635 1798 : if (ArchiveRecoveryRequested)
1636 : {
1637 : /*
1638 : * Since there might be a partial WAL segment named RECOVERYXLOG, get
1639 : * rid of it.
1640 : */
1641 102 : snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1642 102 : unlink(recoveryPath); /* ignore any error */
1643 :
1644 : /* Get rid of any remaining recovered timeline-history file, too */
1645 102 : snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1646 102 : unlink(recoveryPath); /* ignore any error */
1647 : }
1648 :
1649 : /*
1650 : * We don't need the latch anymore. It's not strictly necessary to disown
1651 : * it, but let's do it for the sake of tidiness.
1652 : */
1653 1798 : if (ArchiveRecoveryRequested)
1654 102 : DisownLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
1655 1798 : }
1656 :
1657 : /*
1658 : * Perform WAL recovery.
1659 : *
1660 : * If the system was shut down cleanly, this is never called.
       *
       * Outline of what happens here:
       *
       * 1. Under the info_lck spinlock, seed the replay-progress fields of
       *    XLogRecoveryCtl, reset the recorded transaction/chunk times and
       *    clear the pause state.
       * 2. Tell the postmaster that redo has started (only when running under
       *    a postmaster) and check whether we are already consistent.
       * 3. Read the first record to replay: the record at RedoStartLSN when
       *    that precedes the checkpoint record (it must be an
       *    XLOG_CHECKPOINT_REDO record, else FATAL), otherwise the record that
       *    follows the checkpoint record.
       * 4. If there is such a record, run the main redo loop, applying one
       *    record per iteration via ApplyWalRecord(), until ReadRecord()
       *    returns NULL or a recovery target is hit; then act on
       *    recoveryTargetAction and log how far redo went.  If there is no
       *    record, just log that redo is not required.
       * 5. Raise FATAL if archive recovery was requested with a recovery
       *    target that was never reached.
1661 : */
1662 : void
1663 430 : PerformWalRecovery(void)
1664 : {
1665 : XLogRecord *record;
1666 430 : bool reachedRecoveryTarget = false;
1667 : TimeLineID replayTLI;
1668 :
1669 : /*
1670 : * Initialize shared variables for tracking progress of WAL replay, as if
1671 : * we had just replayed the record before the REDO location (or the
1672 : * checkpoint record itself, if it's a shutdown checkpoint).
1673 : */
1674 430 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1675 430 : if (RedoStartLSN < CheckPointLoc)
1676 : {
1677 228 : XLogRecoveryCtl->lastReplayedReadRecPtr = InvalidXLogRecPtr;
1678 228 : XLogRecoveryCtl->lastReplayedEndRecPtr = RedoStartLSN;
1679 228 : XLogRecoveryCtl->lastReplayedTLI = RedoStartTLI;
1680 : }
1681 : else
1682 : {
1683 202 : XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
1684 202 : XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
1685 202 : XLogRecoveryCtl->lastReplayedTLI = CheckPointTLI;
1686 : }
1687 430 : XLogRecoveryCtl->replayEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
1688 430 : XLogRecoveryCtl->replayEndTLI = XLogRecoveryCtl->lastReplayedTLI;
1689 430 : XLogRecoveryCtl->recoveryLastXTime = 0;
1690 430 : XLogRecoveryCtl->currentChunkStartTime = 0;
1691 430 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
1692 430 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
1693 :
1694 : /* Also ensure XLogReceiptTime has a sane value */
1695 430 : XLogReceiptTime = GetCurrentTimestamp();
1696 :
1697 : /*
1698 : * Let postmaster know we've started redo now, so that it can launch the
1699 : * archiver if necessary.
1700 : */
1701 430 : if (IsUnderPostmaster)
1702 412 : SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
1703 :
1704 : /*
1705 : * Allow read-only connections immediately if we're consistent already.
1706 : */
1707 430 : CheckRecoveryConsistency();
1708 :
1709 : /*
1710 : * Find the first record that logically follows the checkpoint --- it
1711 : * might physically precede it, though.
1712 : */
1713 430 : if (RedoStartLSN < CheckPointLoc)
1714 : {
1715 : /* back up to find the record; a read failure here is a PANIC */
1716 228 : replayTLI = RedoStartTLI;
1717 228 : XLogPrefetcherBeginRead(xlogprefetcher, RedoStartLSN);
1718 228 : record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1719 :
1720 : /*
1721 : * If a checkpoint record's redo pointer points back to an earlier
1722 : * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1723 : * record.
1724 : */
1725 228 : if (record->xl_rmid != RM_XLOG_ID ||
1726 228 : (record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO)
1727 0 : ereport(FATAL,
1728 : errmsg("unexpected record type found at redo point %X/%08X",
1729 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)));
1730 : }
1731 : else
1732 : {
1733 : /* just have to read next record after CheckPoint */
1734 : Assert(xlogreader->ReadRecPtr == CheckPointLoc);
1735 202 : replayTLI = CheckPointTLI;
1736 202 : record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1737 : }
1738 :
1739 430 : if (record != NULL)
1740 : {
1741 : TimestampTz xtime;
1742 : PGRUsage ru0;
1743 :
1744 412 : pg_rusage_init(&ru0);
1745 :
1746 412 : InRedo = true;
1747 :
1748 412 : RmgrStartup();
1749 :
1750 412 : ereport(LOG,
1751 : errmsg("redo starts at %X/%08X",
1752 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)));
1753 :
1754 : /* Prepare to report progress of the redo phase. */
1755 412 : if (!StandbyMode)
1756 214 : begin_startup_progress_phase();
1757 :
1758 : /*
1759 : * main redo apply loop
       *
       * Each iteration handles exactly one record: the one currently held
       * in 'record'.  The loop ends when ReadRecord() returns NULL or when
       * one of the recovery-target checks breaks out.
1760 : */
1761 : do
1762 : {
1763 5512684 : if (!StandbyMode)
1764 527424 : ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%08X",
1765 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr));
1766 :
1767 : #ifdef WAL_DEBUG
1768 : if (XLOG_DEBUG)
1769 : {
1770 : StringInfoData buf;
1771 :
1772 : initStringInfo(&buf);
1773 : appendStringInfo(&buf, "REDO @ %X/%08X; LSN %X/%08X: ",
1774 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
1775 : LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
1776 : xlog_outrec(&buf, xlogreader);
1777 : appendStringInfoString(&buf, " - ");
1778 : xlog_outdesc(&buf, xlogreader);
1779 : elog(LOG, "%s", buf.data);
1780 : pfree(buf.data);
1781 : }
1782 : #endif
1783 :
1784 : /* Handle interrupt signals of startup process */
1785 5512684 : ProcessStartupProcInterrupts();
1786 :
1787 : /*
1788 : * Pause WAL replay, if requested by a hot-standby session via
1789 : * SetRecoveryPause().
1790 : *
1791 : * Note that we intentionally don't take the info_lck spinlock
1792 : * here. We might therefore read a slightly stale value of the
1793 : * recoveryPause flag, but it can't be very stale (no worse than
1794 : * the last spinlock we did acquire). Since a pause request is a
1795 : * pretty asynchronous thing anyway, possibly responding to it one
1796 : * WAL record later than we otherwise would is a minor issue, so
1797 : * it doesn't seem worth adding another spinlock cycle to prevent
1798 : * that.
1799 : */
1800 5512684 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1801 : RECOVERY_NOT_PAUSED)
1802 0 : recoveryPausesHere(false);
1803 :
1804 : /*
1805 : * Have we reached our recovery target?
       *
       * If so, stop without applying the current record.
1806 : */
1807 5512684 : if (recoveryStopsBefore(xlogreader))
1808 : {
1809 4 : reachedRecoveryTarget = true;
1810 4 : break;
1811 : }
1812 :
1813 : /*
1814 : * If we've been asked to lag the primary, wait on latch until
1815 : * enough time has passed.
1816 : */
1817 5512680 : if (recoveryApplyDelay(xlogreader))
1818 : {
1819 : /*
1820 : * We test for paused recovery again here. If user sets
1821 : * delayed apply, it may be because they expect to pause
1822 : * recovery in case of problems, so we must test again here
1823 : * otherwise pausing during the delay-wait wouldn't work.
1824 : */
1825 18 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1826 : RECOVERY_NOT_PAUSED)
1827 0 : recoveryPausesHere(false);
1828 : }
1829 :
1830 : /*
1831 : * Apply the record
       *
       * replayTLI is passed by address because ApplyWalRecord() updates
       * it when the record switches timelines.
1832 : */
1833 5512680 : ApplyWalRecord(xlogreader, record, &replayTLI);
1834 :
1835 : /* Exit loop if we reached inclusive recovery target */
1836 5512676 : if (recoveryStopsAfter(xlogreader))
1837 : {
1838 10 : reachedRecoveryTarget = true;
1839 10 : break;
1840 : }
1841 :
1842 : /*
1843 : * If we replayed an LSN that someone was waiting for then walk
1844 : * over the shared memory array and set latches to notify the
1845 : * waiters.
1846 : */
1847 11025332 : if (waitLSNState &&
1848 5512666 : (XLogRecoveryCtl->lastReplayedEndRecPtr >=
1849 5512666 : pg_atomic_read_u64(&waitLSNState->minWaitedLSN[WAIT_LSN_TYPE_REPLAY])))
1850 14 : WaitLSNWakeup(WAIT_LSN_TYPE_REPLAY, XLogRecoveryCtl->lastReplayedEndRecPtr);
1851 :
1852 : /* Else, try to fetch the next WAL record */
1853 5512666 : record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1854 5512556 : } while (record != NULL);
1855 :
1856 : /*
1857 : * end of main redo apply loop
1858 : */
1859 :
1860 298 : if (reachedRecoveryTarget)
1861 : {
1862 14 : if (!reachedConsistency)
1863 0 : ereport(FATAL,
1864 : (errmsg("requested recovery stop point is before consistent recovery point")));
1865 :
1866 : /*
1867 : * This is the last point where we can restart recovery with a new
1868 : * recovery target, if we shutdown and begin again. After this,
1869 : * Resource Managers may choose to do permanent corrective actions
1870 : * at end of recovery.
1871 : */
1872 14 : switch (recoveryTargetAction)
1873 : {
1874 0 : case RECOVERY_TARGET_ACTION_SHUTDOWN:
1875 :
1876 : /*
1877 : * exit with special return code to request shutdown of
1878 : * postmaster. Log messages issued from postmaster.
       *
       * NOTE(review): no break follows; this relies on
       * proc_exit() not returning -- confirm against its
       * declaration.
1879 : */
1880 0 : proc_exit(3);
1881 :
1882 2 : case RECOVERY_TARGET_ACTION_PAUSE:
1883 2 : SetRecoveryPause(true);
1884 2 : recoveryPausesHere(true);
1885 :
1886 : /* drop into promote (intentional fall through) */
1887 :
1888 14 : case RECOVERY_TARGET_ACTION_PROMOTE:
1889 14 : break;
1890 : }
1891 : }
1892 :
1893 298 : RmgrCleanup();
1894 :
1895 298 : ereport(LOG,
1896 : errmsg("redo done at %X/%08X system usage: %s",
1897 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
1898 : pg_rusage_show(&ru0)));
1899 298 : xtime = GetLatestXTime();
1900 298 : if (xtime)
1901 74 : ereport(LOG,
1902 : (errmsg("last completed transaction was at log time %s",
1903 : timestamptz_to_str(xtime))));
1904 :
1905 298 : InRedo = false;
1906 : }
1907 : else
1908 : {
1909 : /* there are no WAL records following the checkpoint */
1910 18 : ereport(LOG,
1911 : (errmsg("redo is not required")));
1912 : }
1913 :
1914 : /*
1915 : * This check is intentionally after the above log messages that indicate
1916 : * how far recovery went.
1917 : */
1918 316 : if (ArchiveRecoveryRequested &&
1919 104 : recoveryTarget != RECOVERY_TARGET_UNSET &&
1920 16 : !reachedRecoveryTarget)
1921 2 : ereport(FATAL,
1922 : (errcode(ERRCODE_CONFIG_FILE_ERROR),
1923 : errmsg("recovery ended before configured recovery target was reached")));
1924 314 : }
1925 :
1926 : /*
1927 : * Subroutine of PerformWalRecovery, to apply one WAL record.
       *
       * 'xlogreader' is the reader state that holds the record being replayed,
       * 'record' is that record's header, and '*replayTLI' is the timeline
       * currently being replayed.  '*replayTLI' is updated in place when the
       * record (a shutdown checkpoint or end-of-recovery record) names a
       * different timeline.
       *
       * The sequence is: install the redo error-context callback, advance
       * nextXid past the record's xid, detect and validate a timeline switch,
       * publish replayEndRecPtr/replayEndTLI, register the xid for hot standby
       * if applicable, run xlogrecovery_redo() for RM_XLOG_ID records, call the
       * resource manager's rm_redo routine, optionally verify page consistency,
       * pop the error context, publish the lastReplayed* fields, wake
       * walsenders and the walreceiver as needed, re-check consistency, and
       * finally do post-switch cleanup if the timeline changed.
1928 : */
1929 : static void
1930 5512680 : ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
1931 : {
1932 : ErrorContextCallback errcallback;
1933 5512680 : bool switchedTLI = false;
1934 :
1935 : /* Setup error traceback support for ereport() */
1936 5512680 : errcallback.callback = rm_redo_error_callback;
1937 5512680 : errcallback.arg = xlogreader;
1938 5512680 : errcallback.previous = error_context_stack;
1939 5512680 : error_context_stack = &errcallback;
1940 :
1941 : /*
1942 : * TransamVariables->nextXid must be beyond record's xid.
1943 : */
1944 5512680 : AdvanceNextFullTransactionIdPastXid(record->xl_xid);
1945 :
1946 : /*
1947 : * Before replaying this record, check if this record causes the current
1948 : * timeline to change. The record is already considered to be part of the
1949 : * new timeline, so we update replayTLI before replaying it. That's
1950 : * important so that replayEndTLI, which is recorded as the minimum
1951 : * recovery point's TLI if recovery stops after this record, is set
1952 : * correctly.
1953 : */
1954 5512680 : if (record->xl_rmid == RM_XLOG_ID)
1955 : {
       /* Both default to the current TLI, i.e. "no switch" */
1956 89336 : TimeLineID newReplayTLI = *replayTLI;
1957 89336 : TimeLineID prevReplayTLI = *replayTLI;
1958 89336 : uint8 info = record->xl_info & ~XLR_INFO_MASK;
1959 :
1960 89336 : if (info == XLOG_CHECKPOINT_SHUTDOWN)
1961 : {
1962 : CheckPoint checkPoint;
1963 :
1964 68 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1965 68 : newReplayTLI = checkPoint.ThisTimeLineID;
1966 68 : prevReplayTLI = checkPoint.PrevTimeLineID;
1967 : }
1968 89268 : else if (info == XLOG_END_OF_RECOVERY)
1969 : {
1970 : xl_end_of_recovery xlrec;
1971 :
1972 20 : memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1973 20 : newReplayTLI = xlrec.ThisTimeLineID;
1974 20 : prevReplayTLI = xlrec.PrevTimeLineID;
1975 : }
1976 :
1977 89336 : if (newReplayTLI != *replayTLI)
1978 : {
1979 : /* Check that it's OK to switch to this TLI */
1980 22 : checkTimeLineSwitch(xlogreader->EndRecPtr,
1981 : newReplayTLI, prevReplayTLI, *replayTLI);
1982 :
1983 : /* Following WAL records should be run with new TLI */
1984 22 : *replayTLI = newReplayTLI;
1985 22 : switchedTLI = true;
1986 : }
1987 : }
1988 :
1989 : /*
1990 : * Update shared replayEndRecPtr before replaying this record, so that
1991 : * XLogFlush will update minRecoveryPoint correctly.
1992 : */
1993 5512680 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1994 5512680 : XLogRecoveryCtl->replayEndRecPtr = xlogreader->EndRecPtr;
1995 5512680 : XLogRecoveryCtl->replayEndTLI = *replayTLI;
1996 5512680 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
1997 :
1998 : /*
1999 : * If we are attempting to enter Hot Standby mode, process XIDs we see
2000 : */
2001 5512680 : if (standbyState >= STANDBY_INITIALIZED &&
2002 5025070 : TransactionIdIsValid(record->xl_xid))
2003 4914362 : RecordKnownAssignedTransactionIds(record->xl_xid);
2004 :
2005 : /*
2006 : * Some XLOG record types that are related to recovery are processed
2007 : * directly here, rather than in xlog_redo()
2008 : */
2009 5512680 : if (record->xl_rmid == RM_XLOG_ID)
2010 89336 : xlogrecovery_redo(xlogreader, *replayTLI);
2011 :
2012 : /* Now apply the WAL record itself */
2013 5512680 : GetRmgr(record->xl_rmid).rm_redo(xlogreader);
2014 :
2015 : /*
2016 : * After redo, check whether the backup pages associated with the WAL
2017 : * record are consistent with the existing pages. This check is done only
2018 : * if consistency check is enabled for this record.
2019 : */
2020 5512676 : if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
2021 4347092 : verifyBackupPageConsistency(xlogreader);
2022 :
2023 : /* Pop the error context stack */
2024 5512676 : error_context_stack = errcallback.previous;
2025 :
2026 : /*
2027 : * Update lastReplayedEndRecPtr after this record has been successfully
2028 : * replayed.
2029 : */
2030 5512676 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
2031 5512676 : XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
2032 5512676 : XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
2033 5512676 : XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
2034 5512676 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
2035 :
2036 : /* ------
2037 : * Wakeup walsenders:
2038 : *
2039 : * On the standby, the WAL is flushed first (which will only wake up
2040 : * physical walsenders) and then applied, which will only wake up logical
2041 : * walsenders.
2042 : *
2043 : * Indeed, logical walsenders on standby can't decode and send data until
2044 : * it's been applied.
2045 : *
2046 : * Physical walsenders don't need to be woken up during replay unless
2047 : * cascading replication is allowed and time line change occurred (so that
2048 : * they can notice that they are on a new time line).
2049 : *
2050 : * That's why the wake up conditions are for:
2051 : *
2052 : * - physical walsenders in case of new time line and cascade
2053 : * replication is allowed
2054 : * - logical walsenders in case cascade replication is allowed (could not
2055 : * be created otherwise)
2056 : * ------
2057 : */
2058 5512676 : if (AllowCascadeReplication())
2059 5134452 : WalSndWakeup(switchedTLI, true);
2060 :
2061 : /*
2062 : * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2063 : * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2064 : * a reply to the primary.
2065 : */
2066 5512676 : if (doRequestWalReceiverReply)
2067 : {
2068 4 : doRequestWalReceiverReply = false;
2069 4 : WalRcvForceReply();
2070 : }
2071 :
2072 : /* Allow read-only connections if we're consistent now */
2073 5512676 : CheckRecoveryConsistency();
2074 :
2075 : /* Is this a timeline switch? */
2076 5512676 : if (switchedTLI)
2077 : {
2078 : /*
2079 : * Before we continue on the new timeline, clean up any (possibly
2080 : * bogus) future WAL segments on the old timeline.
2081 : */
2082 22 : RemoveNonParentXlogFiles(xlogreader->EndRecPtr, *replayTLI);
2083 :
2084 : /* Reset the prefetcher. */
2085 22 : XLogPrefetchReconfigure();
2086 : }
2087 5512676 : }
2088 :
2089 : /*
2090 : * Some XLOG RM record types that are directly related to WAL recovery are
2091 : * handled here rather than in the xlog_redo()
       *
       * 'record' is the reader state holding the RM_XLOG_ID record being
       * replayed.  Two record types are acted upon; all others are ignored:
       *
       * - XLOG_OVERWRITE_CONTRECORD: check that the LSN stored in the record
       *   matches the reader's overwrittenRecPtr (FATAL on mismatch), then
       *   clear abortedRecPtr, missingContrecPtr and the reader's
       *   overwrittenRecPtr.
       * - XLOG_BACKUP_END: if the start point stored in the record equals
       *   backupStartPoint, set backupEndPoint to the record's end LSN;
       *   otherwise only emit a DEBUG1 message.
       *
       * 'replayTLI' is not referenced anywhere in the body.
2092 : */
2093 : static void
2094 89336 : xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
2095 : {
2096 89336 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2097 89336 : XLogRecPtr lsn = record->EndRecPtr;
2098 :
2099 : Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2100 :
2101 89336 : if (info == XLOG_OVERWRITE_CONTRECORD)
2102 : {
2103 : /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2104 : xl_overwrite_contrecord xlrec;
2105 :
2106 2 : memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
2107 2 : if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2108 0 : elog(FATAL, "mismatching overwritten LSN %X/%08X -> %X/%08X",
2109 : LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2110 : LSN_FORMAT_ARGS(record->overwrittenRecPtr));
2111 :
2112 : /* We have safely skipped the aborted record */
2113 2 : abortedRecPtr = InvalidXLogRecPtr;
2114 2 : missingContrecPtr = InvalidXLogRecPtr;
2115 :
2116 2 : ereport(LOG,
2117 : errmsg("successfully skipped missing contrecord at %X/%08X, overwritten at %s",
2118 : LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2119 : timestamptz_to_str(xlrec.overwrite_time)));
2120 :
2121 : /* Verifying the record should only happen once */
2122 2 : record->overwrittenRecPtr = InvalidXLogRecPtr;
2123 : }
2124 89334 : else if (info == XLOG_BACKUP_END)
2125 : {
2126 : XLogRecPtr startpoint;
2127 :
2128 174 : memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2129 :
2130 174 : if (backupStartPoint == startpoint)
2131 : {
2132 : /*
2133 : * We have reached the end of base backup, the point where
2134 : * pg_backup_stop() was done. The data on disk is now consistent
2135 : * (assuming we have also reached minRecoveryPoint). Set
2136 : * backupEndPoint to the current LSN, so that the next call to
2137 : * CheckRecoveryConsistency() will notice it and do the
2138 : * end-of-backup processing.
       *
       * "Current LSN" here is 'lsn', i.e. the record's EndRecPtr.
2139 : */
2140 142 : elog(DEBUG1, "end of backup record reached");
2141 :
2142 142 : backupEndPoint = lsn;
2143 : }
2144 : else
2145 32 : elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%08X, waiting for %X/%08X",
2146 : LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint));
2147 : }
2148 89336 : }
2149 :
2150 : /*
2151 : * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2152 : * directories.
2153 : *
2154 : * Replay of database creation XLOG records for databases that were later
2155 : * dropped can create fake directories in pg_tblspc. By the time consistency
2156 : * is reached these directories should have been removed; here we verify
2157 : * that this did indeed happen. This is to be called at the point where
2158 : * consistent state is reached.
2159 : *
2160 : * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2161 : * useful for testing purposes, and also allows for an escape hatch in case
2162 : * things go south.
       *
       * Only entries whose names consist solely of decimal digits are examined;
       * every other entry is skipped.  Each examined entry that
       * get_dirent_type() does not classify as a symbolic link is reported.
       * When the report is only a WARNING the scan continues with the next
       * entry.  The directory handle is released before returning.
2163 : */
2164 : static void
2165 230 : CheckTablespaceDirectory(void)
2166 : {
2167 : DIR *dir;
2168 : struct dirent *de;
2169 :
2170 230 : dir = AllocateDir(PG_TBLSPC_DIR);
2171 704 : while ((de = ReadDir(dir, PG_TBLSPC_DIR)) != NULL)
2172 : {
2173 : char path[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
2174 :
2175 : /* Skip entries of non-oid names */
2176 474 : if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2177 460 : continue;
2178 :
2179 14 : snprintf(path, sizeof(path), "%s/%s", PG_TBLSPC_DIR, de->d_name);
2180 :
2181 14 : if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2182 8 : ereport(allow_in_place_tablespaces ? WARNING : PANIC,
2183 : (errcode(ERRCODE_DATA_CORRUPTED),
2184 : errmsg("unexpected directory entry \"%s\" found in %s",
2185 : de->d_name, PG_TBLSPC_DIR),
2186 : errdetail("All directory entries in %s/ should be symbolic links.",
2187 : PG_TBLSPC_DIR),
2188 : errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
2189 : }

       /*
        * Release the handle obtained from AllocateDir() above; nothing else in
        * this function closes it, so without this call it would stay open
        * after we return.
        */
       FreeDir(dir);
2190 230 : }
2191 :
2192 : /*
2193 : * Checks if recovery has reached a consistent state. When consistency is
2194 : * reached and we have a valid starting standby snapshot, tell postmaster
2195 : * that it can start accepting read-only connections.
       *
       * Returns immediately when minRecoveryPoint is not valid.  Otherwise it
       * performs up to three independent steps, each guarded by its own
       * condition:
       *
       * 1. End-of-backup processing, once replay has reached backupEndPoint.
       * 2. Marking the consistent state (reachedConsistency) and signalling the
       *    postmaster, once replay has reached minRecoveryPoint and no
       *    end-of-backup record is still required.
       * 3. Activating hot standby, once a standby snapshot is ready and
       *    consistency has been reached (only under a postmaster).
2196 : */
2197 : static void
2198 5513110 : CheckRecoveryConsistency(void)
2199 : {
2200 : XLogRecPtr lastReplayedEndRecPtr;
2201 : TimeLineID lastReplayedTLI;
2202 :
2203 : /*
2204 : * During crash recovery, we don't reach a consistent state until we've
2205 : * replayed all the WAL.
2206 : */
2207 5513110 : if (!XLogRecPtrIsValid(minRecoveryPoint))
2208 517202 : return;
2209 :
2210 : Assert(InArchiveRecovery);
2211 :
2212 : /*
2213 : * assume that we are called in the startup process, and hence don't need
2214 : * a lock to read lastReplayedEndRecPtr
2215 : */
2216 4995908 : lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2217 4995908 : lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2218 :
2219 : /*
2220 : * Have we reached the point where our base backup was completed?
2221 : */
2222 4995908 : if (XLogRecPtrIsValid(backupEndPoint) &&
2223 206 : backupEndPoint <= lastReplayedEndRecPtr)
2224 : {
       /* Saved for the log message below, since both are reset first */
2225 146 : XLogRecPtr saveBackupStartPoint = backupStartPoint;
2226 146 : XLogRecPtr saveBackupEndPoint = backupEndPoint;
2227 :
2228 146 : elog(DEBUG1, "end of backup reached");
2229 :
2230 : /*
2231 : * We have reached the end of base backup, as indicated by pg_control.
2232 : * Update the control file accordingly.
2233 : */
2234 146 : ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
2235 146 : backupStartPoint = InvalidXLogRecPtr;
2236 146 : backupEndPoint = InvalidXLogRecPtr;
2237 146 : backupEndRequired = false;
2238 :
2239 146 : ereport(LOG,
2240 : errmsg("completed backup recovery with redo LSN %X/%08X and end LSN %X/%08X",
2241 : LSN_FORMAT_ARGS(saveBackupStartPoint),
2242 : LSN_FORMAT_ARGS(saveBackupEndPoint)));
2243 : }
2244 :
2245 : /*
2246 : * Have we passed our safe starting point? Note that minRecoveryPoint is
2247 : * known to be incorrectly set if recovering from a backup, until the
2248 : * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2249 : * All we know prior to that is that we're not consistent yet.
2250 : */
2251 4995908 : if (!reachedConsistency && !backupEndRequired &&
2252 15266 : minRecoveryPoint <= lastReplayedEndRecPtr)
2253 : {
2254 : /*
2255 : * Check to see if the XLOG sequence contained any unresolved
2256 : * references to uninitialized pages.
2257 : */
2258 230 : XLogCheckInvalidPages();
2259 :
2260 : /*
2261 : * Check that pg_tblspc doesn't contain any real directories. Replay
2262 : * of Database/CREATE_* records may have created fictitious tablespace
2263 : * directories that should have been removed by the time consistency
2264 : * was reached.
2265 : */
2266 230 : CheckTablespaceDirectory();
2267 :
2268 230 : reachedConsistency = true;
2269 230 : SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
2270 230 : ereport(LOG,
2271 : errmsg("consistent recovery state reached at %X/%08X",
2272 : LSN_FORMAT_ARGS(lastReplayedEndRecPtr)));
2273 : }
2274 :
2275 : /*
2276 : * Have we got a valid starting snapshot that will allow queries to be
2277 : * run? If so, we can tell postmaster that the database is consistent now,
2278 : * enabling connections.
       *
       * LocalHotStandbyActive makes this a one-time action: it is set below
       * and tested here.
2279 : */
2280 4995908 : if (standbyState == STANDBY_SNAPSHOT_READY &&
2281 4995458 : !LocalHotStandbyActive &&
2282 214 : reachedConsistency &&
2283 : IsUnderPostmaster)
2284 : {
2285 214 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
2286 214 : XLogRecoveryCtl->SharedHotStandbyActive = true;
2287 214 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
2288 :
2289 214 : LocalHotStandbyActive = true;
2290 :
2291 214 : SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
2292 : }
2293 : }
2294 :
2295 : /*
2296 : * Error context callback for errors occurring during rm_redo().
       *
       * 'arg' is the XLogReaderState that ApplyWalRecord() stored in the
       * callback's arg field.  The callback builds a description of the record
       * (xlog_outdesc() followed by xlog_block_info()) in a temporary
       * StringInfo, attaches it to the error being reported together with the
       * record's start LSN via errcontext(), and then frees the buffer.
2297 : */
2298 : static void
2299 224 : rm_redo_error_callback(void *arg)
2300 : {
2301 224 : XLogReaderState *record = (XLogReaderState *) arg;
2302 : StringInfoData buf;
2303 :
2304 224 : initStringInfo(&buf);
2305 224 : xlog_outdesc(&buf, record);
2306 224 : xlog_block_info(&buf, record);
2307 :
2308 : /* translator: %s is a WAL record description */
2309 224 : errcontext("WAL redo at %X/%08X for %s",
2310 224 : LSN_FORMAT_ARGS(record->ReadRecPtr),
2311 : buf.data);
2312 :
2313 224 : pfree(buf.data);
2314 224 : }
2315 :
2316 : /*
2317 : * Appends to 'buf' a string describing an XLogRecord, consisting of its
2318 : * identity followed by a colon, a space, and a further description.
       *
       * The text has the form "<rm_name>/<identity>: <description>", where
       * <identity> comes from the resource manager's rm_identify callback and
       * <description> is whatever its rm_desc callback appends.  If rm_identify
       * returns NULL, "UNKNOWN (<info bits in hex>)" is used as the identity.
       *
       * Nothing is returned; the output is accumulated in 'buf'.
2319 : */
2320 : void
2321 224 : xlog_outdesc(StringInfo buf, XLogReaderState *record)
2322 : {
2323 224 : RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2324 224 : uint8 info = XLogRecGetInfo(record);
2325 : const char *id;
2326 :
2327 224 : appendStringInfoString(buf, rmgr.rm_name);
2328 224 : appendStringInfoChar(buf, '/');
2329 :
       /* rm_identify receives the full info byte, unmasked */
2330 224 : id = rmgr.rm_identify(info);
2331 224 : if (id == NULL)
2332 0 : appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2333 : else
2334 224 : appendStringInfo(buf, "%s: ", id);
2335 :
2336 224 : rmgr.rm_desc(buf, record);
2337 224 : }
2338 :
2339 : #ifdef WAL_DEBUG
2340 :
/*
 * Append header details of a WAL record to 'buf': the previous-record LSN,
 * the xid, the data length, and then the per-block information produced by
 * xlog_block_info().  Only compiled when WAL_DEBUG is defined.
 */
2341 : static void
2342 : xlog_outrec(StringInfo buf, XLogReaderState *record)
2343 : {
2344 : appendStringInfo(buf, "prev %X/%08X; xid %u",
2345 : LSN_FORMAT_ARGS(XLogRecGetPrev(record)),
2346 : XLogRecGetXid(record));
2347 :
2348 : appendStringInfo(buf, "; len %u",
2349 : XLogRecGetDataLen(record));
2350 :
2351 : xlog_block_info(buf, record);
2352 : }
2353 : #endif /* WAL_DEBUG */
2354 :
2355 : /*
2356 : * Appends to 'buf' a string giving information about all the blocks in an
2357 : * XLogRecord.
       *
       * For every block id from 0 up to XLogRecMaxBlockId(record) that has a
       * block reference, one "; blkref #N: rel spc/db/rel, [fork F, ]blk B"
       * fragment is appended; the fork number is printed only when it is not
       * MAIN_FORKNUM.  " FPW" is added when the block carries an image.
       * Nothing is returned.
2358 : */
2359 : static void
2360 224 : xlog_block_info(StringInfo buf, XLogReaderState *record)
2361 : {
2362 : int block_id;
2363 :
2364 : /* decode block references */
2365 332 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2366 : {
2367 : RelFileLocator rlocator;
2368 : ForkNumber forknum;
2369 : BlockNumber blk;
2370 :
       /* Block ids with no reference in this record are skipped */
2371 108 : if (!XLogRecGetBlockTagExtended(record, block_id,
2372 : &rlocator, &forknum, &blk, NULL))
2373 0 : continue;
2374 :
2375 108 : if (forknum != MAIN_FORKNUM)
2376 6 : appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2377 : block_id,
2378 : rlocator.spcOid, rlocator.dbOid,
2379 : rlocator.relNumber,
2380 : forknum,
2381 : blk);
2382 : else
2383 102 : appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2384 : block_id,
2385 : rlocator.spcOid, rlocator.dbOid,
2386 : rlocator.relNumber,
2387 : blk);
2388 108 : if (XLogRecHasBlockImage(record, block_id))
2389 68 : appendStringInfoString(buf, " FPW");
2390 : }
2391 224 : }
2392 :
2393 :
2394 : /*
2395 : * Check that it's OK to switch to new timeline during recovery.
2396 : *
2397 : * 'lsn' is supplied by ApplyWalRecord() as the end pointer (EndRecPtr) of
2398 : * the record we're about to replay; that record is either a shutdown
       * checkpoint or an end-of-recovery record.
       *
       * 'newTLI' and 'prevTLI' are the timeline IDs taken from that record,
       * and 'replayTLI' is the timeline replay is currently on.
       *
       * Returns normally if the switch is acceptable; every violation is
       * reported at PANIC level.
2399 : */
2400 : static void
2401 22 : checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI,
2402 : TimeLineID replayTLI)
2403 : {
2404 : /* Check that the record agrees on what the current (old) timeline is */
2405 22 : if (prevTLI != replayTLI)
2406 0 : ereport(PANIC,
2407 : (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2408 : prevTLI, replayTLI)));
2409 :
2410 : /*
2411 : * The new timeline better be in the list of timelines we expect to see,
2412 : * according to the timeline history. It should also not decrease.
2413 : */
2414 22 : if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2415 0 : ereport(PANIC,
2416 : (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2417 : newTLI, replayTLI)));
2418 :
2419 : /*
2420 : * If we have not yet reached min recovery point, and we're about to
2421 : * switch to a timeline greater than the timeline of the min recovery
2422 : * point: trouble. After switching to the new timeline, we could not
2423 : * possibly visit the min recovery point on the correct timeline anymore.
2424 : * This can happen if there is a newer timeline in the archive that
2425 : * branched before the timeline the min recovery point is on, and you
2426 : * attempt to do PITR to the new timeline.
2427 : */
2428 22 : if (XLogRecPtrIsValid(minRecoveryPoint) &&
2429 18 : lsn < minRecoveryPoint &&
2430 2 : newTLI > minRecoveryPointTLI)
2431 0 : ereport(PANIC,
2432 : errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%08X on timeline %u",
2433 : newTLI,
2434 : LSN_FORMAT_ARGS(minRecoveryPoint),
2435 : minRecoveryPointTLI));
2436 :
2437 : /* Looks good */
2438 22 : }
2439 :
2440 :
2441 : /*
2442 : * Extract timestamp from WAL record.
2443 : *
2444 : * If the record contains a timestamp, returns true, and saves the timestamp
2445 : * in *recordXtime. If the record type has no timestamp, returns false.
2446 : * Currently, only transaction commit/abort records and restore points contain
2447 : * timestamps.
       *
       * Concretely, the recognized records are XLOG_RESTORE_POINT of RM_XLOG_ID,
       * and XLOG_XACT_COMMIT, XLOG_XACT_COMMIT_PREPARED, XLOG_XACT_ABORT and
       * XLOG_XACT_ABORT_PREPARED of RM_XACT_ID.  *recordXtime is left untouched
       * when false is returned.
2448 : */
2449 : static bool
2450 86546 : getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
2451 : {
2452 86546 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
       /* Only meaningful for RM_XACT_ID records; tested together with rmid */
2453 86546 : uint8 xact_info = info & XLOG_XACT_OPMASK;
2454 86546 : uint8 rmid = XLogRecGetRmid(record);
2455 :
2456 86546 : if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2457 : {
2458 4 : *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2459 4 : return true;
2460 : }
2461 86542 : if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2462 : xact_info == XLOG_XACT_COMMIT_PREPARED))
2463 : {
2464 79346 : *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2465 79346 : return true;
2466 : }
2467 7196 : if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2468 : xact_info == XLOG_XACT_ABORT_PREPARED))
2469 : {
2470 7196 : *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2471 7196 : return true;
2472 : }
2473 0 : return false;
2474 : }
2475 :
2476 : /*
2477 : * Checks whether the current buffer page and backup page stored in the
2478 : * WAL record are consistent or not. Before comparing the two pages, a
2479 : * masking can be applied to the pages to ignore certain areas like hint bits,
2480 : * unused space between pd_lower and pd_upper among other things. This
2481 : * function should be called once WAL replay has been completed for a
2482 : * given record.
2483 : */
2484 : static void
2485 4347092 : verifyBackupPageConsistency(XLogReaderState *record)
2486 : {
2487 4347092 : RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2488 : RelFileLocator rlocator;
2489 : ForkNumber forknum;
2490 : BlockNumber blkno;
2491 : int block_id;
2492 :
2493 : /* Records with no backup blocks have no need for consistency checks. */
2494 4347092 : if (!XLogRecHasAnyBlockRefs(record))
2495 108 : return;
2496 :
2497 : Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
2498 :
2499 9033668 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2500 : {
2501 : Buffer buf;
2502 : Page page;
2503 :
2504 4686684 : if (!XLogRecGetBlockTagExtended(record, block_id,
2505 : &rlocator, &forknum, &blkno, NULL))
2506 : {
2507 : /*
2508 : * WAL record doesn't contain a block reference with the given id.
2509 : * Do nothing.
2510 : */
2511 4058 : continue;
2512 : }
2513 :
2514 : Assert(XLogRecHasBlockImage(record, block_id));
2515 :
2516 4682626 : if (XLogRecBlockImageApply(record, block_id))
2517 : {
2518 : /*
2519 : * WAL record has already applied the page, so bypass the
2520 : * consistency check as that would result in comparing the full
2521 : * page stored in the record with itself.
2522 : */
2523 52246 : continue;
2524 : }
2525 :
2526 : /*
2527 : * Read the contents from the current buffer and store it in a
2528 : * temporary page.
2529 : */
2530 4630380 : buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2531 : RBM_NORMAL_NO_LOG,
2532 : InvalidBuffer);
2533 4630380 : if (!BufferIsValid(buf))
2534 0 : continue;
2535 :
2536 4630380 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2537 4630380 : page = BufferGetPage(buf);
2538 :
2539 : /*
2540 : * Take a copy of the local page where WAL has been applied to have a
2541 : * comparison base before masking it...
2542 : */
2543 4630380 : memcpy(replay_image_masked, page, BLCKSZ);
2544 :
2545 : /* No need for this page anymore now that a copy is in. */
2546 4630380 : UnlockReleaseBuffer(buf);
2547 :
2548 : /*
2549 : * If the block LSN is already ahead of this WAL record, we can't
2550 : * expect contents to match. This can happen if recovery is
2551 : * restarted.
2552 : */
2553 4630380 : if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
2554 0 : continue;
2555 :
2556 : /*
2557 : * Read the contents from the backup copy, stored in WAL record and
2558 : * store it in a temporary page. There is no need to allocate a new
2559 : * page here, a local buffer is fine to hold its contents and a mask
2560 : * can be directly applied on it.
2561 : */
2562 4630380 : if (!RestoreBlockImage(record, block_id, primary_image_masked))
2563 0 : ereport(ERROR,
2564 : (errcode(ERRCODE_INTERNAL_ERROR),
2565 : errmsg_internal("%s", record->errormsg_buf)));
2566 :
2567 : /*
2568 : * If masking function is defined, mask both the primary and replay
2569 : * images
2570 : */
2571 4630380 : if (rmgr.rm_mask != NULL)
2572 : {
2573 4630380 : rmgr.rm_mask(replay_image_masked, blkno);
2574 4630380 : rmgr.rm_mask(primary_image_masked, blkno);
2575 : }
2576 :
2577 : /* Time to compare the primary and replay images. */
2578 4630380 : if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2579 : {
2580 0 : elog(FATAL,
2581 : "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2582 : rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2583 : forknum, blkno);
2584 : }
2585 : }
2586 : }
2587 :
2588 : /*
2589 : * For point-in-time recovery, this function decides whether we want to
2590 : * stop applying the XLOG before the current record.
2591 : *
2592 : * Returns true if we are stopping, false otherwise. If stopping, some
2593 : * information is saved in recoveryStopXid et al for use in annotating the
2594 : * new timeline's history file.
2595 : */
2596 : static bool
2597 5512684 : recoveryStopsBefore(XLogReaderState *record)
2598 : {
2599 5512684 : bool stopsHere = false;
2600 : uint8 xact_info;
2601 : bool isCommit;
2602 5512684 : TimestampTz recordXtime = 0;
2603 : TransactionId recordXid;
2604 :
2605 : /*
2606 : * Ignore recovery target settings when not in archive recovery (meaning
2607 : * we are in crash recovery).
2608 : */
2609 5512684 : if (!ArchiveRecoveryRequested)
2610 487582 : return false;
2611 :
2612 : /* Check if we should stop as soon as reaching consistency */
2613 5025102 : if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2614 : {
2615 0 : ereport(LOG,
2616 : (errmsg("recovery stopping after reaching consistency")));
2617 :
2618 0 : recoveryStopAfter = false;
2619 0 : recoveryStopXid = InvalidTransactionId;
2620 0 : recoveryStopLSN = InvalidXLogRecPtr;
2621 0 : recoveryStopTime = 0;
2622 0 : recoveryStopName[0] = '\0';
2623 0 : return true;
2624 : }
2625 :
2626 : /* Check if target LSN has been reached */
2627 5025102 : if (recoveryTarget == RECOVERY_TARGET_LSN &&
2628 16904 : !recoveryTargetInclusive &&
2629 840 : record->ReadRecPtr >= recoveryTargetLSN)
2630 : {
2631 4 : recoveryStopAfter = false;
2632 4 : recoveryStopXid = InvalidTransactionId;
2633 4 : recoveryStopLSN = record->ReadRecPtr;
2634 4 : recoveryStopTime = 0;
2635 4 : recoveryStopName[0] = '\0';
2636 4 : ereport(LOG,
2637 : errmsg("recovery stopping before WAL location (LSN) \"%X/%08X\"",
2638 : LSN_FORMAT_ARGS(recoveryStopLSN)));
2639 4 : return true;
2640 : }
2641 :
2642 : /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2643 5025098 : if (XLogRecGetRmid(record) != RM_XACT_ID)
2644 4981242 : return false;
2645 :
2646 43856 : xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2647 :
2648 43856 : if (xact_info == XLOG_XACT_COMMIT)
2649 : {
2650 39608 : isCommit = true;
2651 39608 : recordXid = XLogRecGetXid(record);
2652 : }
2653 4248 : else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2654 : {
2655 58 : xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2656 : xl_xact_parsed_commit parsed;
2657 :
2658 58 : isCommit = true;
2659 58 : ParseCommitRecord(XLogRecGetInfo(record),
2660 : xlrec,
2661 : &parsed);
2662 58 : recordXid = parsed.twophase_xid;
2663 : }
2664 4190 : else if (xact_info == XLOG_XACT_ABORT)
2665 : {
2666 3572 : isCommit = false;
2667 3572 : recordXid = XLogRecGetXid(record);
2668 : }
2669 618 : else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2670 : {
2671 26 : xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2672 : xl_xact_parsed_abort parsed;
2673 :
2674 26 : isCommit = false;
2675 26 : ParseAbortRecord(XLogRecGetInfo(record),
2676 : xlrec,
2677 : &parsed);
2678 26 : recordXid = parsed.twophase_xid;
2679 : }
2680 : else
2681 592 : return false;
2682 :
2683 43264 : if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
2684 : {
2685 : /*
2686 : * There can be only one transaction end record with this exact
2687 : * transactionid
2688 : *
2689 : * when testing for an xid, we MUST test for equality only, since
2690 : * transactions are numbered in the order they start, not the order
2691 : * they complete. A higher numbered xid will complete before you about
2692 : * 50% of the time...
2693 : */
2694 0 : stopsHere = (recordXid == recoveryTargetXid);
2695 : }
2696 :
2697 : /*
2698 : * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2699 : * We don't expect getRecordTimestamp ever to fail, since we already know
2700 : * this is a commit or abort record; but test its result anyway.
2701 : */
2702 43264 : if (getRecordTimestamp(record, &recordXtime) &&
2703 43264 : recoveryTarget == RECOVERY_TARGET_TIME)
2704 : {
2705 : /*
2706 : * There can be many transactions that share the same commit time, so
2707 : * we stop after the last one, if we are inclusive, or stop at the
2708 : * first one if we are exclusive
2709 : */
2710 0 : if (recoveryTargetInclusive)
2711 0 : stopsHere = (recordXtime > recoveryTargetTime);
2712 : else
2713 0 : stopsHere = (recordXtime >= recoveryTargetTime);
2714 : }
2715 :
2716 43264 : if (stopsHere)
2717 : {
2718 0 : recoveryStopAfter = false;
2719 0 : recoveryStopXid = recordXid;
2720 0 : recoveryStopTime = recordXtime;
2721 0 : recoveryStopLSN = InvalidXLogRecPtr;
2722 0 : recoveryStopName[0] = '\0';
2723 :
2724 0 : if (isCommit)
2725 : {
2726 0 : ereport(LOG,
2727 : (errmsg("recovery stopping before commit of transaction %u, time %s",
2728 : recoveryStopXid,
2729 : timestamptz_to_str(recoveryStopTime))));
2730 : }
2731 : else
2732 : {
2733 0 : ereport(LOG,
2734 : (errmsg("recovery stopping before abort of transaction %u, time %s",
2735 : recoveryStopXid,
2736 : timestamptz_to_str(recoveryStopTime))));
2737 : }
2738 : }
2739 :
2740 43264 : return stopsHere;
2741 : }
2742 :
2743 : /*
2744 : * Same as recoveryStopsBefore, but called after applying the record.
2745 : *
2746 : * We also track the timestamp of the latest applied COMMIT/ABORT
2747 : * record in XLogRecoveryCtl->recoveryLastXTime.
2748 : */
2749 : static bool
2750 5512676 : recoveryStopsAfter(XLogReaderState *record)
2751 : {
2752 : uint8 info;
2753 : uint8 xact_info;
2754 : uint8 rmid;
2755 5512676 : TimestampTz recordXtime = 0;
2756 :
2757 : /*
2758 : * Ignore recovery target settings when not in archive recovery (meaning
2759 : * we are in crash recovery).
2760 : */
2761 5512676 : if (!ArchiveRecoveryRequested)
2762 487582 : return false;
2763 :
2764 5025094 : info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2765 5025094 : rmid = XLogRecGetRmid(record);
2766 :
2767 : /*
2768 : * There can be many restore points that share the same name; we stop at
2769 : * the first one.
2770 : */
2771 5025094 : if (recoveryTarget == RECOVERY_TARGET_NAME &&
2772 40 : rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2773 : {
2774 : xl_restore_point *recordRestorePointData;
2775 :
2776 6 : recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2777 :
2778 6 : if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2779 : {
2780 4 : recoveryStopAfter = true;
2781 4 : recoveryStopXid = InvalidTransactionId;
2782 4 : recoveryStopLSN = InvalidXLogRecPtr;
2783 4 : (void) getRecordTimestamp(record, &recoveryStopTime);
2784 4 : strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2785 :
2786 4 : ereport(LOG,
2787 : (errmsg("recovery stopping at restore point \"%s\", time %s",
2788 : recoveryStopName,
2789 : timestamptz_to_str(recoveryStopTime))));
2790 4 : return true;
2791 : }
2792 : }
2793 :
2794 : /* Check if the target LSN has been reached */
2795 5025090 : if (recoveryTarget == RECOVERY_TARGET_LSN &&
2796 16064 : recoveryTargetInclusive &&
2797 16064 : record->ReadRecPtr >= recoveryTargetLSN)
2798 : {
2799 6 : recoveryStopAfter = true;
2800 6 : recoveryStopXid = InvalidTransactionId;
2801 6 : recoveryStopLSN = record->ReadRecPtr;
2802 6 : recoveryStopTime = 0;
2803 6 : recoveryStopName[0] = '\0';
2804 6 : ereport(LOG,
2805 : errmsg("recovery stopping after WAL location (LSN) \"%X/%08X\"",
2806 : LSN_FORMAT_ARGS(recoveryStopLSN)));
2807 6 : return true;
2808 : }
2809 :
2810 5025084 : if (rmid != RM_XACT_ID)
2811 4981232 : return false;
2812 :
2813 43852 : xact_info = info & XLOG_XACT_OPMASK;
2814 :
2815 43852 : if (xact_info == XLOG_XACT_COMMIT ||
2816 4190 : xact_info == XLOG_XACT_COMMIT_PREPARED ||
2817 618 : xact_info == XLOG_XACT_ABORT ||
2818 : xact_info == XLOG_XACT_ABORT_PREPARED)
2819 : {
2820 : TransactionId recordXid;
2821 :
2822 : /* Update the last applied transaction timestamp */
2823 43260 : if (getRecordTimestamp(record, &recordXtime))
2824 43260 : SetLatestXTime(recordXtime);
2825 :
2826 : /* Extract the XID of the committed/aborted transaction */
2827 43260 : if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2828 : {
2829 58 : xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2830 : xl_xact_parsed_commit parsed;
2831 :
2832 58 : ParseCommitRecord(XLogRecGetInfo(record),
2833 : xlrec,
2834 : &parsed);
2835 58 : recordXid = parsed.twophase_xid;
2836 : }
2837 43202 : else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2838 : {
2839 26 : xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2840 : xl_xact_parsed_abort parsed;
2841 :
2842 26 : ParseAbortRecord(XLogRecGetInfo(record),
2843 : xlrec,
2844 : &parsed);
2845 26 : recordXid = parsed.twophase_xid;
2846 : }
2847 : else
2848 43176 : recordXid = XLogRecGetXid(record);
2849 :
2850 : /*
2851 : * There can be only one transaction end record with this exact
2852 : * transactionid
2853 : *
2854 : * when testing for an xid, we MUST test for equality only, since
2855 : * transactions are numbered in the order they start, not the order
2856 : * they complete. A higher numbered xid will complete before you about
2857 : * 50% of the time...
2858 : */
2859 43260 : if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
2860 0 : recordXid == recoveryTargetXid)
2861 : {
2862 0 : recoveryStopAfter = true;
2863 0 : recoveryStopXid = recordXid;
2864 0 : recoveryStopTime = recordXtime;
2865 0 : recoveryStopLSN = InvalidXLogRecPtr;
2866 0 : recoveryStopName[0] = '\0';
2867 :
2868 0 : if (xact_info == XLOG_XACT_COMMIT ||
2869 : xact_info == XLOG_XACT_COMMIT_PREPARED)
2870 : {
2871 0 : ereport(LOG,
2872 : (errmsg("recovery stopping after commit of transaction %u, time %s",
2873 : recoveryStopXid,
2874 : timestamptz_to_str(recoveryStopTime))));
2875 : }
2876 0 : else if (xact_info == XLOG_XACT_ABORT ||
2877 : xact_info == XLOG_XACT_ABORT_PREPARED)
2878 : {
2879 0 : ereport(LOG,
2880 : (errmsg("recovery stopping after abort of transaction %u, time %s",
2881 : recoveryStopXid,
2882 : timestamptz_to_str(recoveryStopTime))));
2883 : }
2884 0 : return true;
2885 : }
2886 : }
2887 :
2888 : /* Check if we should stop as soon as reaching consistency */
2889 43852 : if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2890 : {
2891 0 : ereport(LOG,
2892 : (errmsg("recovery stopping after reaching consistency")));
2893 :
2894 0 : recoveryStopAfter = true;
2895 0 : recoveryStopXid = InvalidTransactionId;
2896 0 : recoveryStopTime = 0;
2897 0 : recoveryStopLSN = InvalidXLogRecPtr;
2898 0 : recoveryStopName[0] = '\0';
2899 0 : return true;
2900 : }
2901 :
2902 43852 : return false;
2903 : }
2904 :
2905 : /*
2906 : * Create a comment for the history file to explain why and where
2907 : * timeline changed.
2908 : */
2909 : static char *
2910 1798 : getRecoveryStopReason(void)
2911 : {
2912 : char reason[200];
2913 :
2914 1798 : if (recoveryTarget == RECOVERY_TARGET_XID)
2915 0 : snprintf(reason, sizeof(reason),
2916 : "%s transaction %u",
2917 0 : recoveryStopAfter ? "after" : "before",
2918 : recoveryStopXid);
2919 1798 : else if (recoveryTarget == RECOVERY_TARGET_TIME)
2920 0 : snprintf(reason, sizeof(reason),
2921 : "%s %s\n",
2922 0 : recoveryStopAfter ? "after" : "before",
2923 : timestamptz_to_str(recoveryStopTime));
2924 1798 : else if (recoveryTarget == RECOVERY_TARGET_LSN)
2925 14 : snprintf(reason, sizeof(reason),
2926 : "%s LSN %X/%08X\n",
2927 14 : recoveryStopAfter ? "after" : "before",
2928 14 : LSN_FORMAT_ARGS(recoveryStopLSN));
2929 1784 : else if (recoveryTarget == RECOVERY_TARGET_NAME)
2930 6 : snprintf(reason, sizeof(reason),
2931 : "at restore point \"%s\"",
2932 : recoveryStopName);
2933 1778 : else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
2934 0 : snprintf(reason, sizeof(reason), "reached consistency");
2935 : else
2936 1778 : snprintf(reason, sizeof(reason), "no recovery target specified");
2937 :
2938 1798 : return pstrdup(reason);
2939 : }
2940 :
2941 : /*
2942 : * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2943 : *
2944 : * endOfRecovery is true if the recovery target is reached and
2945 : * the paused state starts at the end of recovery because of
2946 : * recovery_target_action=pause, and false otherwise.
2947 : */
2948 : static void
2949 6 : recoveryPausesHere(bool endOfRecovery)
2950 : {
2951 : /* Don't pause unless users can connect! */
2952 6 : if (!LocalHotStandbyActive)
2953 0 : return;
2954 :
2955 : /* Don't pause after standby promotion has been triggered */
2956 6 : if (LocalPromoteIsTriggered)
2957 0 : return;
2958 :
2959 6 : if (endOfRecovery)
2960 2 : ereport(LOG,
2961 : (errmsg("pausing at the end of recovery"),
2962 : errhint("Execute pg_wal_replay_resume() to promote.")));
2963 : else
2964 4 : ereport(LOG,
2965 : (errmsg("recovery has paused"),
2966 : errhint("Execute pg_wal_replay_resume() to continue.")));
2967 :
2968 : /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2969 18 : while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
2970 : {
2971 16 : ProcessStartupProcInterrupts();
2972 16 : if (CheckForStandbyTrigger())
2973 4 : return;
2974 :
2975 : /*
2976 : * If recovery pause is requested then set it paused. While we are in
2977 : * the loop, user might resume and pause again so set this every time.
2978 : */
2979 12 : ConfirmRecoveryPaused();
2980 :
2981 : /*
2982 : * We wait on a condition variable that will wake us as soon as the
2983 : * pause ends, but we use a timeout so we can check the above exit
2984 : * condition periodically too.
2985 : */
2986 12 : ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
2987 : WAIT_EVENT_RECOVERY_PAUSE);
2988 : }
2989 2 : ConditionVariableCancelSleep();
2990 : }
2991 :
2992 : /*
2993 : * When recovery_min_apply_delay is set, we wait long enough to make sure
2994 : * certain record types are applied at least that interval behind the primary.
2995 : *
2996 : * Returns true if we waited.
2997 : *
2998 : * Note that the delay is calculated between the WAL record log time and
2999 : * the current time on standby. We would prefer to keep track of when this
3000 : * standby received each WAL record, which would allow a more consistent
3001 : * approach and one not affected by time synchronisation issues, but that
3002 : * is significantly more effort and complexity for little actual gain in
3003 : * usability.
3004 : */
3005 : static bool
3006 5512680 : recoveryApplyDelay(XLogReaderState *record)
3007 : {
3008 : uint8 xact_info;
3009 : TimestampTz xtime;
3010 : TimestampTz delayUntil;
3011 : long msecs;
3012 :
3013 : /* nothing to do if no delay configured */
3014 5512680 : if (recovery_min_apply_delay <= 0)
3015 5512564 : return false;
3016 :
3017 : /* no delay is applied on a database not yet consistent */
3018 116 : if (!reachedConsistency)
3019 8 : return false;
3020 :
3021 : /* nothing to do if crash recovery is requested */
3022 108 : if (!ArchiveRecoveryRequested)
3023 0 : return false;
3024 :
3025 : /*
3026 : * Is it a COMMIT record?
3027 : *
3028 : * We deliberately choose not to delay aborts since they have no effect on
3029 : * MVCC. We already allow replay of records that don't have a timestamp,
3030 : * so there is already opportunity for issues caused by early conflicts on
3031 : * standbys.
3032 : */
3033 108 : if (XLogRecGetRmid(record) != RM_XACT_ID)
3034 90 : return false;
3035 :
3036 18 : xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
3037 :
3038 18 : if (xact_info != XLOG_XACT_COMMIT &&
3039 : xact_info != XLOG_XACT_COMMIT_PREPARED)
3040 0 : return false;
3041 :
3042 18 : if (!getRecordTimestamp(record, &xtime))
3043 0 : return false;
3044 :
3045 18 : delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
3046 :
3047 : /*
3048 : * Exit without arming the latch if it's already past time to apply this
3049 : * record
3050 : */
3051 18 : msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil);
3052 18 : if (msecs <= 0)
3053 0 : return false;
3054 :
3055 : while (true)
3056 : {
3057 48 : ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
3058 :
3059 : /* This might change recovery_min_apply_delay. */
3060 48 : ProcessStartupProcInterrupts();
3061 :
3062 48 : if (CheckForStandbyTrigger())
3063 0 : break;
3064 :
3065 : /*
3066 : * Recalculate delayUntil as recovery_min_apply_delay could have
3067 : * changed while waiting in this loop.
3068 : */
3069 48 : delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
3070 :
3071 : /*
3072 : * Wait for difference between GetCurrentTimestamp() and delayUntil.
3073 : */
3074 48 : msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
3075 : delayUntil);
3076 :
3077 48 : if (msecs <= 0)
3078 18 : break;
3079 :
3080 30 : elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
3081 :
3082 30 : (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3083 : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
3084 : msecs,
3085 : WAIT_EVENT_RECOVERY_APPLY_DELAY);
3086 : }
3087 18 : return true;
3088 : }
3089 :
3090 : /*
3091 : * Get the current state of the recovery pause request.
3092 : */
3093 : RecoveryPauseState
3094 28 : GetRecoveryPauseState(void)
3095 : {
3096 : RecoveryPauseState state;
3097 :
3098 28 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3099 28 : state = XLogRecoveryCtl->recoveryPauseState;
3100 28 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3101 :
3102 28 : return state;
3103 : }
3104 :
3105 : /*
3106 : * Set the recovery pause state.
3107 : *
3108 : * If recovery pause is requested then sets the recovery pause state to
3109 : * 'pause requested' if it is not already 'paused'. Otherwise, sets it
3110 : * to 'not paused' to resume the recovery. The recovery pause will be
3111 : * confirmed by the ConfirmRecoveryPaused.
3112 : */
3113 : void
3114 100 : SetRecoveryPause(bool recoveryPause)
3115 : {
3116 100 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3117 :
3118 100 : if (!recoveryPause)
3119 92 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
3120 8 : else if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_NOT_PAUSED)
3121 8 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED;
3122 :
3123 100 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3124 :
3125 100 : if (!recoveryPause)
3126 92 : ConditionVariableBroadcast(&XLogRecoveryCtl->recoveryNotPausedCV);
3127 100 : }
3128 :
3129 : /*
3130 : * Confirm the recovery pause by setting the recovery pause state to
3131 : * RECOVERY_PAUSED.
3132 : */
3133 : static void
3134 12 : ConfirmRecoveryPaused(void)
3135 : {
3136 : /* If recovery pause is requested then set it paused */
3137 12 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3138 12 : if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED)
3139 6 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSED;
3140 12 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3141 12 : }
3142 :
3143 :
3144 : /*
3145 : * Attempt to read the next XLOG record.
3146 : *
3147 : * Before first call, the reader needs to be positioned to the first record
3148 : * by calling XLogPrefetcherBeginRead().
3149 : *
3150 : * If no valid record is available, returns NULL, or fails if emode is PANIC.
3151 : * (emode must be either PANIC, LOG). In standby mode, retries until a valid
3152 : * record is available.
3153 : */
3154 : static XLogRecord *
3155 5516956 : ReadRecord(XLogPrefetcher *xlogprefetcher, int emode,
3156 : bool fetching_ckpt, TimeLineID replayTLI)
3157 : {
3158 : XLogRecord *record;
3159 5516956 : XLogReaderState *xlogreader = XLogPrefetcherGetReader(xlogprefetcher);
3160 5516956 : XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
3161 :
3162 : Assert(AmStartupProcess() || !IsUnderPostmaster);
3163 :
3164 : /* Pass through parameters to XLogPageRead */
3165 5516956 : private->fetching_ckpt = fetching_ckpt;
3166 5516956 : private->emode = emode;
3167 5516956 : private->randAccess = !XLogRecPtrIsValid(xlogreader->ReadRecPtr);
3168 5516956 : private->replayTLI = replayTLI;
3169 :
3170 : /* This is the first attempt to read this page. */
3171 5516956 : lastSourceFailed = false;
3172 :
3173 : for (;;)
3174 254 : {
3175 : char *errormsg;
3176 :
3177 5517210 : record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3178 5517100 : if (record == NULL)
3179 : {
3180 : /*
3181 : * When we find that WAL ends in an incomplete record, keep track
3182 : * of that record. After recovery is done, we'll write a record
3183 : * to indicate to downstream WAL readers that that portion is to
3184 : * be ignored.
3185 : *
3186 : * However, when ArchiveRecoveryRequested = true, we're going to
3187 : * switch to a new timeline at the end of recovery. We will only
3188 : * copy WAL over to the new timeline up to the end of the last
3189 : * complete record, so if we did this, we would later create an
3190 : * overwrite contrecord in the wrong place, breaking everything.
3191 : */
3192 556 : if (!ArchiveRecoveryRequested &&
3193 212 : XLogRecPtrIsValid(xlogreader->abortedRecPtr))
3194 : {
3195 22 : abortedRecPtr = xlogreader->abortedRecPtr;
3196 22 : missingContrecPtr = xlogreader->missingContrecPtr;
3197 : }
3198 :
3199 556 : if (readFile >= 0)
3200 : {
3201 514 : close(readFile);
3202 514 : readFile = -1;
3203 : }
3204 :
3205 : /*
3206 : * We only end up here without a message when XLogPageRead()
3207 : * failed - in that case we already logged something. In
3208 : * StandbyMode that only happens if we have been triggered, so we
3209 : * shouldn't loop anymore in that case.
3210 : */
3211 556 : if (errormsg)
3212 514 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3213 : (errmsg_internal("%s", errormsg) /* already translated */ ));
3214 : }
3215 :
3216 : /*
3217 : * Check page TLI is one of the expected values.
3218 : */
3219 5516544 : else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
3220 : {
3221 : char fname[MAXFNAMELEN];
3222 : XLogSegNo segno;
3223 : int32 offset;
3224 :
3225 0 : XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
3226 0 : offset = XLogSegmentOffset(xlogreader->latestPagePtr,
3227 : wal_segment_size);
3228 0 : XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3229 : wal_segment_size);
3230 0 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3231 : errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%08X, offset %u",
3232 : xlogreader->latestPageTLI,
3233 : fname,
3234 : LSN_FORMAT_ARGS(xlogreader->latestPagePtr),
3235 : offset));
3236 0 : record = NULL;
3237 : }
3238 :
3239 5517100 : if (record)
3240 : {
3241 : /* Great, got a record */
3242 5516846 : return record;
3243 : }
3244 : else
3245 : {
3246 : /* No valid record available from this source */
3247 556 : lastSourceFailed = true;
3248 :
3249 : /*
3250 : * If archive recovery was requested, but we were still doing
3251 : * crash recovery, switch to archive recovery and retry using the
3252 : * offline archive. We have now replayed all the valid WAL in
3253 : * pg_wal, so we are presumably now consistent.
3254 : *
3255 : * We require that there's at least some valid WAL present in
3256 : * pg_wal, however (!fetching_ckpt). We could recover using the
3257 : * WAL from the archive, even if pg_wal is completely empty, but
3258 : * we'd have no idea how far we'd have to replay to reach
3259 : * consistency. So err on the safe side and give up.
3260 : */
3261 556 : if (!InArchiveRecovery && ArchiveRecoveryRequested &&
3262 4 : !fetching_ckpt)
3263 : {
3264 4 : ereport(DEBUG1,
3265 : (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3266 4 : InArchiveRecovery = true;
3267 4 : if (StandbyModeRequested)
3268 4 : EnableStandbyMode();
3269 :
3270 4 : SwitchIntoArchiveRecovery(xlogreader->EndRecPtr, replayTLI);
3271 4 : minRecoveryPoint = xlogreader->EndRecPtr;
3272 4 : minRecoveryPointTLI = replayTLI;
3273 :
3274 4 : CheckRecoveryConsistency();
3275 :
3276 : /*
3277 : * Before we retry, reset lastSourceFailed and currentSource
3278 : * so that we will check the archive next.
3279 : */
3280 4 : lastSourceFailed = false;
3281 4 : currentSource = XLOG_FROM_ANY;
3282 :
3283 254 : continue;
3284 : }
3285 :
3286 : /* In standby mode, loop back to retry. Otherwise, give up. */
3287 552 : if (StandbyMode && !CheckForStandbyTrigger())
3288 250 : continue;
3289 : else
3290 302 : return NULL;
3291 : }
3292 : }
3293 : }
3294 :
3295 : /*
3296 : * Read the XLOG page containing targetPagePtr into readBuf (if not read
3297 : * already). Returns number of bytes read, if the page is read successfully,
3298 : * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3299 : * but only if they have not been previously reported.
3300 : *
3301 : * See XLogReaderRoutine.page_read for more details.
3302 : *
3303 : * While prefetching, xlogreader->nonblocking may be set. In that case,
3304 : * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3305 : *
3306 : * This is responsible for restoring files from archive as needed, as well
3307 : * as for waiting for the requested WAL record to arrive in standby mode.
3308 : *
3309 : * xlogreader->private_data->emode specifies the log level used for reporting
3310 : * "file not found" or "end of WAL" situations in archive recovery, or in
3311 : * standby mode when promotion is triggered. If set to WARNING or below,
3312 : * XLogPageRead() returns XLREAD_FAIL in those situations, on higher log
3313 : * levels the ereport() won't return.
3314 : *
3315 : * In standby mode, if after a successful return of XLogPageRead() the
3316 : * caller finds the record it's interested in to be broken, it should
3317 : * ereport the error with the level determined by
3318 : * emode_for_corrupt_record(), and then set lastSourceFailed
3319 : * and call XLogPageRead() again with the same arguments. This lets
3320 : * XLogPageRead() to try fetching the record from another source, or to
3321 : * sleep and retry.
3322 : */
3323 : static int
3324 2851940 : XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
3325 : XLogRecPtr targetRecPtr, char *readBuf)
3326 : {
3327 2851940 : XLogPageReadPrivate *private =
3328 : (XLogPageReadPrivate *) xlogreader->private_data;
3329 2851940 : int emode = private->emode;
3330 : uint32 targetPageOff;
3331 : XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
3332 : int r;
3333 : instr_time io_start;
3334 :
3335 : Assert(AmStartupProcess() || !IsUnderPostmaster);
3336 :
3337 2851940 : XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3338 2851940 : targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3339 :
3340 : /*
3341 : * See if we need to switch to a new segment because the requested record
3342 : * is not in the currently open one.
3343 : */
3344 2851940 : if (readFile >= 0 &&
3345 2848606 : !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3346 : {
3347 : /*
3348 : * Request a restartpoint if we've replayed too much xlog since the
3349 : * last one.
3350 : */
3351 2904 : if (ArchiveRecoveryRequested && IsUnderPostmaster)
3352 : {
3353 2874 : if (XLogCheckpointNeeded(readSegNo))
3354 : {
3355 2660 : (void) GetRedoRecPtr();
3356 2660 : if (XLogCheckpointNeeded(readSegNo))
3357 2648 : RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
3358 : }
3359 : }
3360 :
3361 2904 : close(readFile);
3362 2904 : readFile = -1;
3363 2904 : readSource = XLOG_FROM_ANY;
3364 : }
3365 :
3366 2851940 : XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3367 :
3368 2851952 : retry:
3369 : /* See if we need to retrieve more data */
3370 2851952 : if (readFile < 0 ||
3371 2845702 : (readSource == XLOG_FROM_STREAM &&
3372 2821760 : flushedUpto < targetPagePtr + reqLen))
3373 : {
3374 27778 : if (readFile >= 0 &&
3375 21528 : xlogreader->nonblocking &&
3376 10542 : readSource == XLOG_FROM_STREAM &&
3377 10542 : flushedUpto < targetPagePtr + reqLen)
3378 10542 : return XLREAD_WOULDBLOCK;
3379 :
3380 17126 : switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3381 17236 : private->randAccess,
3382 17236 : private->fetching_ckpt,
3383 : targetRecPtr,
3384 : private->replayTLI,
3385 : xlogreader->EndRecPtr,
3386 17236 : xlogreader->nonblocking))
3387 : {
3388 826 : case XLREAD_WOULDBLOCK:
3389 826 : return XLREAD_WOULDBLOCK;
3390 80 : case XLREAD_FAIL:
3391 80 : if (readFile >= 0)
3392 0 : close(readFile);
3393 80 : readFile = -1;
3394 80 : readLen = 0;
3395 80 : readSource = XLOG_FROM_ANY;
3396 80 : return XLREAD_FAIL;
3397 16220 : case XLREAD_SUCCESS:
3398 16220 : break;
3399 : }
3400 : }
3401 :
3402 : /*
3403 : * At this point, we have the right segment open and if we're streaming we
3404 : * know the requested record is in it.
3405 : */
3406 : Assert(readFile != -1);
3407 :
3408 : /*
3409 : * If the current segment is being streamed from the primary, calculate
3410 : * how much of the current page we have received already. We know the
3411 : * requested record has been received, but this is for the benefit of
3412 : * future calls, to allow quick exit at the top of this function.
3413 : */
3414 2840394 : if (readSource == XLOG_FROM_STREAM)
3415 : {
3416 2813282 : if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3417 2805742 : readLen = XLOG_BLCKSZ;
3418 : else
3419 7540 : readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
3420 : targetPageOff;
3421 : }
3422 : else
3423 27112 : readLen = XLOG_BLCKSZ;
3424 :
3425 : /* Read the requested page */
3426 2840394 : readOff = targetPageOff;
3427 :
3428 : /* Measure I/O timing when reading segment */
3429 2840394 : io_start = pgstat_prepare_io_time(track_wal_io_timing);
3430 :
3431 2840394 : pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
3432 2840394 : r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (pgoff_t) readOff);
3433 2840394 : if (r != XLOG_BLCKSZ)
3434 : {
3435 : char fname[MAXFNAMELEN];
3436 0 : int save_errno = errno;
3437 :
3438 0 : pgstat_report_wait_end();
3439 :
3440 0 : pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
3441 : io_start, 1, r);
3442 :
3443 0 : XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
3444 0 : if (r < 0)
3445 : {
3446 0 : errno = save_errno;
3447 0 : ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3448 : (errcode_for_file_access(),
3449 : errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: %m",
3450 : fname, LSN_FORMAT_ARGS(targetPagePtr),
3451 : readOff)));
3452 : }
3453 : else
3454 0 : ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3455 : (errcode(ERRCODE_DATA_CORRUPTED),
3456 : errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: read %d of %zu",
3457 : fname, LSN_FORMAT_ARGS(targetPagePtr),
3458 : readOff, r, (Size) XLOG_BLCKSZ)));
3459 0 : goto next_record_is_invalid;
3460 : }
3461 2840394 : pgstat_report_wait_end();
3462 :
3463 2840394 : pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
3464 : io_start, 1, r);
3465 :
3466 : Assert(targetSegNo == readSegNo);
3467 : Assert(targetPageOff == readOff);
3468 : Assert(reqLen <= readLen);
3469 :
3470 2840394 : xlogreader->seg.ws_tli = curFileTLI;
3471 :
3472 : /*
3473 : * Check the page header immediately, so that we can retry immediately if
3474 : * it's not valid. This may seem unnecessary, because ReadPageInternal()
3475 : * validates the page header anyway, and would propagate the failure up to
3476 : * ReadRecord(), which would retry. However, there's a corner case with
3477 : * continuation records, if a record is split across two pages such that
3478 : * we would need to read the two pages from different sources across two
3479 : * WAL segments.
3480 : *
3481 : * The first page is only available locally, in pg_wal, because it's
3482 : * already been recycled on the primary. The second page, however, is not
3483 : * present in pg_wal, and we should stream it from the primary. There is a
3484 : * recycled WAL segment present in pg_wal, with garbage contents, however.
3485 : * We would read the first page from the local WAL segment, but when
3486 : * reading the second page, we would read the bogus, recycled, WAL
3487 : * segment. If we didn't catch that case here, we would never recover,
3488 : * because ReadRecord() would retry reading the whole record from the
3489 : * beginning.
3490 : *
3491 : * Of course, this only catches errors in the page header, which is what
3492 : * happens in the case of a recycled WAL segment. Other kinds of errors or
3493 : * corruption still has the same problem. But this at least fixes the
3494 : * common case, which can happen as part of normal operation.
3495 : *
3496 : * Validating the page header is cheap enough that doing it twice
3497 : * shouldn't be a big deal from a performance point of view.
3498 : *
3499 : * When not in standby mode, an invalid page header should cause recovery
3500 : * to end, not retry reading the page, so we don't need to validate the
3501 : * page header here for the retry. Instead, ReadPageInternal() is
3502 : * responsible for the validation.
3503 : */
3504 2840394 : if (StandbyMode &&
3505 2820646 : (targetPagePtr % wal_segment_size) == 0 &&
3506 2742 : !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3507 : {
3508 : /*
3509 : * Emit this error right now then retry this page immediately. Use
3510 : * errmsg_internal() because the message was already translated.
3511 : */
3512 14 : if (xlogreader->errormsg_buf[0])
3513 14 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3514 : (errmsg_internal("%s", xlogreader->errormsg_buf)));
3515 :
3516 : /* reset any error XLogReaderValidatePageHeader() might have set */
3517 14 : XLogReaderResetError(xlogreader);
3518 14 : goto next_record_is_invalid;
3519 : }
3520 :
3521 2840380 : return readLen;
3522 :
3523 14 : next_record_is_invalid:
3524 :
3525 : /*
3526 : * If we're reading ahead, give up fast. Retries and error reporting will
3527 : * be handled by a later read when recovery catches up to this point.
3528 : */
3529 14 : if (xlogreader->nonblocking)
3530 2 : return XLREAD_WOULDBLOCK;
3531 :
3532 12 : lastSourceFailed = true;
3533 :
3534 12 : if (readFile >= 0)
3535 12 : close(readFile);
3536 12 : readFile = -1;
3537 12 : readLen = 0;
3538 12 : readSource = XLOG_FROM_ANY;
3539 :
3540 : /* In standby-mode, keep trying */
3541 12 : if (StandbyMode)
3542 12 : goto retry;
3543 : else
3544 0 : return XLREAD_FAIL;
3545 : }
3546 :
3547 : /*
3548 : * Open the WAL segment containing WAL location 'RecPtr'.
3549 : *
3550 : * The segment can be fetched via restore_command, or via walreceiver having
3551 : * streamed the record, or it can already be present in pg_wal. Checking
3552 : * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3553 : * too, in case someone copies a new segment directly to pg_wal. That is not
3554 : * documented or recommended, though.
3555 : *
3556 : * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3557 : * prepare to read WAL starting from RedoStartLSN after this. If 'randAccess' is true, curFileTLI is reset to 0 before searching the archive or pg_wal.
3558 : *
3559 : * 'RecPtr' might not point to the beginning of the record we're interested
3560 : * in, it might also point to the page or segment header. In that case,
3561 : * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3562 : * used to decide which timeline to stream the requested WAL from.
3563 : *
3564 : * 'replayLSN' is the current replay LSN, so that if we scan for new
3565 : * timelines, we can reject a switch to a timeline that branched off before
3566 : * this point.
3567 : *
3568 : * If the record is not immediately available, the function returns XLREAD_FAIL
3569 : * if we're not in standby mode. In standby mode, it waits for the record to become
3570 : * available.
3571 : *
3572 : * When the requested record becomes available, the function opens the file
3573 : * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3574 : * of standby mode is triggered by the user, and there is no more WAL
3575 : * available, returns XLREAD_FAIL.
3576 : *
3577 : * If nonblocking is true, then give up immediately if we can't satisfy the
3578 : * request, returning XLREAD_WOULDBLOCK instead of waiting.
3579 : */
3580 : static XLogPageReadResult
3581 17236 : WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
3582 : bool fetching_ckpt, XLogRecPtr tliRecPtr,
3583 : TimeLineID replayTLI, XLogRecPtr replayLSN,
3584 : bool nonblocking)
3585 : {
3586 : static TimestampTz last_fail_time = 0; /* time of the previous pass through the streaming-failure branch; static, so it survives across calls */
3587 : TimestampTz now;
3588 17236 : bool streaming_reply_sent = false; /* limits WalRcvForceReply() to at most once per call */
3589 :
3590 : /*-------
3591 : * Standby mode is implemented by a state machine:
3592 : *
3593 : * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3594 : * pg_wal (XLOG_FROM_PG_WAL)
3595 : * 2. Check for promotion trigger request
3596 : * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3597 : * 4. Rescan timelines
3598 : * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3599 : *
3600 : * Failure to read from the current source advances the state machine to
3601 : * the next state.
3602 : *
3603 : * 'currentSource' indicates the current state. There are no currentSource
3604 : * values for "check trigger", "rescan timelines", and "sleep" states,
3605 : * those actions are taken when reading from the previous source fails, as
3606 : * part of advancing to the next state.
3607 : *
3608 : * If standby mode is turned off while reading WAL from stream, we move
3609 : * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3610 : * the files (which would be required at end of recovery, e.g., timeline
3611 : * history file) from archive or pg_wal. We don't need to kill WAL receiver
3612 : * here because it's already stopped when standby mode is turned off at
3613 : * the end of recovery.
3614 : *-------
3615 : */
3616 17236 : if (!InArchiveRecovery)
3617 1918 : currentSource = XLOG_FROM_PG_WAL;
3618 15318 : else if (currentSource == XLOG_FROM_ANY ||
3619 15086 : (!StandbyMode && currentSource == XLOG_FROM_STREAM))
3620 : {
3621 232 : lastSourceFailed = false;
3622 232 : currentSource = XLOG_FROM_ARCHIVE;
3623 : }
3624 :
3625 : for (;;)
3626 14752 : {
3627 31988 : XLogSource oldSource = currentSource; /* kept only for the DEBUG2 source-switch message below */
3628 31988 : bool startWalReceiver = false; /* set when this iteration should (re)launch the walreceiver */
3629 :
3630 : /*
3631 : * First check if we failed to read from the current source, and
3632 : * advance the state machine if so. The failure to read might've
3633 : * happened outside this function, e.g. when a CRC check fails on a
3634 : * record, or within this loop.
3635 : */
3636 31988 : if (lastSourceFailed)
3637 : {
3638 : /*
3639 : * Don't allow any retry loops to occur during nonblocking
3640 : * readahead. Let the caller process everything that has been
3641 : * decoded already first.
3642 : */
3643 960 : if (nonblocking)
3644 144 : return XLREAD_WOULDBLOCK;
3645 :
3646 816 : switch (currentSource)
3647 : {
3648 492 : case XLOG_FROM_ARCHIVE:
3649 : case XLOG_FROM_PG_WAL:
3650 :
3651 : /*
3652 : * Check to see if promotion is requested. Note that we do
3653 : * this only after failure, so when you promote, we still
3654 : * finish replaying as much as we can from archive and
3655 : * pg_wal before failover.
3656 : */
3657 492 : if (StandbyMode && CheckForStandbyTrigger())
3658 : {
3659 40 : XLogShutdownWalRcv();
3660 40 : return XLREAD_FAIL;
3661 : }
3662 :
3663 : /*
3664 : * Not in standby mode, and we've now tried the archive
3665 : * and pg_wal.
3666 : */
3667 452 : if (!StandbyMode)
3668 40 : return XLREAD_FAIL;
3669 :
3670 : /*
3671 : * Move to XLOG_FROM_STREAM state, and set to start a
3672 : * walreceiver if necessary.
3673 : */
3674 412 : currentSource = XLOG_FROM_STREAM;
3675 412 : startWalReceiver = true;
3676 412 : break;
3677 :
3678 324 : case XLOG_FROM_STREAM:
3679 :
3680 : /*
3681 : * Failure while streaming. Most likely, we got here
3682 : * because streaming replication was terminated, or
3683 : * promotion was triggered. But we also get here if we
3684 : * find an invalid record in the WAL streamed from the
3685 : * primary, in which case something is seriously wrong.
3686 : * There's little chance that the problem will just go
3687 : * away, but PANIC is not good for availability either,
3688 : * especially in hot standby mode. So, we treat that the
3689 : * same as disconnection, and retry from archive/pg_wal
3690 : * again. The WAL in the archive should be identical to
3691 : * what was streamed, so it's unlikely that it helps, but
3692 : * one can hope...
3693 : */
3694 :
3695 : /*
3696 : * We should be able to move to XLOG_FROM_STREAM only in
3697 : * standby mode.
3698 : */
3699 : Assert(StandbyMode);
3700 :
3701 : /*
3702 : * Before we leave XLOG_FROM_STREAM state, make sure that
3703 : * walreceiver is not active, so that it won't overwrite
3704 : * WAL that we restore from archive.
3705 : *
3706 : * If walreceiver is actively streaming (or attempting to
3707 : * connect), we must shut it down. However, if it's
3708 : * already in WAITING state (e.g., due to timeline
3709 : * divergence), we only need to reset the install flag to
3710 : * allow archive restoration.
3711 : */
3712 324 : if (WalRcvStreaming())
3713 64 : XLogShutdownWalRcv();
3714 : else
3715 : {
3716 : /*
3717 : * WALRCV_STOPPING state is a transient state while
3718 : * the startup process is in ShutdownWalRcv(). It
3719 : * should never appear here since we would be waiting
3720 : * for the walreceiver to reach WALRCV_STOPPED in that
3721 : * case.
3722 : */
3723 : Assert(WalRcvGetState() != WALRCV_STOPPING);
3724 260 : ResetInstallXLogFileSegmentActive();
3725 : }
3726 :
3727 : /*
3728 : * Before we sleep, re-scan for possible new timelines if
3729 : * we were requested to recover to the latest timeline.
3730 : */
3731 324 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
3732 : {
3733 324 : if (rescanLatestTimeLine(replayTLI, replayLSN))
3734 : {
3735 12 : currentSource = XLOG_FROM_ARCHIVE;
3736 12 : break; /* new target timeline: retry from the archive without sleeping or touching last_fail_time */
3737 : }
3738 : }
3739 :
3740 : /*
3741 : * XLOG_FROM_STREAM is the last state in our state
3742 : * machine, so we've exhausted all the options for
3743 : * obtaining the requested WAL. We're going to loop back
3744 : * and retry from the archive, but if it hasn't been long
3745 : * since last attempt, sleep wal_retrieve_retry_interval
3746 : * milliseconds to avoid busy-waiting.
3747 : */
3748 312 : now = GetCurrentTimestamp();
3749 312 : if (!TimestampDifferenceExceeds(last_fail_time, now,
3750 : wal_retrieve_retry_interval))
3751 : {
3752 : long wait_time;
3753 :
3754 332 : wait_time = wal_retrieve_retry_interval - /* sleep only for the remainder of the retry interval */
3755 166 : TimestampDifferenceMilliseconds(last_fail_time, now);
3756 :
3757 166 : elog(LOG, "waiting for WAL to become available at %X/%08X",
3758 : LSN_FORMAT_ARGS(RecPtr));
3759 :
3760 : /* Do background tasks that might benefit us later. */
3761 166 : KnownAssignedTransactionIdsIdleMaintenance();
3762 :
3763 166 : (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3764 : WL_LATCH_SET | WL_TIMEOUT |
3765 : WL_EXIT_ON_PM_DEATH,
3766 : wait_time,
3767 : WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
3768 166 : ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
3769 166 : now = GetCurrentTimestamp(); /* re-read the clock so last_fail_time reflects the time after the sleep */
3770 :
3771 : /* Handle interrupt signals of startup process */
3772 166 : ProcessStartupProcInterrupts();
3773 : }
3774 286 : last_fail_time = now;
3775 286 : currentSource = XLOG_FROM_ARCHIVE;
3776 286 : break;
3777 :
3778 0 : default:
3779 0 : elog(ERROR, "unexpected WAL source %d", currentSource);
3780 : }
3781 : }
3782 31028 : else if (currentSource == XLOG_FROM_PG_WAL)
3783 : {
3784 : /*
3785 : * We just successfully read a file in pg_wal. We prefer files in
3786 : * the archive over ones in pg_wal, so try the next file again
3787 : * from the archive first.
3788 : */
3789 1914 : if (InArchiveRecovery)
3790 0 : currentSource = XLOG_FROM_ARCHIVE;
3791 : }
3792 :
3793 31738 : if (currentSource != oldSource)
3794 710 : elog(DEBUG2, "switched WAL source from %s to %s after %s",
3795 : xlogSourceNames[oldSource], xlogSourceNames[currentSource],
3796 : lastSourceFailed ? "failure" : "success");
3797 :
3798 : /*
3799 : * We've now handled possible failure. Try to read from the chosen
3800 : * source.
3801 : */
3802 31738 : lastSourceFailed = false;
3803 :
3804 31738 : switch (currentSource)
3805 : {
3806 3506 : case XLOG_FROM_ARCHIVE:
3807 : case XLOG_FROM_PG_WAL:
3808 :
3809 : /*
3810 : * WAL receiver must not be running when reading WAL from
3811 : * archive or pg_wal.
3812 : */
3813 : Assert(!WalRcvStreaming());
3814 :
3815 : /* Close any old file we might have open. */
3816 3506 : if (readFile >= 0)
3817 : {
3818 164 : close(readFile);
3819 164 : readFile = -1;
3820 : }
3821 : /* Reset curFileTLI if random fetch. */
3822 3506 : if (randAccess)
3823 2222 : curFileTLI = 0;
3824 :
3825 : /*
3826 : * Try to restore the file from archive, or read an existing
3827 : * file from pg_wal.
3828 : */
3829 3506 : readFile = XLogFileReadAnyTLI(readSegNo,
3830 3506 : currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY : /* the ARCHIVE state is passed down as XLOG_FROM_ANY */
3831 : currentSource);
3832 3506 : if (readFile >= 0)
3833 3170 : return XLREAD_SUCCESS; /* success! */
3834 :
3835 : /*
3836 : * Nope, not found in archive or pg_wal.
3837 : */
3838 336 : lastSourceFailed = true;
3839 336 : break;
3840 :
3841 28232 : case XLOG_FROM_STREAM:
3842 : {
3843 : bool havedata;
3844 :
3845 : /*
3846 : * We should be able to move to XLOG_FROM_STREAM only in
3847 : * standby mode.
3848 : */
3849 : Assert(StandbyMode);
3850 :
3851 : /*
3852 : * First, shutdown walreceiver if its restart has been
3853 : * requested -- but no point if we're already slated for
3854 : * starting it.
3855 : */
3856 28232 : if (pendingWalRcvRestart && !startWalReceiver)
3857 : {
3858 8 : XLogShutdownWalRcv();
3859 :
3860 : /*
3861 : * Re-scan for possible new timelines if we were
3862 : * requested to recover to the latest timeline.
3863 : */
3864 8 : if (recoveryTargetTimeLineGoal ==
3865 : RECOVERY_TARGET_TIMELINE_LATEST)
3866 8 : rescanLatestTimeLine(replayTLI, replayLSN);
3867 :
3868 8 : startWalReceiver = true;
3869 : }
3870 28232 : pendingWalRcvRestart = false;
3871 :
3872 : /*
3873 : * Launch walreceiver if needed.
3874 : *
3875 : * If fetching_ckpt is true, RecPtr points to the initial
3876 : * checkpoint location. In that case, we use RedoStartLSN
3877 : * as the streaming start position instead of RecPtr, so
3878 : * that when we later jump backwards to start redo at
3879 : * RedoStartLSN, we will have the logs streamed already.
3880 : */
3881 28232 : if (startWalReceiver &&
3882 420 : PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
3883 : {
3884 : XLogRecPtr ptr;
3885 : TimeLineID tli;
3886 :
3887 370 : if (fetching_ckpt)
3888 : {
3889 0 : ptr = RedoStartLSN;
3890 0 : tli = RedoStartTLI;
3891 : }
3892 : else
3893 : {
3894 370 : ptr = RecPtr;
3895 :
3896 : /*
3897 : * Use the record begin position to determine the
3898 : * TLI, rather than the position we're reading.
3899 : */
3900 370 : tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
3901 :
3902 370 : if (curFileTLI > 0 && tli < curFileTLI)
3903 0 : elog(ERROR, "according to history file, WAL location %X/%08X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3904 : LSN_FORMAT_ARGS(tliRecPtr),
3905 : tli, curFileTLI);
3906 : }
3907 370 : curFileTLI = tli;
3908 370 : SetInstallXLogFileSegmentActive();
3909 370 : RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
3910 : PrimarySlotName,
3911 : wal_receiver_create_temp_slot);
3912 370 : flushedUpto = 0; /* forget the cached flush position; it is re-read from the walreceiver below */
3913 : }
3914 :
3915 : /*
3916 : * Check if WAL receiver is active or wait to start up.
3917 : */
3918 28232 : if (!WalRcvStreaming())
3919 : {
3920 260 : lastSourceFailed = true;
3921 260 : break;
3922 : }
3923 :
3924 : /*
3925 : * Walreceiver is active, so see if new data has arrived.
3926 : *
3927 : * We only advance XLogReceiptTime when we obtain fresh
3928 : * WAL from walreceiver and observe that we had already
3929 : * processed everything before the most recent "chunk"
3930 : * that it flushed to disk. In steady state where we are
3931 : * keeping up with the incoming data, XLogReceiptTime will
3932 : * be updated on each cycle. When we are behind,
3933 : * XLogReceiptTime will not advance, so the grace time
3934 : * allotted to conflicting queries will decrease.
3935 : */
3936 27972 : if (RecPtr < flushedUpto)
3937 3446 : havedata = true;
3938 : else
3939 : {
3940 : XLogRecPtr latestChunkStart;
3941 :
3942 24526 : flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
3943 24526 : if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
3944 : {
3945 11892 : havedata = true;
3946 11892 : if (latestChunkStart <= RecPtr)
3947 : {
3948 8760 : XLogReceiptTime = GetCurrentTimestamp();
3949 8760 : SetCurrentChunkStartTime(XLogReceiptTime);
3950 : }
3951 : }
3952 : else
3953 12634 : havedata = false;
3954 : }
3955 27972 : if (havedata)
3956 : {
3957 : /*
3958 : * Great, streamed far enough. Open the file if it's
3959 : * not open already. Also read the timeline history
3960 : * file if we haven't initialized timeline history
3961 : * yet; it should be streamed over and present in
3962 : * pg_wal by now. Use XLOG_FROM_STREAM so that source
3963 : * info is set correctly and XLogReceiptTime isn't
3964 : * changed.
3965 : *
3966 : * NB: We must set readTimeLineHistory based on
3967 : * recoveryTargetTLI, not receiveTLI. Normally they'll
3968 : * be the same, but if recovery_target_timeline is
3969 : * 'latest' and archiving is configured, then it's
3970 : * possible that we managed to retrieve one or more
3971 : * new timeline history files from the archive,
3972 : * updating recoveryTargetTLI.
3973 : */
3974 15338 : if (readFile < 0)
3975 : {
3976 2288 : if (!expectedTLEs)
3977 0 : expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
3978 2288 : readFile = XLogFileRead(readSegNo, receiveTLI,
3979 : XLOG_FROM_STREAM, false);
3980 : Assert(readFile >= 0);
3981 : }
3982 : else
3983 : {
3984 : /* just make sure source info is correct... */
3985 13050 : readSource = XLOG_FROM_STREAM;
3986 13050 : XLogReceiptSource = XLOG_FROM_STREAM;
3987 13050 : return XLREAD_SUCCESS;
3988 : }
3989 2288 : break; /* file is open now; go around the loop again */
3990 : }
3991 :
3992 : /* In nonblocking mode, return rather than sleeping. */
3993 12634 : if (nonblocking)
3994 682 : return XLREAD_WOULDBLOCK;
3995 :
3996 : /*
3997 : * Data not here yet. Check for trigger, then wait for
3998 : * walreceiver to wake us up when new WAL arrives.
3999 : */
4000 11952 : if (CheckForStandbyTrigger())
4001 : {
4002 : /*
4003 : * Note that we don't return XLREAD_FAIL immediately
4004 : * here. After being triggered, we still want to
4005 : * replay all the WAL that was already streamed. It's
4006 : * in pg_wal now, so we just treat this as a failure,
4007 : * and the state machine will move on to replay the
4008 : * streamed WAL from pg_wal, and then recheck the
4009 : * trigger and exit replay.
4010 : */
4011 64 : lastSourceFailed = true;
4012 64 : break;
4013 : }
4014 :
4015 : /*
4016 : * Since we have replayed everything we have received so
4017 : * far and are about to start waiting for more WAL, let's
4018 : * tell the upstream server our replay location now so
4019 : * that pg_stat_replication doesn't show stale
4020 : * information.
4021 : */
4022 11888 : if (!streaming_reply_sent)
4023 : {
4024 9322 : WalRcvForceReply();
4025 9322 : streaming_reply_sent = true;
4026 : }
4027 :
4028 : /* Do any background tasks that might benefit us later. */
4029 11888 : KnownAssignedTransactionIdsIdleMaintenance();
4030 :
4031 : /* Update pg_stat_recovery_prefetch before sleeping. */
4032 11888 : XLogPrefetcherComputeStats(xlogprefetcher);
4033 :
4034 : /*
4035 : * Wait for more WAL to arrive, when we will be woken
4036 : * immediately by the WAL receiver.
4037 : */
4038 11888 : (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
4039 : WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
4040 : -1L, /* no timeout: WL_TIMEOUT is not among the flags */
4041 : WAIT_EVENT_RECOVERY_WAL_STREAM);
4042 11888 : ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
4043 11888 : break;
4044 : }
4045 :
4046 0 : default:
4047 0 : elog(ERROR, "unexpected WAL source %d", currentSource);
4048 : }
4049 :
4050 : /*
4051 : * Check for recovery pause here so that we can confirm more quickly
4052 : * that a requested pause has actually taken effect.
4053 : */
4054 14836 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
4055 : RECOVERY_NOT_PAUSED)
4056 4 : recoveryPausesHere(false);
4057 :
4058 : /*
4059 : * This possibly-long loop needs to handle interrupts of startup
4060 : * process.
4061 : */
4062 14836 : ProcessStartupProcInterrupts();
4063 : }
4064 :
4065 : return XLREAD_FAIL; /* not reached */
4066 : }
4067 :
4068 :
4069 : /*
4070 : * Determine what log level should be used to report a corrupt WAL record
4071 : * in the current WAL page, previously read by XLogPageRead().
4072 : *
4073 : * 'emode' is the error mode that would be used to report a file-not-found
4074 : * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
4075 : * we're retrying the exact same record that we've tried previously, only
4076 : * complain the first time to keep the noise down. However, we only do that when
4077 : * reading from pg_wal, because we don't expect any invalid records in archive
4078 : * or in records streamed from the primary. Files in the archive should be complete,
4079 : * and we should never hit the end of WAL because we stop and wait for more WAL
4080 : * to arrive before replaying it.
4081 : *
4082 : * NOTE: This function remembers the RecPtr value it was last called with,
4083 : * to suppress repeated messages about the same record. Only call this when
4084 : * you are about to ereport(), or you might cause a later message to be
4085 : * erroneously suppressed.
4086 : */
4087 : static int
4088 528 : emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
4089 : {
4090 : static XLogRecPtr lastComplaint = 0; /* RecPtr of the last complaint left at LOG; static, so it persists across calls */
4091 :
4092 528 : if (readSource == XLOG_FROM_PG_WAL && emode == LOG) /* only LOG-level reports about pg_wal are de-duplicated */
4093 : {
4094 520 : if (RecPtr == lastComplaint)
4095 138 : emode = DEBUG1; /* same record as last time: demote the repeat */
4096 : else
4097 382 : lastComplaint = RecPtr; /* first complaint about this record: remember it */
4098 : }
4099 528 : return emode;
4100 : }
4101 :
4102 :
4103 : /*
4104 : * Subroutine to try to fetch and validate a prior checkpoint record. Returns the record, or NULL after logging the reason at LOG level if it cannot be read or fails a sanity check.
4105 : */
4106 : static XLogRecord *
4107 1916 : ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
4108 : TimeLineID replayTLI)
4109 : {
4110 : XLogRecord *record;
4111 : uint8 info;
4112 :
4113 : Assert(xlogreader != NULL);
4114 :
4115 1916 : if (!XRecOffIsValid(RecPtr)) /* reject the location before attempting any read */
4116 : {
4117 0 : ereport(LOG,
4118 : (errmsg("invalid checkpoint location")));
4119 0 : return NULL;
4120 : }
4121 :
4122 1916 : XLogPrefetcherBeginRead(xlogprefetcher, RecPtr);
4123 1916 : record = ReadRecord(xlogprefetcher, LOG, true, replayTLI); /* NOTE(review): the 'true' is presumably ReadRecord()'s fetching_ckpt flag -- confirm */
4124 :
4125 1916 : if (record == NULL)
4126 : {
4127 0 : ereport(LOG,
4128 : (errmsg("invalid checkpoint record")));
4129 0 : return NULL;
4130 : }
4131 1916 : if (record->xl_rmid != RM_XLOG_ID) /* must belong to the RM_XLOG_ID resource manager */
4132 : {
4133 0 : ereport(LOG,
4134 : (errmsg("invalid resource manager ID in checkpoint record")));
4135 0 : return NULL;
4136 : }
4137 1916 : info = record->xl_info & ~XLR_INFO_MASK; /* drop the XLR_INFO_MASK bits before comparing */
4138 1916 : if (info != XLOG_CHECKPOINT_SHUTDOWN &&
4139 : info != XLOG_CHECKPOINT_ONLINE)
4140 : {
4141 0 : ereport(LOG,
4142 : (errmsg("invalid xl_info in checkpoint record")));
4143 0 : return NULL;
4144 : }
4145 1916 : if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint)) /* length must be exactly record header + short data header + CheckPoint payload */
4146 : {
4147 0 : ereport(LOG,
4148 : (errmsg("invalid length of checkpoint record")));
4149 0 : return NULL;
4150 : }
4151 1916 : return record;
4152 : }
4153 :
4154 : /*
4155 : * Scan for new timelines that might have appeared in the archive since we
4156 : * started recovery.
4157 : *
4158 : * If there are any, the function changes recovery target TLI to the latest
4159 : * one and returns 'true'. Otherwise (no newer timeline, or the newest one is rejected) it returns 'false' and leaves recoveryTargetTLI and expectedTLEs unchanged.
4160 : */
4161 : static bool
4162 332 : rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
4163 : {
4164 : List *newExpectedTLEs;
4165 : bool found;
4166 : ListCell *cell;
4167 : TimeLineID newtarget;
4168 332 : TimeLineID oldtarget = recoveryTargetTLI; /* saved because recoveryTargetTLI is overwritten before the history files are restored */
4169 332 : TimeLineHistoryEntry *currentTle = NULL; /* entry for recoveryTargetTLI in the new history, once found */
4170 :
4171 332 : newtarget = findNewestTimeLine(recoveryTargetTLI);
4172 332 : if (newtarget == recoveryTargetTLI)
4173 : {
4174 : /* No new timelines found */
4175 320 : return false;
4176 : }
4177 :
4178 : /*
4179 : * Determine the list of expected TLIs for the new TLI
4180 : */
4181 :
4182 12 : newExpectedTLEs = readTimeLineHistory(newtarget);
4183 :
4184 : /*
4185 : * If the current timeline is not part of the history of the new timeline,
4186 : * we cannot proceed to it. NOTE(review): newExpectedTLEs is not freed on the two 'return false' paths below, while the replaced list is list_free_deep()'d on success -- confirm this is acceptable.
4187 : */
4188 12 : found = false;
4189 24 : foreach(cell, newExpectedTLEs)
4190 : {
4191 24 : currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4192 :
4193 24 : if (currentTle->tli == recoveryTargetTLI)
4194 : {
4195 12 : found = true;
4196 12 : break;
4197 : }
4198 : }
4199 12 : if (!found)
4200 : {
4201 0 : ereport(LOG,
4202 : (errmsg("new timeline %u is not a child of database system timeline %u",
4203 : newtarget,
4204 : replayTLI))); /* NOTE(review): the search above matched on recoveryTargetTLI but the message prints replayTLI -- confirm they agree here */
4205 0 : return false;
4206 : }
4207 :
4208 : /*
4209 : * The current timeline was found in the history file, but check that the
4210 : * next timeline was forked off from it *after* the current recovery
4211 : * location.
4212 : */
4213 12 : if (currentTle->end < replayLSN) /* currentTle is the matching entry here, since 'found' is true */
4214 : {
4215 0 : ereport(LOG,
4216 : errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%08X",
4217 : newtarget,
4218 : replayTLI,
4219 : LSN_FORMAT_ARGS(replayLSN)));
4220 0 : return false;
4221 : }
4222 :
4223 : /* The new timeline history seems valid. Switch target */
4224 12 : recoveryTargetTLI = newtarget;
4225 12 : list_free_deep(expectedTLEs); /* discard the old history before installing the new one */
4226 12 : expectedTLEs = newExpectedTLEs;
4227 :
4228 : /*
4229 : * As in StartupXLOG(), try to ensure we have all the history files
4230 : * between the old target and new target in pg_wal.
4231 : */
4232 12 : restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4233 :
4234 12 : ereport(LOG,
4235 : (errmsg("new target timeline is %u",
4236 : recoveryTargetTLI)));
4237 :
4238 12 : return true;
4239 : }
4240 :
4241 :
4242 : /*
4243 : * Open a logfile segment for reading (during recovery).
4244 : *
4245 : * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4246 : * Otherwise, it's assumed to be already available in pg_wal. Returns the open file descriptor, or -1 if the archive restore fails or if the file does not exist and 'notfoundOk' is true; any other open failure is a PANIC.
4247 : */
4248 : static int
4249 6694 : XLogFileRead(XLogSegNo segno, TimeLineID tli,
4250 : XLogSource source, bool notfoundOk)
4251 : {
4252 : char xlogfname[MAXFNAMELEN];
4253 : char activitymsg[MAXFNAMELEN + 16]; /* file name plus the "waiting for " / "recovering " prefix */
4254 : char path[MAXPGPATH];
4255 : int fd;
4256 :
4257 6694 : XLogFileName(xlogfname, tli, segno, wal_segment_size);
4258 :
4259 6694 : switch (source)
4260 : {
4261 1608 : case XLOG_FROM_ARCHIVE:
4262 : /* Report recovery progress in PS display */
4263 1608 : snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4264 : xlogfname);
4265 1608 : set_ps_display(activitymsg);
4266 :
4267 1608 : if (!RestoreArchivedFile(path, xlogfname,
4268 : "RECOVERYXLOG",
4269 : wal_segment_size,
4270 : InRedo))
4271 884 : return -1; /* could not be restored from the archive */
4272 724 : break;
4273 :
4274 5086 : case XLOG_FROM_PG_WAL:
4275 : case XLOG_FROM_STREAM:
4276 5086 : XLogFilePath(path, tli, segno, wal_segment_size);
4277 5086 : break;
4278 :
4279 0 : default:
4280 0 : elog(ERROR, "invalid XLogFileRead source %d", source);
4281 : }
4282 :
4283 : /*
4284 : * If the segment was fetched from archival storage, replace the existing
4285 : * xlog segment (if any) with the archival version.
4286 : */
4287 5810 : if (source == XLOG_FROM_ARCHIVE)
4288 : {
4289 : Assert(!IsInstallXLogFileSegmentActive());
4290 724 : KeepFileRestoredFromArchive(path, xlogfname);
4291 :
4292 : /*
4293 : * Set path to point at the new file in pg_wal.
4294 : */
4295 724 : snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4296 : }
4297 :
4298 5810 : fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4299 5810 : if (fd >= 0)
4300 : {
4301 : /* Success! */
4302 5458 : curFileTLI = tli; /* remember which timeline the opened segment came from */
4303 :
4304 : /* Report recovery progress in PS display */
4305 5458 : snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4306 : xlogfname);
4307 5458 : set_ps_display(activitymsg);
4308 :
4309 : /* Track source of data in assorted state variables */
4310 5458 : readSource = source;
4311 5458 : XLogReceiptSource = source;
4312 : /* In FROM_STREAM case, caller tracks receipt time, not me */
4313 5458 : if (source != XLOG_FROM_STREAM)
4314 3170 : XLogReceiptTime = GetCurrentTimestamp();
4315 :
4316 5458 : return fd;
4317 : }
4318 352 : if (errno != ENOENT || !notfoundOk) /* unexpected failure? only ENOENT with notfoundOk set is tolerated */
4319 0 : ereport(PANIC,
4320 : (errcode_for_file_access(),
4321 : errmsg("could not open file \"%s\": %m", path)));
4322 352 : return -1;
4323 : }
4324 :
4325 : /*
4326 : * Open a logfile segment for reading (during recovery).
4327 : *
4328 : * This version searches for the segment with any TLI listed in expectedTLEs.
4329 : */
4330 : static int
4331 3506 : XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
4332 : {
4333 : char path[MAXPGPATH];
4334 : ListCell *cell;
4335 : int fd;
4336 : List *tles;
4337 :
4338 : /*
4339 : * Loop looking for a suitable timeline ID: we might need to read any of
4340 : * the timelines listed in expectedTLEs.
4341 : *
4342 : * We expect curFileTLI on entry to be the TLI of the preceding file in
4343 : * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4344 : * to go backwards; this prevents us from picking up the wrong file when a
4345 : * parent timeline extends to higher segment numbers than the child we
4346 : * want to read.
4347 : *
4348 : * If we haven't read the timeline history file yet, read it now, so that
4349 : * we know which TLIs to scan. We don't save the list in expectedTLEs,
4350 : * however, unless we actually find a valid segment. That way if there is
4351 : * neither a timeline history file nor a WAL segment in the archive, and
4352 : * streaming replication is set up, we'll read the timeline history file
4353 : * streamed from the primary when we start streaming, instead of
4354 : * recovering with a dummy history generated here.
4355 : */
4356 3506 : if (expectedTLEs)
4357 1590 : tles = expectedTLEs;
4358 : else
4359 1916 : tles = readTimeLineHistory(recoveryTargetTLI);
4360 :
4361 3872 : foreach(cell, tles)
4362 : {
4363 3544 : TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
4364 3544 : TimeLineID tli = hent->tli;
4365 :
4366 3544 : if (tli < curFileTLI)
4367 8 : break; /* don't bother looking at too-old TLIs */
4368 :
4369 : /*
4370 : * Skip scanning the timeline ID that the logfile segment to read
4371 : * doesn't belong to
4372 : */
4373 3536 : if (XLogRecPtrIsValid(hent->begin))
4374 : {
4375 146 : XLogSegNo beginseg = 0;
4376 :
4377 146 : XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4378 :
4379 : /*
4380 : * The logfile segment that doesn't belong to the timeline is
4381 : * older or newer than the segment that the timeline started or
4382 : * ended at, respectively. It's sufficient to check only the
4383 : * starting segment of the timeline here. Since the timelines are
4384 : * scanned in descending order in this loop, any segments newer
4385 : * than the ending segment should belong to newer timeline and
4386 : * have already been read before. So it's not necessary to check
4387 : * the ending segment of the timeline here.
4388 : */
4389 146 : if (segno < beginseg)
4390 14 : continue;
4391 : }
4392 :
4393 3522 : if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
4394 : {
4395 1608 : fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true);
4396 1608 : if (fd != -1)
4397 : {
4398 724 : elog(DEBUG1, "got WAL segment from archive");
4399 724 : if (!expectedTLEs)
4400 36 : expectedTLEs = tles;
4401 3170 : return fd;
4402 : }
4403 : }
4404 :
4405 2798 : if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
4406 : {
4407 2798 : fd = XLogFileRead(segno, tli, XLOG_FROM_PG_WAL, true);
4408 2798 : if (fd != -1)
4409 : {
4410 2446 : if (!expectedTLEs)
4411 1880 : expectedTLEs = tles;
4412 2446 : return fd;
4413 : }
4414 : }
4415 : }
4416 :
4417 : /* Couldn't find it. For simplicity, complain about front timeline */
4418 336 : XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
4419 336 : errno = ENOENT;
4420 336 : ereport(DEBUG2,
4421 : (errcode_for_file_access(),
4422 : errmsg("could not open file \"%s\": %m", path)));
4423 336 : return -1;
4424 : }
4425 :
4426 : /*
4427 : * Set flag to signal the walreceiver to restart. (The startup process calls
4428 : * this on noticing a relevant configuration change.)
4429 : */
4430 : void
4431 10 : StartupRequestWalReceiverRestart(void)
4432 : {
4433 10 : if (currentSource == XLOG_FROM_STREAM && WalRcvRunning())
4434 : {
4435 8 : ereport(LOG,
4436 : (errmsg("WAL receiver process shutdown requested")));
4437 :
4438 8 : pendingWalRcvRestart = true;
4439 : }
4440 10 : }
4441 :
4442 :
4443 : /*
4444 : * Has a standby promotion already been triggered?
4445 : *
4446 : * Unlike CheckForStandbyTrigger(), this works in any process
4447 : * that's connected to shared memory.
4448 : */
4449 : bool
4450 124 : PromoteIsTriggered(void)
4451 : {
4452 : /*
4453 : * We check shared state each time only until a standby promotion is
4454 : * triggered. We can't trigger a promotion again, so there's no need to
4455 : * keep checking after the shared variable has once been seen true.
4456 : */
4457 124 : if (LocalPromoteIsTriggered)
4458 90 : return true;
4459 :
4460 34 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4461 34 : LocalPromoteIsTriggered = XLogRecoveryCtl->SharedPromoteIsTriggered;
4462 34 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4463 :
4464 34 : return LocalPromoteIsTriggered;
4465 : }
4466 :
4467 : static void
4468 88 : SetPromoteIsTriggered(void)
4469 : {
4470 88 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4471 88 : XLogRecoveryCtl->SharedPromoteIsTriggered = true;
4472 88 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4473 :
4474 : /*
4475 : * Mark the recovery pause state as 'not paused' because the paused state
4476 : * ends and promotion continues if a promotion is triggered while recovery
4477 : * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4478 : * return 'paused' while a promotion is ongoing.
4479 : */
4480 88 : SetRecoveryPause(false);
4481 :
4482 88 : LocalPromoteIsTriggered = true;
4483 88 : }
4484 :
4485 : /*
4486 : * Check whether a promote request has arrived.
4487 : */
4488 : static bool
4489 12804 : CheckForStandbyTrigger(void)
4490 : {
4491 12804 : if (LocalPromoteIsTriggered)
4492 106 : return true;
4493 :
4494 12698 : if (IsPromoteSignaled() && CheckPromoteSignal())
4495 : {
4496 88 : ereport(LOG, (errmsg("received promote request")));
4497 88 : RemovePromoteSignalFiles();
4498 88 : ResetPromoteSignaled();
4499 88 : SetPromoteIsTriggered();
4500 88 : return true;
4501 : }
4502 :
4503 12610 : return false;
4504 : }
4505 :
4506 : /*
4507 : * Remove the files signaling a standby promotion request.
4508 : */
4509 : void
4510 1840 : RemovePromoteSignalFiles(void)
4511 : {
4512 1840 : unlink(PROMOTE_SIGNAL_FILE);
4513 1840 : }
4514 :
4515 : /*
4516 : * Check to see if a promote request has arrived.
4517 : */
4518 : bool
4519 1668 : CheckPromoteSignal(void)
4520 : {
4521 : struct stat stat_buf;
4522 :
4523 1668 : if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4524 176 : return true;
4525 :
4526 1492 : return false;
4527 : }
4528 :
4529 : /*
4530 : * Wake up startup process to replay newly arrived WAL, or to notice that
4531 : * failover has been requested.
4532 : */
4533 : void
4534 52472 : WakeupRecovery(void)
4535 : {
4536 52472 : SetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
4537 52472 : }
4538 :
4539 : /*
4540 : * Schedule a walreceiver wakeup in the main recovery loop.
4541 : */
4542 : void
4543 4 : XLogRequestWalReceiverReply(void)
4544 : {
4545 4 : doRequestWalReceiverReply = true;
4546 4 : }
4547 :
4548 : /*
4549 : * Is HotStandby active yet? This is only important in special backends
4550 : * since normal backends won't ever be able to connect until this returns
4551 : * true. Postmaster knows this by way of signal, not via shared memory.
4552 : *
4553 : * Unlike testing standbyState, this works in any process that's connected to
4554 : * shared memory. (And note that standbyState alone doesn't tell the truth
4555 : * anyway.)
4556 : */
4557 : bool
4558 300 : HotStandbyActive(void)
4559 : {
4560 : /*
4561 : * We check shared state each time only until Hot Standby is active. We
4562 : * can't de-activate Hot Standby, so there's no need to keep checking
4563 : * after the shared variable has once been seen true.
4564 : */
4565 300 : if (LocalHotStandbyActive)
4566 48 : return true;
4567 : else
4568 : {
4569 : /* spinlock is essential on machines with weak memory ordering! */
4570 252 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4571 252 : LocalHotStandbyActive = XLogRecoveryCtl->SharedHotStandbyActive;
4572 252 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4573 :
4574 252 : return LocalHotStandbyActive;
4575 : }
4576 : }
4577 :
4578 : /*
4579 : * Like HotStandbyActive(), but to be used only in WAL replay code,
4580 : * where we don't need to ask any other process what the state is.
4581 : */
4582 : static bool
4583 0 : HotStandbyActiveInReplay(void)
4584 : {
4585 : Assert(AmStartupProcess() || !IsPostmasterEnvironment);
4586 0 : return LocalHotStandbyActive;
4587 : }
4588 :
4589 : /*
4590 : * Get latest redo apply position.
4591 : *
4592 : * Exported to allow WALReceiver to read the pointer directly.
4593 : */
4594 : XLogRecPtr
4595 139166 : GetXLogReplayRecPtr(TimeLineID *replayTLI)
4596 : {
4597 : XLogRecPtr recptr;
4598 : TimeLineID tli;
4599 :
4600 139166 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4601 139166 : recptr = XLogRecoveryCtl->lastReplayedEndRecPtr;
4602 139166 : tli = XLogRecoveryCtl->lastReplayedTLI;
4603 139166 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4604 :
4605 139166 : if (replayTLI)
4606 5122 : *replayTLI = tli;
4607 139166 : return recptr;
4608 : }
4609 :
4610 :
4611 : /*
4612 : * Get position of last applied, or the record being applied.
4613 : *
4614 : * This is different from GetXLogReplayRecPtr() in that if a WAL
4615 : * record is currently being applied, this includes that record.
4616 : */
4617 : XLogRecPtr
4618 12330 : GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
4619 : {
4620 : XLogRecPtr recptr;
4621 : TimeLineID tli;
4622 :
4623 12330 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4624 12330 : recptr = XLogRecoveryCtl->replayEndRecPtr;
4625 12330 : tli = XLogRecoveryCtl->replayEndTLI;
4626 12330 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4627 :
4628 12330 : if (replayEndTLI)
4629 12330 : *replayEndTLI = tli;
4630 12330 : return recptr;
4631 : }
4632 :
4633 : /*
4634 : * Save timestamp of latest processed commit/abort record.
4635 : *
4636 : * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4637 : * seen by processes other than the startup process. Note in particular
4638 : * that CreateRestartPoint is executed in the checkpointer.
4639 : */
4640 : static void
4641 43260 : SetLatestXTime(TimestampTz xtime)
4642 : {
4643 43260 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4644 43260 : XLogRecoveryCtl->recoveryLastXTime = xtime;
4645 43260 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4646 43260 : }
4647 :
4648 : /*
4649 : * Fetch timestamp of latest processed commit/abort record.
4650 : */
4651 : TimestampTz
4652 686 : GetLatestXTime(void)
4653 : {
4654 : TimestampTz xtime;
4655 :
4656 686 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4657 686 : xtime = XLogRecoveryCtl->recoveryLastXTime;
4658 686 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4659 :
4660 686 : return xtime;
4661 : }
4662 :
4663 : /*
4664 : * Save timestamp of the next chunk of WAL records to apply.
4665 : *
4666 : * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4667 : * seen by all backends.
4668 : */
4669 : static void
4670 8760 : SetCurrentChunkStartTime(TimestampTz xtime)
4671 : {
4672 8760 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4673 8760 : XLogRecoveryCtl->currentChunkStartTime = xtime;
4674 8760 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4675 8760 : }
4676 :
4677 : /*
4678 : * Fetch timestamp of latest processed commit/abort record.
4679 : * Startup process maintains an accurate local copy in XLogReceiptTime
4680 : */
4681 : TimestampTz
4682 474 : GetCurrentChunkReplayStartTime(void)
4683 : {
4684 : TimestampTz xtime;
4685 :
4686 474 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4687 474 : xtime = XLogRecoveryCtl->currentChunkStartTime;
4688 474 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4689 :
4690 474 : return xtime;
4691 : }
4692 :
4693 : /*
4694 : * Returns time of receipt of current chunk of XLOG data, as well as
4695 : * whether it was received from streaming replication or from archives.
4696 : */
4697 : void
4698 56 : GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4699 : {
4700 : /*
4701 : * This must be executed in the startup process, since we don't export the
4702 : * relevant state to shared memory.
4703 : */
4704 : Assert(InRecovery);
4705 :
4706 56 : *rtime = XLogReceiptTime;
4707 56 : *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4708 56 : }
4709 :
4710 : /*
4711 : * Note that text field supplied is a parameter name and does not require
4712 : * translation
4713 : */
4714 : void
4715 1240 : RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4716 : {
4717 1240 : if (currValue < minValue)
4718 : {
4719 0 : if (HotStandbyActiveInReplay())
4720 : {
4721 0 : bool warned_for_promote = false;
4722 :
4723 0 : ereport(WARNING,
4724 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4725 : errmsg("hot standby is not possible because of insufficient parameter settings"),
4726 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4727 : param_name,
4728 : currValue,
4729 : minValue)));
4730 :
4731 0 : SetRecoveryPause(true);
4732 :
4733 0 : ereport(LOG,
4734 : (errmsg("recovery has paused"),
4735 : errdetail("If recovery is unpaused, the server will shut down."),
4736 : errhint("You can then restart the server after making the necessary configuration changes.")));
4737 :
4738 0 : while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
4739 : {
4740 0 : ProcessStartupProcInterrupts();
4741 :
4742 0 : if (CheckForStandbyTrigger())
4743 : {
4744 0 : if (!warned_for_promote)
4745 0 : ereport(WARNING,
4746 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4747 : errmsg("promotion is not possible because of insufficient parameter settings"),
4748 :
4749 : /*
4750 : * Repeat the detail from above so it's easy to find
4751 : * in the log.
4752 : */
4753 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4754 : param_name,
4755 : currValue,
4756 : minValue),
4757 : errhint("Restart the server after making the necessary configuration changes.")));
4758 0 : warned_for_promote = true;
4759 : }
4760 :
4761 : /*
4762 : * If recovery pause is requested then set it paused. While
4763 : * we are in the loop, user might resume and pause again so
4764 : * set this every time.
4765 : */
4766 0 : ConfirmRecoveryPaused();
4767 :
4768 : /*
4769 : * We wait on a condition variable that will wake us as soon
4770 : * as the pause ends, but we use a timeout so we can check the
4771 : * above conditions periodically too.
4772 : */
4773 0 : ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
4774 : WAIT_EVENT_RECOVERY_PAUSE);
4775 : }
4776 0 : ConditionVariableCancelSleep();
4777 : }
4778 :
4779 0 : ereport(FATAL,
4780 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4781 : errmsg("recovery aborted because of insufficient parameter settings"),
4782 : /* Repeat the detail from above so it's easy to find in the log. */
4783 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4784 : param_name,
4785 : currValue,
4786 : minValue),
4787 : errhint("You can restart the server after making the necessary configuration changes.")));
4788 : }
4789 1240 : }
4790 :
4791 :
4792 : /*
4793 : * GUC check_hook for primary_slot_name
4794 : */
4795 : bool
4796 2668 : check_primary_slot_name(char **newval, void **extra, GucSource source)
4797 : {
4798 : int err_code;
4799 2668 : char *err_msg = NULL;
4800 2668 : char *err_hint = NULL;
4801 :
4802 2668 : if (*newval && strcmp(*newval, "") != 0 &&
4803 390 : !ReplicationSlotValidateNameInternal(*newval, false, &err_code,
4804 : &err_msg, &err_hint))
4805 : {
4806 0 : GUC_check_errcode(err_code);
4807 0 : GUC_check_errdetail("%s", err_msg);
4808 0 : if (err_hint != NULL)
4809 0 : GUC_check_errhint("%s", err_hint);
4810 0 : return false;
4811 : }
4812 :
4813 2668 : return true;
4814 : }
4815 :
4816 : /*
4817 : * Recovery target settings: Only one of the several recovery_target* settings
4818 : * may be set. Setting a second one results in an error. The global variable
4819 : * recoveryTarget tracks which kind of recovery target was chosen. Other
4820 : * variables store the actual target value (for example a string or a xid).
4821 : * The assign functions of the parameters check whether a competing parameter
4822 : * was already set. But we want to allow setting the same parameter multiple
4823 : * times. We also want to allow unsetting a parameter and setting a different
4824 : * one, so we unset recoveryTarget when the parameter is set to an empty
4825 : * string.
4826 : *
4827 : * XXX this code is broken by design. Throwing an error from a GUC assign
4828 : * hook breaks fundamental assumptions of guc.c. So long as all the variables
4829 : * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4830 : * since we'd just abort postmaster startup anyway. Nonetheless it's likely
4831 : * that we have odd behaviors such as unexpected GUC ordering dependencies.
4832 : */
4833 :
4834 : pg_noreturn static void
4835 2 : error_multiple_recovery_targets(void)
4836 : {
4837 2 : ereport(ERROR,
4838 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4839 : errmsg("multiple recovery targets specified"),
4840 : errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
4841 : }
4842 :
4843 : /*
4844 : * GUC check_hook for recovery_target
4845 : */
4846 : bool
4847 2280 : check_recovery_target(char **newval, void **extra, GucSource source)
4848 : {
4849 2280 : if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4850 : {
4851 0 : GUC_check_errdetail("The only allowed value is \"immediate\".");
4852 0 : return false;
4853 : }
4854 2280 : return true;
4855 : }
4856 :
4857 : /*
4858 : * GUC assign_hook for recovery_target
4859 : */
4860 : void
4861 2280 : assign_recovery_target(const char *newval, void *extra)
4862 : {
4863 2280 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4864 0 : recoveryTarget != RECOVERY_TARGET_IMMEDIATE)
4865 0 : error_multiple_recovery_targets();
4866 :
4867 2280 : if (newval && strcmp(newval, "") != 0)
4868 2 : recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
4869 : else
4870 2278 : recoveryTarget = RECOVERY_TARGET_UNSET;
4871 2280 : }
4872 :
4873 : /*
4874 : * GUC check_hook for recovery_target_lsn
4875 : */
4876 : bool
4877 2292 : check_recovery_target_lsn(char **newval, void **extra, GucSource source)
4878 : {
4879 2292 : if (strcmp(*newval, "") != 0)
4880 : {
4881 : XLogRecPtr lsn;
4882 : XLogRecPtr *myextra;
4883 18 : ErrorSaveContext escontext = {T_ErrorSaveContext};
4884 :
4885 18 : lsn = pg_lsn_in_safe(*newval, (Node *) &escontext);
4886 18 : if (escontext.error_occurred)
4887 0 : return false;
4888 :
4889 18 : myextra = (XLogRecPtr *) guc_malloc(LOG, sizeof(XLogRecPtr));
4890 18 : if (!myextra)
4891 0 : return false;
4892 18 : *myextra = lsn;
4893 18 : *extra = myextra;
4894 : }
4895 2292 : return true;
4896 : }
4897 :
4898 : /*
4899 : * GUC assign_hook for recovery_target_lsn
4900 : */
4901 : void
4902 2292 : assign_recovery_target_lsn(const char *newval, void *extra)
4903 : {
4904 2292 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4905 0 : recoveryTarget != RECOVERY_TARGET_LSN)
4906 0 : error_multiple_recovery_targets();
4907 :
4908 2292 : if (newval && strcmp(newval, "") != 0)
4909 : {
4910 18 : recoveryTarget = RECOVERY_TARGET_LSN;
4911 18 : recoveryTargetLSN = *((XLogRecPtr *) extra);
4912 : }
4913 : else
4914 2274 : recoveryTarget = RECOVERY_TARGET_UNSET;
4915 2292 : }
4916 :
4917 : /*
4918 : * GUC check_hook for recovery_target_name
4919 : */
4920 : bool
4921 2292 : check_recovery_target_name(char **newval, void **extra, GucSource source)
4922 : {
4923 : /* Use the value of newval directly */
4924 2292 : if (strlen(*newval) >= MAXFNAMELEN)
4925 : {
4926 0 : GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
4927 : "recovery_target_name", MAXFNAMELEN - 1);
4928 0 : return false;
4929 : }
4930 2292 : return true;
4931 : }
4932 :
4933 : /*
4934 : * GUC assign_hook for recovery_target_name
4935 : */
4936 : void
4937 2292 : assign_recovery_target_name(const char *newval, void *extra)
4938 : {
4939 2292 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4940 0 : recoveryTarget != RECOVERY_TARGET_NAME)
4941 0 : error_multiple_recovery_targets();
4942 :
4943 2292 : if (newval && strcmp(newval, "") != 0)
4944 : {
4945 12 : recoveryTarget = RECOVERY_TARGET_NAME;
4946 12 : recoveryTargetName = newval;
4947 : }
4948 : else
4949 2280 : recoveryTarget = RECOVERY_TARGET_UNSET;
4950 2292 : }
4951 :
4952 : /*
4953 : * GUC check_hook for recovery_target_time
4954 : *
4955 : * The interpretation of the recovery_target_time string can depend on the
4956 : * time zone setting, so we need to wait until after all GUC processing is
4957 : * done before we can do the final parsing of the string. This check function
4958 : * only does a parsing pass to catch syntax errors, but we store the string
4959 : * and parse it again when we need to use it.
4960 : */
4961 : bool
4962 2284 : check_recovery_target_time(char **newval, void **extra, GucSource source)
4963 : {
4964 2284 : if (strcmp(*newval, "") != 0)
4965 : {
4966 : /* reject some special values */
4967 6 : if (strcmp(*newval, "now") == 0 ||
4968 6 : strcmp(*newval, "today") == 0 ||
4969 6 : strcmp(*newval, "tomorrow") == 0 ||
4970 6 : strcmp(*newval, "yesterday") == 0)
4971 : {
4972 0 : return false;
4973 : }
4974 :
4975 : /*
4976 : * parse timestamp value (see also timestamptz_in())
4977 : */
4978 : {
4979 6 : char *str = *newval;
4980 : fsec_t fsec;
4981 : struct pg_tm tt,
4982 6 : *tm = &tt;
4983 : int tz;
4984 : int dtype;
4985 : int nf;
4986 : int dterr;
4987 : char *field[MAXDATEFIELDS];
4988 : int ftype[MAXDATEFIELDS];
4989 : char workbuf[MAXDATELEN + MAXDATEFIELDS];
4990 : DateTimeErrorExtra dtextra;
4991 : TimestampTz timestamp;
4992 :
4993 6 : dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
4994 : field, ftype, MAXDATEFIELDS, &nf);
4995 6 : if (dterr == 0)
4996 6 : dterr = DecodeDateTime(field, ftype, nf,
4997 : &dtype, tm, &fsec, &tz, &dtextra);
4998 6 : if (dterr != 0)
4999 0 : return false;
5000 6 : if (dtype != DTK_DATE)
5001 0 : return false;
5002 :
5003 6 : if (tm2timestamp(tm, fsec, &tz, ×tamp) != 0)
5004 : {
5005 0 : GUC_check_errdetail("Timestamp out of range: \"%s\".", str);
5006 0 : return false;
5007 : }
5008 : }
5009 : }
5010 2284 : return true;
5011 : }
5012 :
5013 : /*
5014 : * GUC assign_hook for recovery_target_time
5015 : */
5016 : void
5017 2284 : assign_recovery_target_time(const char *newval, void *extra)
5018 : {
5019 2284 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
5020 2 : recoveryTarget != RECOVERY_TARGET_TIME)
5021 2 : error_multiple_recovery_targets();
5022 :
5023 2282 : if (newval && strcmp(newval, "") != 0)
5024 4 : recoveryTarget = RECOVERY_TARGET_TIME;
5025 : else
5026 2278 : recoveryTarget = RECOVERY_TARGET_UNSET;
5027 2282 : }
5028 :
5029 : /*
5030 : * GUC check_hook for recovery_target_timeline
5031 : */
5032 : bool
5033 2286 : check_recovery_target_timeline(char **newval, void **extra, GucSource source)
5034 : {
5035 : RecoveryTargetTimeLineGoal rttg;
5036 : RecoveryTargetTimeLineGoal *myextra;
5037 :
5038 2286 : if (strcmp(*newval, "current") == 0)
5039 0 : rttg = RECOVERY_TARGET_TIMELINE_CONTROLFILE;
5040 2286 : else if (strcmp(*newval, "latest") == 0)
5041 2280 : rttg = RECOVERY_TARGET_TIMELINE_LATEST;
5042 : else
5043 : {
5044 : char *endp;
5045 : uint64 timeline;
5046 :
5047 6 : rttg = RECOVERY_TARGET_TIMELINE_NUMERIC;
5048 :
5049 6 : errno = 0;
5050 6 : timeline = strtou64(*newval, &endp, 0);
5051 :
5052 6 : if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
5053 : {
5054 2 : GUC_check_errdetail("\"%s\" is not a valid number.",
5055 : "recovery_target_timeline");
5056 6 : return false;
5057 : }
5058 :
5059 4 : if (timeline < 1 || timeline > PG_UINT32_MAX)
5060 : {
5061 4 : GUC_check_errdetail("\"%s\" must be between %u and %u.",
5062 : "recovery_target_timeline", 1, UINT_MAX);
5063 4 : return false;
5064 : }
5065 : }
5066 :
5067 2280 : myextra = (RecoveryTargetTimeLineGoal *) guc_malloc(LOG, sizeof(RecoveryTargetTimeLineGoal));
5068 2280 : if (!myextra)
5069 0 : return false;
5070 2280 : *myextra = rttg;
5071 2280 : *extra = myextra;
5072 :
5073 2280 : return true;
5074 : }
5075 :
5076 : /*
5077 : * GUC assign_hook for recovery_target_timeline
5078 : */
5079 : void
5080 2280 : assign_recovery_target_timeline(const char *newval, void *extra)
5081 : {
5082 2280 : recoveryTargetTimeLineGoal = *((RecoveryTargetTimeLineGoal *) extra);
5083 2280 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
5084 0 : recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
5085 : else
5086 2280 : recoveryTargetTLIRequested = 0;
5087 2280 : }
5088 :
5089 : /*
5090 : * GUC check_hook for recovery_target_xid
5091 : */
5092 : bool
5093 2280 : check_recovery_target_xid(char **newval, void **extra, GucSource source)
5094 : {
5095 2280 : if (strcmp(*newval, "") != 0)
5096 : {
5097 : TransactionId xid;
5098 : TransactionId *myextra;
5099 :
5100 2 : errno = 0;
5101 2 : xid = (TransactionId) strtou64(*newval, NULL, 0);
5102 2 : if (errno == EINVAL || errno == ERANGE)
5103 0 : return false;
5104 :
5105 2 : myextra = (TransactionId *) guc_malloc(LOG, sizeof(TransactionId));
5106 2 : if (!myextra)
5107 0 : return false;
5108 2 : *myextra = xid;
5109 2 : *extra = myextra;
5110 : }
5111 2280 : return true;
5112 : }
5113 :
5114 : /*
5115 : * GUC assign_hook for recovery_target_xid
5116 : */
5117 : void
5118 2280 : assign_recovery_target_xid(const char *newval, void *extra)
5119 : {
5120 2280 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
5121 0 : recoveryTarget != RECOVERY_TARGET_XID)
5122 0 : error_multiple_recovery_targets();
5123 :
5124 2280 : if (newval && strcmp(newval, "") != 0)
5125 : {
5126 2 : recoveryTarget = RECOVERY_TARGET_XID;
5127 2 : recoveryTargetXid = *((TransactionId *) extra);
5128 : }
5129 : else
5130 2278 : recoveryTarget = RECOVERY_TARGET_UNSET;
5131 2280 : }
|